Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tensorrt test workflow #3266

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions .github/scripts/generate-tensorrt-test-matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

import argparse
import copy
import json
import sys

# CUDA builds (as they appear in a matrix entry's "desired_cuda" field) that are
# kept for each release channel.
CUDA_VERSIONS_DICT = dict(
    nightly=["cu124"],
    test=["cu121", "cu124"],
    release=["cu121", "cu124"],
)

# Python versions (as they appear in a matrix entry's "python_version" field)
# that are kept for each release channel.
PYTHON_VERSIONS_DICT = dict(
    nightly=["3.9"],
    test=["3.9", "3.10", "3.11", "3.12"],
    release=["3.9", "3.10", "3.11", "3.12"],
)

# TensorRT archives available per platform, keyed by TensorRT version.
# Each entry records the download URL ("urls"), the top-level directory inside
# the archive ("strip_prefix") and the archive checksum ("sha256").
TENSORRT_VERSIONS_DICT = {
    "windows": {
        "10.4.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip",
            strip_prefix="TensorRT-10.4.0.26",
            sha256="3a7de83778b9e9f812fd8901e07e0d7d6fc54ce633fcff2e340f994df2c6356c",
        ),
        "10.5.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/zip/TensorRT-10.5.0.18.Windows.win10.cuda-12.6.zip",
            strip_prefix="TensorRT-10.5.0.18",
            sha256="e6436f4164db4e44d727354dccf7d93755efb70d6fbfd6fa95bdfeb2e7331b24",
        ),
        "10.6.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip",
            strip_prefix="TensorRT-10.6.0.26",
            sha256="6c6d92c108a1b3368423e8f69f08d31269830f1e4c9da43b37ba34a176797254",
        ),
    },
    "linux": {
        "10.4.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-10.4.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            strip_prefix="TensorRT-10.4.0.26",
            sha256="cb0273ecb3ba4db8993a408eedd354712301a6c7f20704c52cdf9f78aa97bbdb",
        ),
        "10.5.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            strip_prefix="TensorRT-10.5.0.18",
            sha256="f404d379d639552a3e026cd5267213bd6df18a4eb899d6e47815bbdb34854958",
        ),
        "10.6.0": dict(
            urls="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            strip_prefix="TensorRT-10.6.0.26",
            sha256="33d3c2f3f4c84dc7991a4337a6fde9ed33f5c8e5c4f03ac2eb6b994a382b03a0",
        ),
    },
}


def main(args: list[str]) -> None:
    """Expand a build matrix with every known TensorRT version and print it.

    The ``--matrix`` option carries a JSON object with an ``"include"`` list.
    The channel and platform are taken from the first entry of that list.
    Entries whose ``desired_cuda`` / ``python_version`` are not enabled for the
    channel are dropped; every remaining entry is emitted once per TensorRT
    version available for the platform, with the archive details attached
    under the ``"tensorrt"`` key. The resulting matrix is written to stdout as
    a single JSON document.

    Args:
        args: Command-line arguments, excluding the program name.

    Raises:
        Exception: If ``--matrix`` is empty, the include list is empty, the
            ``channel`` or ``validation_runner`` field is missing from the
            first entry, or either holds an unsupported value.
        KeyError: If the matrix has no ``"include"`` key, or a matrix entry
            lacks ``desired_cuda`` / ``python_version``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--matrix",
        help="matrix",
        type=str,
        default="",
    )

    options = parser.parse_args(args)
    if options.matrix == "":
        raise Exception("--matrix is empty, please provide the matrix json str")

    matrix_dict = json.loads(options.matrix)
    includes = matrix_dict["include"]
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not includes:
        raise Exception(f"the matrix has no include entries: {options.matrix}")

    # Channel and platform are assumed to be uniform across the matrix, so
    # only the first entry is inspected.
    first = includes[0]
    if "channel" not in first:
        raise Exception(f"channel field is missing from the matrix: {options.matrix}")
    channel = first["channel"]
    if channel not in ("nightly", "test", "release"):
        raise Exception(
            f"channel field: {channel} is not supported, currently supported value: nightly, test, release"
        )

    if "validation_runner" not in first:
        raise Exception(
            f"validation_runner field is missing from the matrix: {options.matrix}"
        )
    runner = first["validation_runner"]
    if "windows" in runner:
        arch = "windows"
    elif "linux" in runner:
        arch = "linux"
    else:
        # `first` is a dict: the runner must be read by key, not by attribute.
        raise Exception(
            f"{runner} is not the supported arch, currently only support windows and linux"
        )

    cuda_versions = CUDA_VERSIONS_DICT[channel]
    python_versions = PYTHON_VERSIONS_DICT[channel]
    tensorrt_versions = TENSORRT_VERSIONS_DICT[arch]

    filtered_includes = []
    for item in includes:
        if (
            item["desired_cuda"] not in cuda_versions
            or item["python_version"] not in python_versions
        ):
            continue
        for tensorrt_version, tensorrt_json in tensorrt_versions.items():
            new_item = copy.deepcopy(item)
            # Build a fresh dict per entry so the module-level table is never
            # mutated and entries do not alias each other.
            new_item["tensorrt"] = {**tensorrt_json, "version": tensorrt_version}
            filtered_includes.append(new_item)

    print(json.dumps({"include": filtered_includes}))


# Script entry point: hand the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
222 changes: 222 additions & 0 deletions .github/workflows/build-tensorrt-linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# Reusable workflow (triggered via workflow_call) that builds a Torch-TensorRT
# wheel on Linux for each build-matrix entry, using the TensorRT version that
# the entry carries under `matrix.tensorrt`.
name: Build Torch-TensorRT wheel on Linux with specified tensorRT version
lanluo-nvidia marked this conversation as resolved.
Show resolved Hide resolved

# This workflow is only invoked by other workflows; every setting below is an
# input supplied by the caller.
on:
  workflow_call:
    inputs:
      repository:
        description: 'Repository to checkout, defaults to ""'
        default: ""
        type: string
      ref:
        description: 'Reference to checkout, defaults to "nightly"'
        default: "nightly"
        type: string
      test-infra-repository:
        description: "Test infra repository to use"
        default: "pytorch/test-infra"
        type: string
      test-infra-ref:
        description: "Test infra reference to use"
        default: ""
        type: string
      # JSON string; it is parsed with fromJSON() to form the job matrix.
      build-matrix:
        description: "Build matrix to utilize"
        default: ""
        type: string
      pre-script:
        description: "Pre script to run prior to build"
        default: ""
        type: string
      # Runs after the wheel has been built (see the "Run Post-Script" step).
      post-script:
        description: "Post script to run after the build"
        default: ""
        type: string
      smoke-test-script:
        description: "Script for Smoke Test for a specific domain"
        default: ""
        type: string
      env-var-script:
        description: "Script that sets Domain-Specific Environment Variables"
        default: ""
        type: string
      package-name:
        description: "Name of the actual python package that is imported"
        default: ""
        type: string
      trigger-event:
        description: "Trigger Event in caller that determines whether or not to upload"
        default: ""
        type: string
      cache-path:
        description: "The path(s) on the runner to cache or restore. The path is relative to repository."
        default: ""
        type: string
      cache-key:
        description: "The key created when saving a cache and the key used to search for a cache."
        default: ""
        type: string
      architecture:
        description: Architecture to build for x86_64 for default Linux, or aarch64 for Linux aarch64 builds
        required: false
        type: string
        default: x86_64
      submodules:
        description: Works as stated in actions/checkout, but the default value is recursive
        required: false
        type: string
        default: recursive
      setup-miniconda:
        description: Set to true if setup-miniconda is needed
        required: false
        type: boolean
        default: true

# Token permissions requested for the jobs in this workflow.
permissions:
  contents: read
  id-token: write

jobs:
  # One job per build-matrix entry: checks out the sources, builds the wheel
  # and uploads the contents of dist/ as a workflow artifact.
  build:
    strategy:
      fail-fast: false
      matrix: ${{ fromJSON(inputs.build-matrix) }}
    env:
      PYTHON_VERSION: ${{ matrix.python_version }}
      PACKAGE_TYPE: wheel
      REPOSITORY: ${{ inputs.repository }}
      REF: ${{ inputs.ref }}
      CU_VERSION: ${{ matrix.desired_cuda }}
      UPLOAD_TO_BASE_BUCKET: ${{ matrix.upload_to_base_bucket }}
      ARCH: ${{ inputs.architecture }}
      # TensorRT archive details carried by the matrix entry.
      TENSORRT_STRIP_PREFIX: ${{ matrix.tensorrt.strip_prefix }}
      TENSORRT_VERSION: ${{ matrix.tensorrt.version }}
      TENSORRT_URLS: ${{ matrix.tensorrt.urls }}
      TENSORRT_SHA256: ${{ matrix.tensorrt.sha256 }}
      UPLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
    name: build_tensorrt${{ matrix.tensorrt.version }}_py${{matrix.python_version}}_${{matrix.desired_cuda}}
    runs-on: ${{ matrix.validation_runner }}
    container:
      image: ${{ matrix.container_image }}
      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
    # If a build is taking longer than 120 minutes on these runners we need
    # to have a conversation
    timeout-minutes: 120

    steps:
      - name: Clean workspace
        shell: bash -l {0}
        run: |
          set -x
          echo "::group::Cleanup debug output"
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir -p "${GITHUB_WORKSPACE}"
          if [[ "${{ inputs.architecture }}" = "aarch64" ]]; then
            # The glob must sit outside the quotes or it is never expanded;
            # `:?` aborts instead of expanding to `/*` if RUNNER_TEMP is unset.
            rm -rf "${RUNNER_TEMP:?}"/*
          fi
          echo "::endgroup::"
      - uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: ${{ inputs.test-infra-repository }}
          ref: ${{ inputs.test-infra-ref }}
          path: test-infra
      - uses: actions/checkout@v3
        if: ${{ env.ARCH == 'aarch64' }}
        with:
          # Provides the aarch64 CI setup script run by the next step
          repository: "pytorch/builder"
          ref: "main"
          path: builder
      - name: Set linux aarch64 CI
        if: ${{ inputs.architecture == 'aarch64' }}
        shell: bash -l {0}
        env:
          DESIRED_PYTHON: ${{ matrix.python_version }}
        run: |
          set +e
          # TODO: This is temporary aarch64 setup script, this should be integrated into aarch64 docker.
          ${GITHUB_WORKSPACE}/builder/aarch64_linux/aarch64_ci_setup.sh
          echo "/opt/conda/bin" >> $GITHUB_PATH
          set -e
      - uses: ./test-infra/.github/actions/set-channel
      - name: Set PYTORCH_VERSION
        if: ${{ env.CHANNEL == 'test' }}
        run: |
          # When building RC, set the version to be the current candidate version,
          # otherwise, leave it alone so nightly will pick up the latest
          echo "PYTORCH_VERSION=${{ matrix.stable_version }}" >> "${GITHUB_ENV}"
      - uses: ./test-infra/.github/actions/setup-binary-builds
        env:
          PLATFORM: ${{ inputs.architecture == 'aarch64' && 'linux-aarch64' || ''}}
        with:
          repository: ${{ inputs.repository }}
          ref: ${{ inputs.ref }}
          submodules: ${{ inputs.submodules }}
          setup-miniconda: ${{ inputs.setup-miniconda }}
          python-version: ${{ env.PYTHON_VERSION }}
          cuda-version: ${{ env.CU_VERSION }}
          arch: ${{ env.ARCH }}
      - name: Combine Env Var and Build Env Files
        if: ${{ inputs.env-var-script != '' }}
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          cat "${{ inputs.env-var-script }}" >> "${BUILD_ENV_FILE}"
      - name: Install torch dependency
        shell: bash -l {0}
        run: |
          set -x
          # shellcheck disable=SC1090
          source "${BUILD_ENV_FILE}"
          # shellcheck disable=SC2086
          ${CONDA_RUN} ${PIP_INSTALL_TORCH}
      - name: Run Pre-Script with Caching
        if: ${{ inputs.pre-script != '' }}
        uses: ./test-infra/.github/actions/run-script-with-cache
        with:
          cache-path: ${{ inputs.cache-path }}
          cache-key: ${{ inputs.cache-key }}
          repository: ${{ inputs.repository }}
          script: ${{ inputs.pre-script }}
      - name: Build clean
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          ${CONDA_RUN} python setup.py clean
      - name: Build the wheel (bdist_wheel)
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          ${CONDA_RUN} python setup.py bdist_wheel

      - name: Run Post-Script
        if: ${{ inputs.post-script != '' }}
        uses: ./test-infra/.github/actions/run-script-with-cache
        with:
          repository: ${{ inputs.repository }}
          script: ${{ inputs.post-script }}
      - name: Smoke Test
        shell: bash -l {0}
        env:
          PACKAGE_NAME: ${{ inputs.package-name }}
          SMOKE_TEST_SCRIPT: ${{ inputs.smoke-test-script }}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          # TODO: add smoke test for the auditwheel tarball built

      # NB: Only upload to GitHub after passing smoke tests
      - name: Upload wheel to GitHub
        continue-on-error: true
        uses: actions/upload-artifact@v3
        with:
          name: ${{ env.UPLOAD_ARTIFACT_NAME }}
          path: ${{ inputs.repository }}/dist

# Runs that share a group are serialized; a newer run cancels the one in progress.
# NOTE(review): `inputs.job-name` is not among the workflow_call inputs declared
# in this file, so that segment presumably expands to an empty string — confirm
# whether the input should be declared or the segment replaced.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
Loading
Loading