HighCWu · HighCWu · Feb 10, 2024 · Feb 10, 2024 · Feb 10, 2024 · Dec 1, 2023
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -23,26 +23,25 @@ jobs:
       matrix-required: ${{ steps.set-matrix.outputs.matrix-required }}
       matrix-optional: ${{ steps.set-matrix.outputs.matrix-optional }}
     steps:
-      - name: Prepare runner matrix
+      - name: Prepare matrix
         id: set-matrix
         run: |
           if [ x"${{ github.repository }}" == x"openai/triton" ]; then
-            echo '::set-output name=matrix-required::[["self-hosted", "A100"], ["self-hosted", "H100"]]'
-            echo '::set-output name=matrix-optional::[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]'
+            echo 'matrix-required={"runner": [["self-hosted", "A100"], ["self-hosted", "H100"]], "python-version": ["3.11"], "cuda-version": ["12.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT"
+            echo 'matrix-optional={"runner": [["self-hosted", "gfx908"], ["self-hosted", "arc770"]], "python-version": ["3.11"], "cuda-version": ["12.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT"
           else
-            echo '::set-output name=matrix-required::["ubuntu-latest"]'
-            echo '::set-output name=matrix-optional::["ubuntu-latest"]'
+            echo 'matrix-required={"runner":["ubuntu-latest", "windows-latest"], "python-version": ["3.8", "3.9", "3.10", "3.11"], "cuda-version": ["11.8.89", "12.1.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT"
+            echo 'matrix-optional={"runner":["ubuntu-latest", "windows-latest"], "python-version": ["3.8", "3.9", "3.10", "3.11"], "cuda-version": ["11.8.89", "12.1.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT"
           fi
 
   Integration-Tests:
     needs: Runner-Preparation
 
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
+    timeout-minutes: 60
 
     strategy:
-      matrix:
-        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-required)}}
+      matrix: ${{fromJson(needs.Runner-Preparation.outputs.matrix-required)}}
 
     steps:
       - name: Checkout
@@ -55,15 +54,99 @@ jobs:
           echo "BACKEND=CUDA" >> "${GITHUB_ENV}"
           echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}"
 
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Set up MSVC
+        if: matrix.runner == 'windows-latest'
+        uses: ilammy/msvc-dev-cmd@v1.12.1
+        with:
+          arch: amd64
+
+      - name: Setup Micromamba
+        uses: mamba-org/setup-micromamba@v1
+        if: matrix.runner[0] != 'self-hosted'
+        with:
+          environment-name: triton-env
+          init-shell: bash
+          create-args: >-
+            typer
+            ca-certificates
+            certifi
+            openssl
+            zlib
+            zstd
+            llvm>=17.0
+          condarc: |
+            channels:
+              - nvidia/label/cuda-${{ matrix.cuda-version }}
+              - conda-forge
+              - pytorch
+            channel_priority: strict
+
+      - name: set Environment Variables (Windows)
+        if: matrix.runner == 'windows-latest'
+        shell: bash -el {0}
+        run: |
+          ver=4017f04e
+          curl -L -O https://github.com/wkpark/triton/releases/download/llvm-$ver-windows/llvm-$ver-windows-x64.tar.gz
+          curl -L -O https://github.com/wkpark/triton/releases/download/llvm-$ver-windows/llvm-fix.patch
+          tar xvf llvm-$ver-windows-x64.tar.gz
+          mv llvm-$ver-windows-x64 LLVM
+          patch -p0 < llvm-fix.patch
+          echo "LLVM_SYSPATH=${{ github.workspace }}\\LLVM" >> "$GITHUB_ENV"
+          rm -f llvm-$ver-windows-x64.tar.gz
+
+          ### LLVM_SHORTHASH="$(cat cmake/llvm-hash.txt | cut -c1-8)"
+          # prepare LLVM prebuilt path. will be downloaded and extracted by setup.py step
+          ### echo "~/.triton/llvm/llvm-$LLVM_SHORTHASH-windows-x64/bin" >> "$GITHUB_PATH"
+          #echo "LLVM_SYSPATH=~/.triton/llvm/llvm-$LLVM_SHORTHASH-windows-x64" >> "$GITHUB_ENV"
+          # compile with a selected matrix.cc
+          if [ "${{matrix.cc}}" = "cl" ]; then
+            echo "CC=cl" >> "${GITHUB_ENV}"
+            echo "CXX=cl" >> "${GITHUB_ENV}"
+          elif [ "${{matrix.cc}}" = "clang" ]; then
+            echo "CC=clang" >> "${GITHUB_ENV}"
+            echo "CXX=clang++" >> "${GITHUB_ENV}"
+          fi
+
+      - name: CUDA Setup ${{ matrix.cuda-version }}
+        if: matrix.runner[0] != 'self-hosted'
+        shell: bash -el {0}
+        run: |
+          CUDA_HOME="${{ env.MAMBA_ROOT_PREFIX }}/envs/triton-env"  # must match environment-name of the Setup Micromamba step
+          echo "CUDA_HOME=$CUDA_HOME" >> "$GITHUB_ENV"
+          echo "CUDA_PATH=$CUDA_HOME" >> "$GITHUB_ENV"
+
+      - name: Update environment
+        if: matrix.runner[0] != 'self-hosted'
+        shell: bash
+        run: |
+          echo "BACKEND=CUDA" >> "${GITHUB_ENV}"
+          echo "ENABLE_TMA=0" >> "${GITHUB_ENV}"
+          echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}"
+
+      - name: Set reusable strings
+        # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
+        id: strings
+        shell: bash
+        run: |
+          echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
+
       - name: Clear cache
+        shell: bash
         run: |
           rm -rf ~/.triton
 
       - name: Update PATH
+        if: matrix.runner[0] == 'self-hosted'
         run: |
           echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
 
       - name: Check pre-commit
+        shell: bash
         run: |
           python3 -m pip install --upgrade pre-commit
           # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
@@ -73,17 +156,29 @@ jobs:
           python3 -m pre_commit run --all-files --verbose
 
       - name: Install Triton
-        if: ${{ env.BACKEND == 'CUDA'}}
+        if: matrix.runner != 'windows-latest'
         run: |
           cd python
           python3 -m pip install --upgrade pip
-          python3 -m pip install cmake==3.24 ninja pytest-xdist
+          python3 -m pip install cmake==3.24 ninja pytest-xdist wheel
           sudo apt-get update -y
           sudo apt-get install -y ccache clang lld
           TRITON_BUILD_WITH_CLANG_LLD=true TRITON_BUILD_WITH_CCACHE=true python3 -m pip install --no-build-isolation -vvv '.[tests]'
+          if [ "${{ matrix.runner }}" = 'ubuntu-latest' ]; then
+            python3 setup.py bdist_wheel
+          fi
+
+      - name: Install Triton (Windows)
+        if: matrix.runner == 'windows-latest'
+        run: |
+          cd python
+          python -m pip install --upgrade pip
+          python -m pip install cmake==3.24 ninja pytest-xdist wheel
+          python -m pip install --no-build-isolation -vvv .
+          python setup.py bdist_wheel
 
       - name: Run lit tests
-        if: ${{ env.BACKEND == 'CUDA'}}
+        if: matrix.runner[0] == 'self-hosted' && env.BACKEND == 'CUDA'
         run: |
           python3 -m pip install lit
           cd python
@@ -94,7 +189,7 @@ jobs:
           lit -v "${LIT_TEST_DIR}"
 
       - name: Run python tests on CUDA
-        if: ${{ env.BACKEND == 'CUDA' }}
+        if: ${{ (matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' }}
         run: |
           cd python/test/unit
           python3 -m pytest -vvv -n 8 --ignore=runtime --ignore=operators --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
@@ -107,10 +202,12 @@ jobs:
           python3 -m pytest -vvv hopper/test_flashattention.py
 
       - name: Clear cache
+        shell: bash
         run: |
           rm -rf ~/.triton
 
       - name: Run interpreter tests
+        if: matrix.runner[0] == 'self-hosted'
         env:
           # TRITON_INTERPRET: "1"
           CUA_VISIBLE_DEVICES: ""
@@ -119,11 +216,19 @@ jobs:
           python3 -m pytest -vvv -s operators/test_flash_attention.py
 
       - name: Run partial tests on CUDA
-        if: ${{ env.BACKEND == 'CUDA' }}
+        if: ${{ (matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' }}
         run: |
           cd python/test/unit
           python3 -m pytest -vvv -n 8 operators
 
+      - name: Upload Build artifacts
+        if: matrix.runner[0] != 'self-hosted'
+        uses: actions/upload-artifact@v3
+        with:
+          name: triton-dist ${{ matrix.runner }} python-${{ matrix.python-version }} cuda-${{ matrix.cuda-version }}
+          path: |
+            ${{ github.workspace }}/python/dist/
+
       - name: Create artifacts archive
         if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
         run: |
@@ -132,20 +237,21 @@ jobs:
 
       - name: Upload artifacts archive
         if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: artifacts ${{ matrix.runner[1] }}
           path: ~/.triton/artifacts.tar.gz
 
       - name: Run CXX unittests
-        if: ${{ env.BACKEND == 'CUDA'}}
+        if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA'}}
         run: |
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest
 
 
   Compare-artifacts:
+    if: ${{(github.repository == 'openai/triton')}}
     needs: Integration-Tests
     timeout-minutes: 5
 

diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ python/build/
 python/triton.egg-info/
 python/triton/_C/libtriton.pyd
 python/triton/_C/libtriton.so
+python/triton/_C/triton.dll
 
 # Backends copied from submodules
 python/triton/backends/

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -30,8 +30,17 @@ set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 # used conditionally in this file and by lit tests
 
 # Customized release build type with assertions: TritonRelBuildWithAsserts
-set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
-set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
+if(NOT MSVC)
+  set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
+  set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
+else()
+  set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1")
+  set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1")
+  set(CMAKE_EXE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
+  set(CMAKE_MODULE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
+  set(CMAKE_SHARED_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
+  set(CMAKE_STATIC_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
+endif()
 
 # Default build type
 if(NOT CMAKE_BUILD_TYPE)
@@ -45,7 +54,15 @@ endif()
 
 # Compiler flags
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
-set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS  -fPIC -std=gnu++17")
+if(NOT MSVC)
+  if(NOT WIN32)
+    set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS  -fPIC -std=gnu++17")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17 -Wno-deprecated")
+  endif()
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS  /wd4244 /wd4624 /wd4715 /wd4530")
+endif()
 
 # Third-party
 include_directories(${PYBIND11_INCLUDE_DIR})
@@ -103,7 +120,11 @@ endfunction()
 
 
 # Disable warnings that show up in external code (gtest;pybind11)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default -fvisibility=hidden")
+if(NOT MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default -fvisibility=hidden")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX-")
+endif()
 
 include_directories(".")
 include_directories(${MLIR_INCLUDE_DIRS})
@@ -137,6 +158,8 @@ if(TRITON_BUILD_PYTHON_MODULE)
 
   if(PYTHON_INCLUDE_DIRS)
     include_directories(${PYTHON_INCLUDE_DIRS})
+    message(STATUS "PYTHON_LIB_DIRS ${PYTHON_LIB_DIRS}")
+    link_directories(${PYTHON_LIB_DIRS})
   else()
     find_package(Python3 REQUIRED COMPONENTS Development Interpreter)
     include_directories(${Python3_INCLUDE_DIRS})
@@ -203,6 +226,8 @@ if(TRITON_BUILD_PYTHON_MODULE)
   target_link_libraries(triton PUBLIC ${TRITON_LIBRARIES})
   if(WIN32)
     target_link_libraries(triton PRIVATE ${CMAKE_DL_LIBS})
+    set_target_properties(triton PROPERTIES SUFFIX ".pyd")
+    set_target_properties(triton PROPERTIES PREFIX "lib")
   else()
     target_link_libraries(triton PRIVATE z)
   endif()
@@ -220,6 +245,11 @@ if(TRITON_BUILD_PYTHON_MODULE AND NOT WIN32)
   target_link_libraries(triton PRIVATE ${PYTHON_LDFLAGS})
 endif()
 
+if(WIN32)
+  option(CMAKE_USE_WIN32_THREADS_INIT "using WIN32 threads" ON)
+  option(gtest_disable_pthreads "Disable uses of pthreads in gtest." ON)
+endif()
+
 add_subdirectory(bin)
 add_subdirectory(test)
 add_subdirectory(unittest)
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
@@ -71,6 +71,7 @@ mlir_check_all_link_libraries(triton-lsp)
 
 
 add_llvm_executable(triton-llvm-opt
+  PARTIAL_SOURCES_INTENDED
   triton-llvm-opt.cpp
 
   DEPENDS

diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1433,6 +1433,7 @@ MfmaEncodingAttr::getShapePerCTATileForDotOperands(ArrayRef<int64_t> shape,
     return {32, parentShapePerCTA[1]};
   } else {
     assert(0 && "DotOperandEncodingAttr opIdx must be 0 or 1");
+    return {};
   }
 }
 

diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -634,8 +634,8 @@ void mlir::triton::asyncLaunchDots(scf::ForOp forOp) {
       lastOp = op;
       op = op->getBlock()->getParentOp();
     }
-    return std::distance(lastOp->getBlock()->getParent()->begin(),
-                         lastOp->getBlock()->getIterator());
+    return (long)std::distance(lastOp->getBlock()->getParent()->begin(),
+                               lastOp->getBlock()->getIterator());
   };
   /// XXX(Keren): Clean up the following duplicate code with checkDotOp
   /// dots to be pipelined
-Original file line number
+Diff line change
@@ Expand Up / @@ -71,6 +71,7 @@ mlir_check_all_link_libraries(triton-lsp) @@
     add_llvm_executable(triton-llvm-opt
+      PARTIAL_SOURCES_INTENDED
       triton-llvm-opt.cpp
       DEPENDS
@@ Expand Down @@