Use new Triton runtime #1338

Merged: 26 commits (Sep 28, 2022)

Commits:
888678f  Use new Triton runtime (jansel, Sep 24, 2022)
9dc26e4  Don't require Triton for CPU backend (jansel, Sep 25, 2022)
69e0421  Review changes (jansel, Sep 25, 2022)
a1d78ee  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 25, 2022)
9e34b19  comment (jansel, Sep 25, 2022)
307d98e  Skip tests failing on master (jansel, Sep 25, 2022)
7e38b79  unskip (jansel, Sep 25, 2022)
12cc6e8  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 25, 2022)
86dd713  Bump CI pin (jansel, Sep 26, 2022)
4d8c87c  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 26, 2022)
c2d9b55  dev20220925 (jansel, Sep 26, 2022)
cd7ef4f  rm torchaudio (jansel, Sep 26, 2022)
5717afb  dev20220926 (jansel, Sep 26, 2022)
b1d57b2  Revert to dev20220921 (jansel, Sep 26, 2022)
ea4868e  Increase timeouts (jansel, Sep 26, 2022)
1c81cc0  Apply fix for #1362 (jansel, Sep 27, 2022)
24e5fe9  Show stack traces (jansel, Sep 27, 2022)
8534d1c  lint (jansel, Sep 27, 2022)
58d9023  Skip swin_base_patch4_window7_224 (jansel, Sep 27, 2022)
cbd938e  bernoulli (jansel, Sep 27, 2022)
1a3886e  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 27, 2022)
c8252f5  Revert "bernoulli" (jansel, Sep 27, 2022)
6184f20  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 28, 2022)
9470117  revert (jansel, Sep 28, 2022)
7fdf663  Merge branch 'main' of github.com:pytorch/torchdynamo into newruntime… (jansel, Sep 28, 2022)
f704f1f  Remove skip (jansel, Sep 28, 2022)
10 changes: 10 additions & 0 deletions .circleci/config.yml
@@ -403,6 +403,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor inference run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -425,6 +426,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor inference run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -447,6 +449,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor inference run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -469,6 +472,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor inference run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -491,6 +495,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -513,6 +518,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -535,6 +541,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -557,6 +564,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -580,6 +588,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
@@ -602,6 +611,7 @@ jobs:
- install_deps
- run:
name: TIMM TorchInductor training run
no_output_timeout: 30m
command: |
source .circleci/setup_env.sh
make develop
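
All ten hunks in this file make the same change: each TIMM TorchInductor inference/training step gains no_output_timeout: 30m, raising CircleCI's default no-output timeout (10 minutes) so runs are not killed during long stretches of compilation that print nothing.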
2 changes: 1 addition & 1 deletion Makefile
@@ -10,7 +10,7 @@ PIP ?= python -m pip

# versions used in CI
PYTORCH_VERSION ?= dev20220928
- TRITON_VERSION ?= 889d9e34a114b1fe2e8871d21e713794344d12d3
+ TRITON_VERSION ?= 998fd5f9afe166247f441999c605dfe624ca9331


default: develop
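
Note that both pins use make's ?= operator, which assigns only when the variable is not already set; a one-off build against a different commit can therefore override them from the command line, e.g. make develop TRITON_VERSION=<sha>.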
4 changes: 3 additions & 1 deletion test/test_torchinductor.py
@@ -4,6 +4,7 @@
import functools
import importlib
import random
import sys
import unittest
from unittest.mock import patch

@@ -37,7 +38,8 @@
assert get_decompositions([torch.ops.aten.trace])
# Requires functorch
from torchinductor.compile_fx import compile_fx_inner
- except (ImportError, ModuleNotFoundError, AssertionError):
+ except (ImportError, ModuleNotFoundError, AssertionError) as e:
+     sys.stderr.write(f"{type(e)}: {e}\n")
raise unittest.SkipTest("requires sympy/functorch")


58 changes: 55 additions & 3 deletions torchinductor/codecache.py
@@ -9,7 +9,11 @@
import sysconfig
import tempfile
import types
from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor
from ctypes import cdll
from typing import Any
from typing import Dict

from torch.utils import cpp_extension

@@ -160,9 +164,10 @@ def load(cls, source_code):
code = compile(f.read(), path, "exec")
mod = types.ModuleType(f"{__name__}.{key}")
mod.__file__ = path
+ mod.key = key
exec(code, mod.__dict__, mod.__dict__)
- cls.cache[key] = mod
- cls.cache[key].key = key
+ # another thread might set this first
+ cls.cache.setdefault(key, mod)
return cls.cache[key]
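
The setdefault change closes a small race: two threads can both miss the cache and exec the same source, and with a plain assignment the second thread would replace a module the first had already handed out. A minimal standalone sketch of the pattern (illustrative, not TorchDynamo code; dict.setdefault on plain string keys is effectively atomic under CPython's GIL):

    import threading
    import types

    cache = {}

    def load(key, source):
        if key not in cache:
            mod = types.ModuleType(key)
            exec(source, mod.__dict__)
            # Publish exactly one module per key: if another thread won
            # the race, setdefault is a no-op and we return its module.
            cache.setdefault(key, mod)
        return cache[key]

    results = []
    threads = [
        threading.Thread(target=lambda: results.append(load("k", "x = 1")))
        for _ in range(4)
    ]
    [t.start() for t in threads]
    [t.join() for t in threads]
    assert all(m is results[0] for m in results)  # all callers share one module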


@@ -174,7 +179,54 @@ def patch_triton_dir():


class TritonCodeCache:
@staticmethod
def get_name(mod):
(name,) = [n for n in dir(mod) if n.startswith("kernel")]
return name

@classmethod
def load(cls, source_code):
patch_triton_dir()
- return PyCodeCache.load(source_code)
+ mod = PyCodeCache.load(source_code)
+ return getattr(mod, cls.get_name(mod))


class AsyncCompile:
@staticmethod
@functools.lru_cache(1)
def pool():
assert config.compile_threads > 1
return ThreadPoolExecutor(config.compile_threads)

@classmethod
def submit(cls, task):
if config.compile_threads <= 1:
return task()
return cls.pool().submit(task)

@classmethod
def map(cls, fn, seq):
if config.compile_threads <= 1 or len(seq) <= 1:
return list(map(fn, seq))
return [t.result() for t in [cls.pool().submit(fn, x) for x in seq]]

def triton(self, source_code):
kernel = TritonCodeCache.load(source_code)

def task():
kernel.precompile()
return kernel

return self.submit(task)

def cpp(self, source_code):
def task():
return CppCodeCache.load(source_code).kernel
return self.submit(task)

Review thread on lines +214 to +224:

Contributor: The cache load happens at subtly different times between these two. Triton's loads in the call to triton(), and the cpp one loads on the thread pool as it gets dispatched on a task. The incongruent behavior may lead to surprises later, or force CppCodeCache.load to be thread safe.

Author (jansel): The C++ cache load calls gcc, which is expensive (and also inherently thread safe).

Contributor: Interesting issue with potential relevance: #1347

def wait(self, scope: Dict[str, Any]):
if config.compile_threads > 1:
for key, result in list(scope.items()):
if isinstance(result, Future):
scope[key] = result.result()
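
Putting the pieces together, a generated wrapper module under the new scheme submits its kernels for compilation at import time and resolves them before the first call. A hedged sketch (the kernel name, source, and call shape are illustrative placeholders, not real Inductor output):

    from torchinductor.codecache import AsyncCompile

    async_compile = AsyncCompile()

    # With config.compile_threads > 1 this returns a Future immediately;
    # otherwise the task runs inline and the compiled kernel comes back.
    kernel0 = async_compile.cpp('''
    /* generated C++ kernel source would appear here */
    ''')

    # Swap any Future bindings in this module for their compiled results.
    async_compile.wait(globals())

The triton() path follows the same shape, except that the expensive step on the pool is kernel.precompile() rather than a gcc invocation, which is the asymmetry the review thread above is discussing.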
14 changes: 13 additions & 1 deletion torchinductor/codegen/common.py
@@ -6,6 +6,7 @@
import re
import textwrap
import typing
from collections import namedtuple
from io import StringIO
from itertools import chain

@@ -22,6 +23,9 @@

log = logging.getLogger(__name__)

TensorArg = namedtuple("TensorArg", ["name", "dtype"])
SizeArg = namedtuple("SizeArg", ["name", "expr"])


def index_prevent_reordering(index: typing.List[sympy.Expr], index_vars, sizes):
from ..ir import FlexibleLayout
@@ -358,20 +362,28 @@ def cpp_argdefs(self):
def python_argdefs(self):
arg_defs = []
call_args = []
precompile_args = []
for inplaced in unique(self.inplace_buffers.values()):
arg_defs.append(inplaced.inner_name)
call_args.append(inplaced.other_names[-1])
precompile_args.append(
TensorArg(
inplaced.inner_name, V.graph.get_dtype(inplaced.other_names[-1])
)
)
for outer, inner in chain(
self.input_buffers.items(), self.output_buffers.items()
):
if outer in self.inplace_buffers or inner == "REMOVED":
continue
arg_defs.append(inner)
call_args.append(outer)
precompile_args.append(TensorArg(inner, V.graph.get_dtype(outer)))
for outer, inner in self.sizevars.items():
arg_defs.append(inner)
call_args.append(outer)
- return arg_defs, call_args
+ precompile_args.append(SizeArg(inner, sympy.expand(outer)))
+ return arg_defs, call_args, precompile_args

def aliases(self):
for inplaced in unique(self.inplace_buffers.values()):
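
python_argdefs now returns a third list describing each argument for ahead-of-time compilation: a TensorArg pairs an argument name with its buffer's dtype, and a SizeArg pairs a name with its sympy size expression. A small standalone consumer, purely illustrative (the names and printing are hypothetical, not Inductor code):

    from collections import namedtuple

    import sympy

    TensorArg = namedtuple("TensorArg", ["name", "dtype"])
    SizeArg = namedtuple("SizeArg", ["name", "expr"])

    def summarize(precompile_args):
        # Tensor args need a dtype for the kernel signature; size args
        # carry symbolic shapes that are resolved at call time.
        for arg in precompile_args:
            if isinstance(arg, TensorArg):
                print(f"tensor arg {arg.name}: dtype={arg.dtype}")
            else:
                print(f"size arg {arg.name} = {arg.expr}")

    summarize([
        TensorArg("in_ptr0", "float32"),
        SizeArg("ks0", sympy.expand("s0 * s1")),
    ])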
4 changes: 2 additions & 2 deletions torchinductor/codegen/cpp.py
@@ -569,9 +569,9 @@ def codegen_define_and_call(self, wrapper):
code.splice(self.loops_code)

codecache_def = IndentedBuffer()
codecache_def.writeline("CppCodeCache.load('''")
codecache_def.writeline("async_compile.cpp('''")
codecache_def.splice(code)
codecache_def.writeline("''').kernel")
codecache_def.writeline("''')")

kernel_name = wrapper.next_kernel_name()
codecache_str = codecache_def.getvalue()
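
With this change the wrapper no longer blocks on gcc while kernels are being defined: async_compile.cpp() submits the source for compilation and the .kernel attribute access moves into the compile task (see the codecache.py diff above), so the trailing ''').kernel suffix disappears from the generated code.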