Fixed some issues around compiling on Windows. #15444

Closed

Changes from 10 commits
13 changes: 10 additions & 3 deletions third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl
@@ -104,9 +104,16 @@ def if_cuda_newer_than(wanted_ver, if_true, if_false = []):
wanted_major = int(wanted_ver.split('_')[0])
wanted_minor = int(wanted_ver.split('_')[1])

configured_version = "%{cuda_version}"
configured_major = int(configured_version.split('.')[0])
configured_minor = int(configured_version.split('.')[1])
# Strip "64_" which appears in the CUDA version on Windows.
Contributor Author: This is mostly self-explanatory. On Windows, the CUDA version looks something like 64_121. Other parts of the build already handle version numbers of this form, but this spot did not. (A standalone sketch of the parsing rule follows this hunk.)

configured_version = "%{cuda_version}".rsplit("_", 1)[-1]
configured_version_parts = configured_version.split('.')

# On Windows, the major and minor versions are concatenated without a period and the minor only contains one digit.
if len(configured_version_parts) == 1:
configured_version_parts = [configured_version[0:-1], configured_version[-1:]]

configured_major = int(configured_version_parts[0])
configured_minor = int(configured_version_parts[1])

if %{cuda_is_configured} and (wanted_major, wanted_minor) <= (configured_major, configured_minor):
return select({"//conditions:default": if_true})
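For illustration, here is a rough standalone C++ rendering of the normalization rule the Starlark change implements (the helper name and test values are invented for this sketch):

#include <cassert>
#include <string>
#include <utility>

// Normalize a configured CUDA version string into (major, minor).
// On Linux the version looks like "12.1"; on Windows it can look like
// "64_121" (a "64_" prefix, then major and minor concatenated, with a
// single-digit minor).
std::pair<int, int> ParseCudaVersion(std::string version) {
  // Strip everything up to and including the last '_' ("64_121" -> "121").
  if (auto pos = version.rfind('_'); pos != std::string::npos) {
    version = version.substr(pos + 1);
  }
  // Dotted form: "12.1" -> (12, 1).
  if (auto dot = version.find('.'); dot != std::string::npos) {
    return {std::stoi(version.substr(0, dot)),
            std::stoi(version.substr(dot + 1))};
  }
  // Concatenated form: "121" -> (12, 1); the last digit is the minor.
  return {std::stoi(version.substr(0, version.size() - 1)),
          std::stoi(version.substr(version.size() - 1))};
}

int main() {
  assert(ParseCudaVersion("12.3") == std::make_pair(12, 3));
  assert(ParseCudaVersion("64_121") == std::make_pair(12, 1));
}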
24 changes: 12 additions & 12 deletions xla/backends/profiler/gpu/cupti_buffer_events.cc
@@ -186,18 +186,18 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info = collector.annotation_map.LookUp(
graph_trace->deviceId, graph_trace->correlationId);
collector.receive(CuptiTracerEvent{
.type = CuptiTracerEventType::CudaGraph,
.source = CuptiTracerEventSource::Activity,
.name = absl::StrCat("CudaGraphExec:", graph_trace->graphId),
.annotation = info.annotation,
.nvtx_range = info.nvtx_range,
.start_time_ns = graph_trace->start,
.end_time_ns = graph_trace->end,
.device_id = graph_trace->deviceId,
.correlation_id = graph_trace->correlationId,
.context_id = graph_trace->contextId,
.stream_id = graph_trace->streamId,
.graph_id = graph_trace->graphId,
/* .type = */ CuptiTracerEventType::CudaGraph,
Contributor Author: XLA is configured to build as C++17, but designated initializers are a C++20 feature, so compiling on Windows fails with:

error C7555: use of designated initializers requires at least '/std:c++20'

(A minimal illustration follows this hunk.)

/* .source = */ CuptiTracerEventSource::Activity,
/* .name = */ absl::StrCat("CudaGraphExec:", graph_trace->graphId),
/* .annotation = */ info.annotation,
/* .nvtx_range = */ info.nvtx_range,
/* .start_time_ns = */ graph_trace->start,
/* .end_time_ns = */ graph_trace->end,
/* .device_id = */ graph_trace->deviceId,
/* .correlation_id = */ graph_trace->correlationId,
/* .context_id = */ graph_trace->contextId,
/* .stream_id = */ graph_trace->streamId,
/* .graph_id = */ graph_trace->graphId,
});
}

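A minimal illustration of the C7555 fix; the struct and values are invented for this sketch:

struct Event {
  int type;
  const char* name;
};

int main() {
  // C++20 designated form; MSVC in /std:c++17 mode rejects it with C7555
  // (GCC and Clang accept it in C++17 mode as an extension, which is why
  // this only surfaced on Windows):
  //   Event e{.type = 1, .name = "graph"};
  // Portable C++17 equivalent used by the PR: positional aggregate
  // initialization, with the field names kept as comments for readability.
  Event e{/* .type = */ 1, /* .name = */ "graph"};
  (void)e;
}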
2 changes: 1 addition & 1 deletion xla/backends/profiler/gpu/cupti_buffer_events.h
@@ -56,7 +56,7 @@ struct MemcpyDetails {
int8_t dst_mem_kind;

// ID of the hardware channel on which this operation ran.
uint32_t channel_id = -1;
uint32_t channel_id = static_cast<uint32_t>(-1);
Contributor Author: This caused an implicit narrowing-conversion error (I believe it was C2397). The explicit static_cast fixes it. (A minimal illustration follows this hunk.)

// CUpti_ChannelType of the channel above.
int8_t channel_type = 0; // CUPTI_CHANNEL_TYPE_INVALID
};
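A minimal sketch of the narrowing rule in play (the variable name is invented; the sentinel value matches the field above):

#include <cstdint>

int main() {
  // List (brace) initialization forbids implicit narrowing conversions,
  // which appears to be what MSVC reports as C2397:
  //   std::uint32_t bad{-1};               // error: narrowing
  // An explicit cast keeps the intended all-bits-set sentinel:
  std::uint32_t channel_id{static_cast<std::uint32_t>(-1)};  // 0xFFFFFFFF
  return channel_id == 0xFFFFFFFFu ? 0 : 1;
}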
2 changes: 1 addition & 1 deletion xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2129,7 +2129,7 @@ PJRT_Error* PJRT_Layouts_MemoryLayout_Serialize(
PJRT_Layouts_MemoryLayout_Serialize_Args_STRUCT_SIZE, args->struct_size));

PJRT_Layouts_SerializedLayout* s_layout = new PJRT_Layouts_SerializedLayout{
.serialized = args->layout->layout->Serialize()};
/* .serialized = */ args->layout->layout->Serialize()};
args->serialized_layout = s_layout;
args->serialized_bytes = s_layout->serialized.data();
args->serialized_bytes_size = s_layout->serialized.size();
14 changes: 9 additions & 5 deletions xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -199,13 +199,17 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
#endif
}

Contributor Author: The nested #if inside the macro invocation seems not to be supported by MSVC. Hoisting the inner #if outside the macro works and doesn't change the behavior of the code here. (A minimal illustration follows this hunk.)

Removed:

STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
#if TENSORFLOW_USE_ROCM
      RocmName(),
#else
      CudaName(),
#endif
      std::make_unique<StreamExecutorGpuCompiler>());
});

Added:

#if TENSORFLOW_USE_ROCM
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
      RocmName(),
      std::make_unique<StreamExecutorGpuCompiler>());
});
#else
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
      CudaName(),
      std::make_unique<StreamExecutorGpuCompiler>());
});
#endif
} // namespace xla
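A compact illustration of the preprocessor limitation, using an invented REGISTER macro in place of STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER:

#include <cstdio>

// Stand-in for STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER.
#define REGISTER(body) void Register() { body }

// Placing #if/#else/#endif inside a macro argument is undefined behavior in
// standard C++, and MSVC's preprocessor rejects it outright:
//
//   REGISTER(
//   #if TENSORFLOW_USE_ROCM
//     std::puts("rocm");
//   #else
//     std::puts("cuda");
//   #endif
//   )
//
// Hoisting the conditional outside the invocation, as this hunk does, is
// portable and expands to the same code:
#if defined(TENSORFLOW_USE_ROCM)
REGISTER(std::puts("rocm");)
#else
REGISTER(std::puts("cuda");)
#endif

int main() { Register(); }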
8 changes: 4 additions & 4 deletions xla/service/cpu/runtime/conv_impl.h
@@ -41,7 +41,7 @@ void EigenConv2DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation,
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
@eaplatanios (Contributor Author), Jul 29, 2024: This resulted in the following error:

error C2765: 'tensorflow::xla::EigenConv2DImpl': an explicit specialization or instantiation of a function template cannot have any default arguments

I removed the default arguments that were causing the error and now pass those defaults explicitly at the call sites that relied on them. (A minimal illustration follows this hunk.)

std::optional<std::function<void()>> done_callback) {
const Eigen::TensorMap<Eigen::Tensor<const ScalarType, 4, Eigen::RowMajor>,
Eigen::Aligned>
input(lhs, input_batch, input_x, input_y, input_channels);
@@ -129,7 +129,7 @@ void EigenConv3DImpl(
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation,
Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
std::optional<std::function<void()>> done_callback) {
using ConstTType =
Eigen::TensorMap<Eigen::Tensor<const ScalarType, 5, Eigen::RowMajor>,
Eigen::Aligned>;
@@ -223,7 +223,7 @@ void EigenConv3DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation, \
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
@@ -249,7 +249,7 @@ CONV2D_EXTERN_TEMPLATE(Eigen::ThreadPoolDevice, float);
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation, \
Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
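A minimal sketch of the C2765 rule, with an invented signature standing in for EigenConv2DImpl/EigenConv3DImpl:

#include <functional>
#include <optional>

// The template definition no longer carries a default for the callback...
template <typename T>
void Conv(T input, std::optional<std::function<void()>> done) {
  if (done.has_value()) (*done)();
  (void)input;
}

// ...because repeating "= std::nullopt" on an explicit instantiation is
// ill-formed, which is exactly what MSVC diagnoses as C2765:
//   template void Conv<float>(
//       float, std::optional<std::function<void()>> done = std::nullopt);
template void Conv<float>(float, std::optional<std::function<void()>>);

int main() {
  // Call sites that relied on the default now pass std::nullopt explicitly,
  // mirroring the runtime_conv*.cc changes later in this diff.
  Conv<float>(1.0f, std::nullopt);
}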
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_conv2d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv2d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
@@ -41,7 +43,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
@@ -63,5 +65,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_conv3d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv3d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
@@ -44,7 +46,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
@@ -69,5 +71,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_single_threaded_conv2d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv2d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

@@ -35,7 +37,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF16(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -55,5 +57,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_single_threaded_conv3d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv3d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

@@ -38,7 +40,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -61,5 +63,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
12 changes: 6 additions & 6 deletions xla/service/gpu/fusions/mlir/computation_partitioner.cc
@@ -300,12 +300,12 @@ PartitionedComputation::PartitionedComputation(
absl::StrJoin(roots, "_", [](std::string* out, const auto* root) {
absl::StrAppend(out, root->name());
})));
subgraphs_.push_back(
Subgraph{.name = std::move(name),
.instructions = {instructions.begin(), instructions.end()},
.roots = std::move(roots),
.index_ranges = std::move(ranges),
.root_indexing = std::move(root_indexing)});
subgraphs_.push_back(Subgraph{
/* .name = */ std::move(name),
/* .instructions = */ {instructions.begin(), instructions.end()},
/* .roots = */ std::move(roots),
/* .index_ranges = */ std::move(ranges),
/* .root_indexing = */ std::move(root_indexing)});
}

for (const auto& subgraph : subgraphs_) {
25 changes: 20 additions & 5 deletions xla/service/gpu/kernels/BUILD
@@ -328,7 +328,10 @@ cc_library(
cuda_library(
name = "cutlass_gemm_adaptor",
hdrs = if_cuda_is_configured(["cutlass_gemm_adaptor.cu.h"]),
copts = ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang
}),
deps = if_cuda_is_configured([
":cutlass_gemm",
"@cutlass_archive//:cutlass",
@@ -370,7 +373,10 @@ cc_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16.cu.cc"]),
copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
@@ -381,7 +387,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80.cu.cc"]),
copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
@@ -392,7 +401,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90.cu.cc"]),
copts = ["-Wno-ctad-maybe-unsupported -Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-ctad-maybe-unsupported", "-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
":cutlass_gemm_epilogue",
@@ -404,7 +416,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_f32xf32_to_f32",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_f32xf32_to_f32.cu.cc"]),
copts = ["-Wno-unknown-attributes"],
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
2 changes: 1 addition & 1 deletion xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc
@@ -101,7 +101,7 @@ KernelArgsPacking ArgsPacking(int32_t m, int32_t n, int32_t k,
// object constructed in the storage. For now we ignore it, and it's textbook
// definition of UB, but for CUTLASS kernels we use today it's perfectly safe.
struct Params {
alignas(128) std::byte storage[1024];
alignas(64) std::byte storage[1024];
};

return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed {
2 changes: 1 addition & 1 deletion xla/service/gpu/model/gpu_collective_performance_model.cc
@@ -133,7 +133,7 @@ float GpuPerformanceWithCollectiveModel::GetNvlinkBw(
}

/*static*/ bool GpuPerformanceWithCollectiveModel::InitNvml() {
#if GOOGLE_CUDA
#if GOOGLE_CUDA && defined(PLATFORM_POSIX)
void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
CHECK(libhandle != nullptr) << "Failed to open libnvidia-ml.so.1";

2 changes: 2 additions & 0 deletions xla/service/gpu/model/gpu_collective_performance_model.h
@@ -26,7 +26,9 @@ limitations under the License.
#include "xla/stream_executor/device_description.h"

#if GOOGLE_CUDA
#if defined(PLATFORM_POSIX)
#include <dlfcn.h>
#endif

#include "third_party/gpus/cuda/nvml/include/nvml.h"
// Below is a list of function pointers to be used
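For context, a self-contained sketch of the guard pattern (PLATFORM_POSIX comes from the TSL platform layer; the Windows stub behavior is an assumption of this sketch — the PR itself simply compiles the dlopen path out):

#if defined(PLATFORM_POSIX)
#include <dlfcn.h>
#endif

// Sketch of the guarded-dlopen pattern used by InitNvml().
bool InitNvmlSketch() {
#if defined(PLATFORM_POSIX)
  // POSIX: NVML is loaded dynamically at runtime.
  void* handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
  return handle != nullptr;
#else
  // Windows: <dlfcn.h> does not exist, so the dlopen-based path is compiled
  // out and NVLink bandwidth queries are unavailable.
  return false;
#endif
}

int main() { return InitNvmlSketch() ? 0 : 1; }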
2 changes: 1 addition & 1 deletion xla/service/gpu/stream_executor_util.cc
@@ -437,7 +437,7 @@ static void InitializeTypedBuffer(se::Stream* stream,

// Use a large prime number to fragment the accesses.
constexpr int host_buffer_size = 10069;
static std::vector<T>* host_buffer = [] {
static std::vector<T>* host_buffer = [&] {
auto* ret = new std::vector<T>(host_buffer_size);
// Default-seeded random numbers.
std::mt19937 gen;
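The motivation for this capture change isn't spelled out in the PR; a plausible reading, sketched below with invented names, is that MSVC objected to reading the constexpr local inside a capture-less lambda even though it is not odr-used:

#include <vector>

void InitBufferSketch() {
  constexpr int host_buffer_size = 10069;
  // GCC/Clang accept reading a constexpr local in a capture-less lambda
  // (it is a constant expression, so it is not odr-used); [&] makes the
  // code compile under MSVC as well and changes nothing at runtime.
  static std::vector<float>* host_buffer = [&] {
    return new std::vector<float>(host_buffer_size);
  }();
  (void)host_buffer;
}

int main() { InitBufferSketch(); }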
2 changes: 2 additions & 0 deletions xla/stream_executor/cuda/cuda_diagnostics.cc
@@ -108,6 +108,8 @@ namespace gpu {

#if !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#else
static const char *kDriverVersionPath = "NO NVIDIA DRIVER VERSION FILE";
#endif

// -- class Diagnostician
8 changes: 4 additions & 4 deletions xla/stream_executor/cuda/cuda_dnn.cc
@@ -1749,8 +1749,8 @@ absl::Status CheckAndFetchProjectionWeights(
int64_t size =
dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type);
#endif // CUDNN_VERSION >= 8100
dnn::RnnDescriptor::ParamsRegion region = {
reinterpret_cast<int64_t>(offset), size};
dnn::RnnDescriptor::ParamsRegion region = {static_cast<int64_t>(offset),
size};
weights->push_back(region);
}
return absl::OkStatus();
@@ -1891,8 +1891,8 @@ absl::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
/*nbDims=*/&n_dims, /*filterDimA=*/dims));
int64_t size =
dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type);
dnn::RnnDescriptor::ParamsRegion region = {
reinterpret_cast<int64_t>(offset), size};
dnn::RnnDescriptor::ParamsRegion region = {static_cast<int64_t>(offset),
size};
(type == 0 ? weights : biases).push_back(region);
}
#endif // CUDNN_VERSION >= 8100
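The declared type of offset isn't visible in this hunk; the switch to static_cast implies it is integral, in which case the old spelling only compiled where that type happened to be identical to int64_t. A hedged sketch:

#include <cstdint>

int main() {
  long offset = 512;  // 64-bit on LP64 Linux, 32-bit on LLP64 Windows
  // reinterpret_cast between two *distinct* integer types is ill-formed;
  // the old line compiled on Linux only where the operand's type and
  // int64_t are the same type, while MSVC (where they differ) rejects it:
  //   std::int64_t bad = reinterpret_cast<std::int64_t>(offset);
  // static_cast expresses the intended value conversion portably:
  std::int64_t region_offset = static_cast<std::int64_t>(offset);
  return region_offset == 512 ? 0 : 1;
}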
5 changes: 4 additions & 1 deletion xla/tsl/framework/BUILD
@@ -358,7 +358,10 @@ cc_library(
hdrs = [
"cancellation.h",
],
copts = ["-Wno-thread-safety-precise"],
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-thread-safety-precise"],
}),
visibility = ["//visibility:public"],
deps = [
"@com_google_absl//absl/memory",