Fixed some issues around compiling on Windows. #15444

Closed

Changes from 10 commits
13 changes: 10 additions & 3 deletions third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl
@@ -104,9 +104,16 @@ def if_cuda_newer_than(wanted_ver, if_true, if_false = []):
wanted_major = int(wanted_ver.split('_')[0])
wanted_minor = int(wanted_ver.split('_')[1])

configured_version = "%{cuda_version}"
configured_major = int(configured_version.split('.')[0])
configured_minor = int(configured_version.split('.')[1])
# Strip "64_" which appears in the CUDA version on Windows.
Contributor Author: This is mostly self-explanatory. On Windows, the CUDA version looks something like 64_121. Other parts of the build already handle version numbers of this form, but this spot did not. (A standalone sketch of the parsing rule follows this hunk.)

configured_version = "%{cuda_version}".rsplit("_", 1)[-1]
configured_version_parts = configured_version.split('.')

# On Windows, the major and minor versions are concatenated without a period and the minor only contains one digit.
if len(configured_version_parts) == 1:
configured_version_parts = [configured_version[0:-1], configured_version[-1:]]

configured_major = int(configured_version_parts[0])
configured_minor = int(configured_version_parts[1])

if %{cuda_is_configured} and (wanted_major, wanted_minor) <= (configured_major, configured_minor):
return select({"//conditions:default": if_true})
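For illustration, here is a rough standalone C++ rendering of the normalization rule the Starlark change implements (the helper name and test values are invented for this sketch):

#include <cassert>
#include <string>
#include <utility>

// Normalize a configured CUDA version string into (major, minor).
// On Linux the version looks like "12.1"; on Windows it can look like
// "64_121" (a "64_" prefix, then major and minor concatenated, with a
// single-digit minor).
std::pair<int, int> ParseCudaVersion(std::string version) {
  // Strip everything up to and including the last '_' ("64_121" -> "121").
  if (auto pos = version.rfind('_'); pos != std::string::npos) {
    version = version.substr(pos + 1);
  }
  // Dotted form: "12.1" -> (12, 1).
  if (auto dot = version.find('.'); dot != std::string::npos) {
    return {std::stoi(version.substr(0, dot)),
            std::stoi(version.substr(dot + 1))};
  }
  // Concatenated form: "121" -> (12, 1); the last digit is the minor.
  return {std::stoi(version.substr(0, version.size() - 1)),
          std::stoi(version.substr(version.size() - 1))};
}

int main() {
  assert(ParseCudaVersion("12.3") == std::make_pair(12, 3));
  assert(ParseCudaVersion("64_121") == std::make_pair(12, 1));
}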
24 changes: 12 additions & 12 deletions xla/backends/profiler/gpu/cupti_buffer_events.cc
@@ -186,18 +186,18 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info = collector.annotation_map.LookUp(
graph_trace->deviceId, graph_trace->correlationId);
collector.receive(CuptiTracerEvent{
.type = CuptiTracerEventType::CudaGraph,
.source = CuptiTracerEventSource::Activity,
.name = absl::StrCat("CudaGraphExec:", graph_trace->graphId),
.annotation = info.annotation,
.nvtx_range = info.nvtx_range,
.start_time_ns = graph_trace->start,
.end_time_ns = graph_trace->end,
.device_id = graph_trace->deviceId,
.correlation_id = graph_trace->correlationId,
.context_id = graph_trace->contextId,
.stream_id = graph_trace->streamId,
.graph_id = graph_trace->graphId,
/* .type = */ CuptiTracerEventType::CudaGraph,
Contributor Author: XLA is configured to build as C++17, but designated initializers are a C++20 feature, so compiling on Windows fails with:

error C7555: use of designated initializers requires at least '/std:c++20'

(A minimal illustration follows this hunk.)

/* .source = */ CuptiTracerEventSource::Activity,
/* .name = */ absl::StrCat("CudaGraphExec:", graph_trace->graphId),
/* .annotation = */ info.annotation,
/* .nvtx_range = */ info.nvtx_range,
/* .start_time_ns = */ graph_trace->start,
/* .end_time_ns = */ graph_trace->end,
/* .device_id = */ graph_trace->deviceId,
/* .correlation_id = */ graph_trace->correlationId,
/* .context_id = */ graph_trace->contextId,
/* .stream_id = */ graph_trace->streamId,
/* .graph_id = */ graph_trace->graphId,
});
}

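A minimal illustration of the C7555 fix; the struct and values are invented for this sketch:

struct Event {
  int type;
  const char* name;
};

int main() {
  // C++20 designated form; MSVC in /std:c++17 mode rejects it with C7555
  // (GCC and Clang accept it in C++17 mode as an extension, which is why
  // this only surfaced on Windows):
  //   Event e{.type = 1, .name = "graph"};
  // Portable C++17 equivalent used by the PR: positional aggregate
  // initialization, with the field names kept as comments for readability.
  Event e{/* .type = */ 1, /* .name = */ "graph"};
  (void)e;
}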
2 changes: 1 addition & 1 deletion xla/backends/profiler/gpu/cupti_buffer_events.h
@@ -56,7 +56,7 @@ struct MemcpyDetails {
int8_t dst_mem_kind;

// ID of the hardware channel on which this operation ran.
uint32_t channel_id = -1;
uint32_t channel_id = static_cast<uint32_t>(-1);
Contributor Author: This caused an implicit narrowing-conversion error (I believe it was C2397). The explicit static_cast fixes it. (A minimal illustration follows this hunk.)

// CUpti_ChannelType of the channel above.
int8_t channel_type = 0; // CUPTI_CHANNEL_TYPE_INVALID
};
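A minimal sketch of the narrowing rule in play (the variable name is invented; the sentinel value matches the field above):

#include <cstdint>

int main() {
  // List (brace) initialization forbids implicit narrowing conversions,
  // which appears to be what MSVC reports as C2397:
  //   std::uint32_t bad{-1};               // error: narrowing
  // An explicit cast keeps the intended all-bits-set sentinel:
  std::uint32_t channel_id{static_cast<std::uint32_t>(-1)};  // 0xFFFFFFFF
  return channel_id == 0xFFFFFFFFu ? 0 : 1;
}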
2 changes: 1 addition & 1 deletion xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2129,7 +2129,7 @@ PJRT_Error* PJRT_Layouts_MemoryLayout_Serialize(
PJRT_Layouts_MemoryLayout_Serialize_Args_STRUCT_SIZE, args->struct_size));

PJRT_Layouts_SerializedLayout* s_layout = new PJRT_Layouts_SerializedLayout{
.serialized = args->layout->layout->Serialize()};
/* .serialized = */ args->layout->layout->Serialize()};
args->serialized_layout = s_layout;
args->serialized_bytes = s_layout->serialized.data();
args->serialized_bytes_size = s_layout->serialized.size();
14 changes: 9 additions & 5 deletions xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -199,13 +199,17 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
#endif
}

Contributor Author: The nested #if inside the macro invocation seems not to be supported by MSVC. Hoisting the inner #if outside the macro works and doesn't change the behavior of the code here. (A minimal illustration follows this hunk.)

Removed:

STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
#if TENSORFLOW_USE_ROCM
      RocmName(),
#else
      CudaName(),
#endif
      std::make_unique<StreamExecutorGpuCompiler>());
});

Added:

#if TENSORFLOW_USE_ROCM
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
      RocmName(),
      std::make_unique<StreamExecutorGpuCompiler>());
});
#else
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
  PjRtRegisterCompiler(
      CudaName(),
      std::make_unique<StreamExecutorGpuCompiler>());
});
#endif
} // namespace xla
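A compact illustration of the preprocessor limitation, using an invented REGISTER macro in place of STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER:

#include <cstdio>

// Stand-in for STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER.
#define REGISTER(body) void Register() { body }

// Placing #if/#else/#endif inside a macro argument is undefined behavior in
// standard C++, and MSVC's preprocessor rejects it outright:
//
//   REGISTER(
//   #if TENSORFLOW_USE_ROCM
//     std::puts("rocm");
//   #else
//     std::puts("cuda");
//   #endif
//   )
//
// Hoisting the conditional outside the invocation, as this hunk does, is
// portable and expands to the same code:
#if defined(TENSORFLOW_USE_ROCM)
REGISTER(std::puts("rocm");)
#else
REGISTER(std::puts("cuda");)
#endif

int main() { Register(); }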
8 changes: 4 additions & 4 deletions xla/service/cpu/runtime/conv_impl.h
@@ -41,7 +41,7 @@ void EigenConv2DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation,
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
@eaplatanios (Contributor Author), Jul 29, 2024: This resulted in the following error:

error C2765: 'tensorflow::xla::EigenConv2DImpl': an explicit specialization or instantiation of a function template cannot have any default arguments

I removed the default arguments that were causing the error and now pass those defaults explicitly at the call sites that relied on them. (A minimal illustration follows this hunk.)

std::optional<std::function<void()>> done_callback) {
const Eigen::TensorMap<Eigen::Tensor<const ScalarType, 4, Eigen::RowMajor>,
Eigen::Aligned>
input(lhs, input_batch, input_x, input_y, input_channels);
@@ -129,7 +129,7 @@ void EigenConv3DImpl(
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation,
Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
std::optional<std::function<void()>> done_callback) {
using ConstTType =
Eigen::TensorMap<Eigen::Tensor<const ScalarType, 5, Eigen::RowMajor>,
Eigen::Aligned>;
@@ -223,7 +223,7 @@ void EigenConv3DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation, \
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
@@ -249,7 +249,7 @@ CONV2D_EXTERN_TEMPLATE(Eigen::ThreadPoolDevice, float);
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation, \
Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
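A minimal sketch of the C2765 rule, with an invented signature standing in for EigenConv2DImpl/EigenConv3DImpl:

#include <functional>
#include <optional>

// The template definition no longer carries a default for the callback...
template <typename T>
void Conv(T input, std::optional<std::function<void()>> done) {
  if (done.has_value()) (*done)();
  (void)input;
}

// ...because repeating "= std::nullopt" on an explicit instantiation is
// ill-formed, which is exactly what MSVC diagnoses as C2765:
//   template void Conv<float>(
//       float, std::optional<std::function<void()>> done = std::nullopt);
template void Conv<float>(float, std::optional<std::function<void()>>);

int main() {
  // Call sites that relied on the default now pass std::nullopt explicitly,
  // mirroring the runtime_conv*.cc changes later in this diff.
  Conv<float>(1.0f, std::nullopt);
}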
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_conv2d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv2d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
@@ -41,7 +43,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
@@ -63,5 +65,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_conv3d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv3d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
@@ -44,7 +46,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
@@ -69,5 +71,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_single_threaded_conv2d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv2d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

@@ -35,7 +37,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF16(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -55,5 +57,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions xla/service/cpu/runtime_single_threaded_conv3d.cc
@@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv3d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

@@ -38,7 +40,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -61,5 +63,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
12 changes: 6 additions & 6 deletions xla/service/gpu/fusions/mlir/computation_partitioner.cc
@@ -300,12 +300,12 @@ PartitionedComputation::PartitionedComputation(
absl::StrJoin(roots, "_", [](std::string* out, const auto* root) {
absl::StrAppend(out, root->name());
})));
subgraphs_.push_back(
Subgraph{.name = std::move(name),
.instructions = {instructions.begin(), instructions.end()},
.roots = std::move(roots),
.index_ranges = std::move(ranges),
.root_indexing = std::move(root_indexing)});
subgraphs_.push_back(Subgraph{
/* .name = */ std::move(name),
/* .instructions = */ {instructions.begin(), instructions.end()},
/* .roots = */ std::move(roots),
/* .index_ranges = */ std::move(ranges),
/* .root_indexing = */ std::move(root_indexing)});
}

for (const auto& subgraph : subgraphs_) {
25 changes: 20 additions & 5 deletions xla/service/gpu/kernels/BUILD
@@ -328,7 +328,10 @@ cc_library(
cuda_library(
name = "cutlass_gemm_adaptor",
hdrs = if_cuda_is_configured(["cutlass_gemm_adaptor.cu.h"]),
copts = ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang
}),
deps = if_cuda_is_configured([
":cutlass_gemm",
"@cutlass_archive//:cutlass",
@@ -370,7 +373,10 @@ cc_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16.cu.cc"]),
copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
@@ -381,7 +387,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80.cu.cc"]),
copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
@@ -392,7 +401,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90.cu.cc"]),
copts = ["-Wno-ctad-maybe-unsupported -Wno-unknown-attributes -mllvm -unroll-threshold=100000"],
copts = ["-mllvm", "-unroll-threshold=100000"] + select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-ctad-maybe-unsupported", "-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
":cutlass_gemm_epilogue",
@@ -404,7 +416,10 @@ cuda_library(
cuda_library(
name = "cutlass_gemm_kernel_f32xf32_to_f32",
srcs = if_cuda_is_configured(["cutlass_gemm_kernel_f32xf32_to_f32.cu.cc"]),
copts = ["-Wno-unknown-attributes"],
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-unknown-attributes"],
}),
deps = if_cuda_is_configured([
":cutlass_gemm_adaptor",
"@cutlass_archive//:cutlass",
2 changes: 1 addition & 1 deletion xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc
@@ -101,7 +101,7 @@ KernelArgsPacking ArgsPacking(int32_t m, int32_t n, int32_t k,
// object constructed in the storage. For now we ignore it, and it's textbook
// definition of UB, but for CUTLASS kernels we use today it's perfectly safe.
struct Params {
alignas(128) std::byte storage[1024];
alignas(64) std::byte storage[1024];
};

return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed {
2 changes: 1 addition & 1 deletion xla/service/gpu/model/gpu_collective_performance_model.cc
@@ -133,7 +133,7 @@ float GpuPerformanceWithCollectiveModel::GetNvlinkBw(
}

/*static*/ bool GpuPerformanceWithCollectiveModel::InitNvml() {
#if GOOGLE_CUDA
#if GOOGLE_CUDA && defined(PLATFORM_POSIX)
void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
CHECK(libhandle != nullptr) << "Failed to open libnvidia-ml.so.1";

2 changes: 2 additions & 0 deletions xla/service/gpu/model/gpu_collective_performance_model.h
@@ -26,7 +26,9 @@ limitations under the License.
#include "xla/stream_executor/device_description.h"

#if GOOGLE_CUDA
#if defined(PLATFORM_POSIX)
#include <dlfcn.h>
#endif

#include "third_party/gpus/cuda/nvml/include/nvml.h"
// Below is a list of function pointers to be used
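For context, a self-contained sketch of the guard pattern (PLATFORM_POSIX comes from the TSL platform layer; the Windows stub behavior is an assumption of this sketch — the PR itself simply compiles the dlopen path out):

#if defined(PLATFORM_POSIX)
#include <dlfcn.h>
#endif

// Sketch of the guarded-dlopen pattern used by InitNvml().
bool InitNvmlSketch() {
#if defined(PLATFORM_POSIX)
  // POSIX: NVML is loaded dynamically at runtime.
  void* handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
  return handle != nullptr;
#else
  // Windows: <dlfcn.h> does not exist, so the dlopen-based path is compiled
  // out and NVLink bandwidth queries are unavailable.
  return false;
#endif
}

int main() { return InitNvmlSketch() ? 0 : 1; }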
2 changes: 1 addition & 1 deletion xla/service/gpu/stream_executor_util.cc
@@ -437,7 +437,7 @@ static void InitializeTypedBuffer(se::Stream* stream,

// Use a large prime number to fragment the accesses.
constexpr int host_buffer_size = 10069;
static std::vector<T>* host_buffer = [] {
static std::vector<T>* host_buffer = [&] {
auto* ret = new std::vector<T>(host_buffer_size);
// Default-seeded random numbers.
std::mt19937 gen;
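The motivation for this capture change isn't spelled out in the PR; a plausible reading, sketched below with invented names, is that MSVC objected to reading the constexpr local inside a capture-less lambda even though it is not odr-used:

#include <vector>

void InitBufferSketch() {
  constexpr int host_buffer_size = 10069;
  // GCC/Clang accept reading a constexpr local in a capture-less lambda
  // (it is a constant expression, so it is not odr-used); [&] makes the
  // code compile under MSVC as well and changes nothing at runtime.
  static std::vector<float>* host_buffer = [&] {
    return new std::vector<float>(host_buffer_size);
  }();
  (void)host_buffer;
}

int main() { InitBufferSketch(); }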
2 changes: 2 additions & 0 deletions xla/stream_executor/cuda/cuda_diagnostics.cc
@@ -108,6 +108,8 @@ namespace gpu {

#if !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#else
static const char *kDriverVersionPath = "NO NVIDIA DRIVER VERSION FILE";
#endif

// -- class Diagnostician
8 changes: 4 additions & 4 deletions xla/stream_executor/cuda/cuda_dnn.cc
@@ -1749,8 +1749,8 @@ absl::Status CheckAndFetchProjectionWeights(
int64_t size =
dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type);
#endif // CUDNN_VERSION >= 8100
dnn::RnnDescriptor::ParamsRegion region = {
reinterpret_cast<int64_t>(offset), size};
dnn::RnnDescriptor::ParamsRegion region = {static_cast<int64_t>(offset),
size};
weights->push_back(region);
}
return absl::OkStatus();
@@ -1891,8 +1891,8 @@ absl::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
/*nbDims=*/&n_dims, /*filterDimA=*/dims));
int64_t size =
dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type);
dnn::RnnDescriptor::ParamsRegion region = {
reinterpret_cast<int64_t>(offset), size};
dnn::RnnDescriptor::ParamsRegion region = {static_cast<int64_t>(offset),
size};
(type == 0 ? weights : biases).push_back(region);
}
#endif // CUDNN_VERSION >= 8100
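The declared type of offset isn't visible in this hunk; the switch to static_cast implies it is integral, in which case the old spelling only compiled where that type happened to be identical to int64_t. A hedged sketch:

#include <cstdint>

int main() {
  long offset = 512;  // 64-bit on LP64 Linux, 32-bit on LLP64 Windows
  // reinterpret_cast between two *distinct* integer types is ill-formed;
  // the old line compiled on Linux only where the operand's type and
  // int64_t are the same type, while MSVC (where they differ) rejects it:
  //   std::int64_t bad = reinterpret_cast<std::int64_t>(offset);
  // static_cast expresses the intended value conversion portably:
  std::int64_t region_offset = static_cast<std::int64_t>(offset);
  return region_offset == 512 ? 0 : 1;
}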
5 changes: 4 additions & 1 deletion xla/tsl/framework/BUILD
@@ -358,7 +358,10 @@ cc_library(
hdrs = [
"cancellation.h",
],
copts = ["-Wno-thread-safety-precise"],
copts = select({
"@xla//xla/tsl:windows": [],
"//conditions:default": ["-Wno-thread-safety-precise"],
}),
visibility = ["//visibility:public"],
deps = [
"@com_google_absl//absl/memory",