From eacee95f41abc49a21516ee389861d84a40eca85 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 08:30:41 -0700 Subject: [PATCH 01/25] Fixed some issues around compiling on Windows. --- .../third_party/gpus/cuda/build_defs.bzl.tpl | 13 +++++++--- .../profiler/gpu/cupti_buffer_events.cc | 24 +++++++++---------- .../profiler/gpu/cupti_buffer_events.h | 2 +- xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 2 +- xla/service/cpu/runtime/conv_impl.h | 10 ++++---- xla/service/cpu/runtime_conv2d.cc | 6 +++-- xla/service/cpu/runtime_conv3d.cc | 6 +++-- .../cpu/runtime_single_threaded_conv2d.cc | 6 +++-- .../cpu/runtime_single_threaded_conv3d.cc | 6 +++-- 9 files changed, 45 insertions(+), 30 deletions(-) diff --git a/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl index bc865cecb3240..955103611afe1 100644 --- a/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl @@ -104,9 +104,16 @@ def if_cuda_newer_than(wanted_ver, if_true, if_false = []): wanted_major = int(wanted_ver.split('_')[0]) wanted_minor = int(wanted_ver.split('_')[1]) - configured_version = "%{cuda_version}" - configured_major = int(configured_version.split('.')[0]) - configured_minor = int(configured_version.split('.')[1]) + # Strip "64_" which appears in the CUDA version on Windows. + configured_version = "%{cuda_version}".rsplit("_", 1)[-1] + configured_version_parts = configured_version.split('.') + + # On Windows, the major and minor versions are concatenated without a period and the minor only contains one digit. + if len(configured_version_parts) == 1: + configured_version_parts = [configured_version[0:-1], configured_version[-1:]] + + configured_major = int(configured_version_parts[0]) + configured_minor = int(configured_version_parts[1]) if %{cuda_is_configured} and (wanted_major, wanted_minor) <= (configured_major, configured_minor): return select({"//conditions:default": if_true}) diff --git a/xla/backends/profiler/gpu/cupti_buffer_events.cc b/xla/backends/profiler/gpu/cupti_buffer_events.cc index ccda1b0790235..376b1809ad4b1 100644 --- a/xla/backends/profiler/gpu/cupti_buffer_events.cc +++ b/xla/backends/profiler/gpu/cupti_buffer_events.cc @@ -186,18 +186,18 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector, AnnotationMap::AnnotationInfo info = collector.annotation_map.LookUp( graph_trace->deviceId, graph_trace->correlationId); collector.receive(CuptiTracerEvent{ - .type = CuptiTracerEventType::CudaGraph, - .source = CuptiTracerEventSource::Activity, - .name = absl::StrCat("CudaGraphExec:", graph_trace->graphId), - .annotation = info.annotation, - .nvtx_range = info.nvtx_range, - .start_time_ns = graph_trace->start, - .end_time_ns = graph_trace->end, - .device_id = graph_trace->deviceId, - .correlation_id = graph_trace->correlationId, - .context_id = graph_trace->contextId, - .stream_id = graph_trace->streamId, - .graph_id = graph_trace->graphId, + /* .type = */ CuptiTracerEventType::CudaGraph, + /* .source = */ CuptiTracerEventSource::Activity, + /* .name = */ absl::StrCat("CudaGraphExec:", graph_trace->graphId), + /* .annotation = */ info.annotation, + /* .nvtx_range = */ info.nvtx_range, + /* .start_time_ns = */ graph_trace->start, + /* .end_time_ns = */ graph_trace->end, + /* .device_id = */ graph_trace->deviceId, + /* .correlation_id = */ graph_trace->correlationId, + /* .context_id = */ graph_trace->contextId, + /* .stream_id = */ graph_trace->streamId, + /* .graph_id = */ graph_trace->graphId, }); } diff --git a/xla/backends/profiler/gpu/cupti_buffer_events.h b/xla/backends/profiler/gpu/cupti_buffer_events.h index ac708ed94faed..f58dda54e623c 100644 --- a/xla/backends/profiler/gpu/cupti_buffer_events.h +++ b/xla/backends/profiler/gpu/cupti_buffer_events.h @@ -56,7 +56,7 @@ struct MemcpyDetails { int8_t dst_mem_kind; // ID of the hardware channel on which this operation ran. - uint32_t channel_id = -1; + uint32_t channel_id = static_cast(-1); // CUpti_ChannelType of the channel above. int8_t channel_type = 0; // CUPTI_CHANNEL_TYPE_INVALID }; diff --git a/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc index e1ba7c832f314..54b8dbb651435 100644 --- a/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc +++ b/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc @@ -2129,7 +2129,7 @@ PJRT_Error* PJRT_Layouts_MemoryLayout_Serialize( PJRT_Layouts_MemoryLayout_Serialize_Args_STRUCT_SIZE, args->struct_size)); PJRT_Layouts_SerializedLayout* s_layout = new PJRT_Layouts_SerializedLayout{ - .serialized = args->layout->layout->Serialize()}; + /* .serialized = */ args->layout->layout->Serialize()}; args->serialized_layout = s_layout; args->serialized_bytes = s_layout->serialized.data(); args->serialized_bytes_size = s_layout->serialized.size(); diff --git a/xla/service/cpu/runtime/conv_impl.h b/xla/service/cpu/runtime/conv_impl.h index c6b9747bc0ed5..7a1865c224a75 100644 --- a/xla/service/cpu/runtime/conv_impl.h +++ b/xla/service/cpu/runtime/conv_impl.h @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "unsupported/Eigen/CXX11/Tensor" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive #include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h" #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) @@ -41,7 +41,7 @@ void EigenConv2DImpl( Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation, Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation, Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count, - std::optional> done_callback = std::nullopt) { + std::optional> done_callback) { const Eigen::TensorMap, Eigen::Aligned> input(lhs, input_batch, input_x, input_y, input_channels); @@ -129,7 +129,7 @@ void EigenConv3DImpl( Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation, Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation, Eigen::Index feature_group_count, - std::optional> done_callback = std::nullopt) { + std::optional> done_callback) { using ConstTType = Eigen::TensorMap, Eigen::Aligned>; @@ -223,7 +223,7 @@ void EigenConv3DImpl( Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation, \ Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation, \ Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count, \ - std::optional> done_callback = std::nullopt) + std::optional> done_callback) CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half); CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float); @@ -249,7 +249,7 @@ CONV2D_EXTERN_TEMPLATE(Eigen::ThreadPoolDevice, float); Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation, \ Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation, \ Eigen::Index feature_group_count, \ - std::optional> done_callback = std::nullopt) + std::optional> done_callback) CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half); CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float); diff --git a/xla/service/cpu/runtime_conv2d.cc b/xla/service/cpu/runtime_conv2d.cc index 907f0f5734602..4bc0d03fe8099 100644 --- a/xla/service/cpu/runtime_conv2d.cc +++ b/xla/service/cpu/runtime_conv2d.cc @@ -15,6 +15,8 @@ limitations under the License. #include "xla/service/cpu/runtime_conv2d.h" +#include + #define EIGEN_USE_THREADS #include "absl/base/dynamic_annotations.h" @@ -41,7 +43,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32( kernel_channels, kernel_filters, output_rows, output_cols, row_stride, col_stride, padding_top, padding_bottom, padding_left, padding_right, lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation, - feature_group_count); + feature_group_count, std::nullopt); } ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16( @@ -63,5 +65,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16( kernel_channels, kernel_filters, output_rows, output_cols, row_stride, col_stride, padding_top, padding_bottom, padding_left, padding_right, lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation, - feature_group_count); + feature_group_count, std::nullopt); } diff --git a/xla/service/cpu/runtime_conv3d.cc b/xla/service/cpu/runtime_conv3d.cc index ad86203609e1a..7e83269e289fd 100644 --- a/xla/service/cpu/runtime_conv3d.cc +++ b/xla/service/cpu/runtime_conv3d.cc @@ -15,6 +15,8 @@ limitations under the License. #include "xla/service/cpu/runtime_conv3d.h" +#include + #define EIGEN_USE_THREADS #include "absl/base/dynamic_annotations.h" @@ -44,7 +46,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32( y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before, padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation, lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation, - rhs_z_dilation, feature_group_count); + rhs_z_dilation, feature_group_count, std::nullopt); } ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16( @@ -69,5 +71,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16( y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before, padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation, lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation, - rhs_z_dilation, feature_group_count); + rhs_z_dilation, feature_group_count, std::nullopt); } diff --git a/xla/service/cpu/runtime_single_threaded_conv2d.cc b/xla/service/cpu/runtime_single_threaded_conv2d.cc index 999e53cc29602..a770681987400 100644 --- a/xla/service/cpu/runtime_single_threaded_conv2d.cc +++ b/xla/service/cpu/runtime_single_threaded_conv2d.cc @@ -15,6 +15,8 @@ limitations under the License. #include "xla/service/cpu/runtime_single_threaded_conv2d.h" +#include + #include "absl/base/dynamic_annotations.h" #include "xla/service/cpu/runtime/conv_impl.h" @@ -35,7 +37,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF16( kernel_filters, output_rows, output_cols, row_stride, col_stride, padding_top, padding_bottom, padding_left, padding_right, lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation, - feature_group_count); + feature_group_count, std::nullopt); } ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void @@ -55,5 +57,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF32( kernel_filters, output_rows, output_cols, row_stride, col_stride, padding_top, padding_bottom, padding_left, padding_right, lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation, - feature_group_count); + feature_group_count, std::nullopt); } diff --git a/xla/service/cpu/runtime_single_threaded_conv3d.cc b/xla/service/cpu/runtime_single_threaded_conv3d.cc index 91dd6c8794871..08ff94d06e7e7 100644 --- a/xla/service/cpu/runtime_single_threaded_conv3d.cc +++ b/xla/service/cpu/runtime_single_threaded_conv3d.cc @@ -15,6 +15,8 @@ limitations under the License. #include "xla/service/cpu/runtime_single_threaded_conv3d.h" +#include + #include "absl/base/dynamic_annotations.h" #include "xla/service/cpu/runtime/conv_impl.h" @@ -38,7 +40,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32( z_stride, padding_x_before, padding_x_after, padding_y_before, padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation, lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation, - rhs_z_dilation, feature_group_count); + rhs_z_dilation, feature_group_count, std::nullopt); } ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void @@ -61,5 +63,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16( z_stride, padding_x_before, padding_x_after, padding_y_before, padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation, lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation, - rhs_z_dilation, feature_group_count); + rhs_z_dilation, feature_group_count, std::nullopt); } From b12e4cf0d23c2690111125a651e486ec6a112e54 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 08:32:52 -0700 Subject: [PATCH 02/25] . --- xla/service/cpu/runtime/conv_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xla/service/cpu/runtime/conv_impl.h b/xla/service/cpu/runtime/conv_impl.h index 7a1865c224a75..b97bc85a4edc7 100644 --- a/xla/service/cpu/runtime/conv_impl.h +++ b/xla/service/cpu/runtime/conv_impl.h @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" #include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h" #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) From e23ef176de72cf04555242174a19a407884f3f0e Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 15:57:23 -0700 Subject: [PATCH 03/25] . --- .../gpu/fusions/mlir/computation_partitioner.cc | 10 +++++----- xla/service/gpu/model/hlo_op_profiles_data.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xla/service/gpu/fusions/mlir/computation_partitioner.cc b/xla/service/gpu/fusions/mlir/computation_partitioner.cc index c1cc0de31de57..6b7a35668b732 100644 --- a/xla/service/gpu/fusions/mlir/computation_partitioner.cc +++ b/xla/service/gpu/fusions/mlir/computation_partitioner.cc @@ -301,11 +301,11 @@ PartitionedComputation::PartitionedComputation( absl::StrAppend(out, root->name()); }))); subgraphs_.push_back( - Subgraph{.name = std::move(name), - .instructions = {instructions.begin(), instructions.end()}, - .roots = std::move(roots), - .index_ranges = std::move(ranges), - .root_indexing = std::move(root_indexing)}); + Subgraph{/* .name = */ std::move(name), + /* .instructions = */ {instructions.begin(), instructions.end()}, + /* .roots = */ std::move(roots), + /* .index_ranges = */ std::move(ranges), + /* .root_indexing = */ std::move(root_indexing)}); } for (const auto& subgraph : subgraphs_) { diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 043596a51fef9..4510ff80fac64 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -742,7 +742,7 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( } } } - + "" entries { key: "sm_86" value { From bdae19b9e15c396985703bb7e88a4db6fcddc7f6 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 18:51:40 -0700 Subject: [PATCH 04/25] . --- xla/service/gpu/model/gpu_collective_performance_model.cc | 2 +- xla/service/gpu/model/gpu_collective_performance_model.h | 2 ++ xla/service/gpu/stream_executor_util.cc | 2 +- xla/stream_executor/cuda/cuda_dnn.cc | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/xla/service/gpu/model/gpu_collective_performance_model.cc b/xla/service/gpu/model/gpu_collective_performance_model.cc index f04771f789691..a2934df808ba0 100644 --- a/xla/service/gpu/model/gpu_collective_performance_model.cc +++ b/xla/service/gpu/model/gpu_collective_performance_model.cc @@ -133,7 +133,7 @@ float GpuPerformanceWithCollectiveModel::GetNvlinkBw( } /*static*/ bool GpuPerformanceWithCollectiveModel::InitNvml() { -#if GOOGLE_CUDA +#if GOOGLE_CUDA && defined(PLATFORM_POSIX) void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW); CHECK(libhandle != nullptr) << "Failed to open libnvidia-ml.so.1"; diff --git a/xla/service/gpu/model/gpu_collective_performance_model.h b/xla/service/gpu/model/gpu_collective_performance_model.h index c11a78c684e80..fcb7c89017229 100644 --- a/xla/service/gpu/model/gpu_collective_performance_model.h +++ b/xla/service/gpu/model/gpu_collective_performance_model.h @@ -26,7 +26,9 @@ limitations under the License. #include "xla/stream_executor/device_description.h" #if GOOGLE_CUDA +#if defined(PLATFORM_POSIX) #include +#endif #include "third_party/gpus/cuda/nvml/include/nvml.h" // Below is a list of function pointers to be used diff --git a/xla/service/gpu/stream_executor_util.cc b/xla/service/gpu/stream_executor_util.cc index cde9b554bd504..d0375e1538099 100644 --- a/xla/service/gpu/stream_executor_util.cc +++ b/xla/service/gpu/stream_executor_util.cc @@ -437,7 +437,7 @@ static void InitializeTypedBuffer(se::Stream* stream, // Use a large prime number to fragment the accesses. constexpr int host_buffer_size = 10069; - static std::vector* host_buffer = [] { + static std::vector* host_buffer = [&] { auto* ret = new std::vector(host_buffer_size); // Default-seeded random numbers. std::mt19937 gen; diff --git a/xla/stream_executor/cuda/cuda_dnn.cc b/xla/stream_executor/cuda/cuda_dnn.cc index bbc6a6dc2cca7..58e4b261ebe00 100644 --- a/xla/stream_executor/cuda/cuda_dnn.cc +++ b/xla/stream_executor/cuda/cuda_dnn.cc @@ -1750,7 +1750,7 @@ absl::Status CheckAndFetchProjectionWeights( dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type); #endif // CUDNN_VERSION >= 8100 dnn::RnnDescriptor::ParamsRegion region = { - reinterpret_cast(offset), size}; + static_cast(offset), size}; weights->push_back(region); } return absl::OkStatus(); @@ -1892,7 +1892,7 @@ absl::StatusOr CudnnRnnParamsDescriptor::Create( int64_t size = dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type); dnn::RnnDescriptor::ParamsRegion region = { - reinterpret_cast(offset), size}; + static_cast(offset), size}; (type == 0 ? weights : biases).push_back(region); } #endif // CUDNN_VERSION >= 8100 From 2f90e6ba564f92fafa564b104ed0ce82b7642563 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 18:52:12 -0700 Subject: [PATCH 05/25] . --- xla/stream_executor/cuda/cuda_diagnostics.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xla/stream_executor/cuda/cuda_diagnostics.cc b/xla/stream_executor/cuda/cuda_diagnostics.cc index 561ac0d401e2f..2060fb2e296ea 100644 --- a/xla/stream_executor/cuda/cuda_diagnostics.cc +++ b/xla/stream_executor/cuda/cuda_diagnostics.cc @@ -108,6 +108,8 @@ namespace gpu { #if !defined(PLATFORM_WINDOWS) static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; +#else +static const char *kDriverVersionPath = "NO NVIDIA DRIVER VERSION FILE"; #endif // -- class Diagnostician From 57009793b74c4d7d51fb39547a70a3ec142dadab Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 19:15:38 -0700 Subject: [PATCH 06/25] . --- .../gpu/fusions/mlir/computation_partitioner.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xla/service/gpu/fusions/mlir/computation_partitioner.cc b/xla/service/gpu/fusions/mlir/computation_partitioner.cc index 6b7a35668b732..53d8678e95307 100644 --- a/xla/service/gpu/fusions/mlir/computation_partitioner.cc +++ b/xla/service/gpu/fusions/mlir/computation_partitioner.cc @@ -300,12 +300,12 @@ PartitionedComputation::PartitionedComputation( absl::StrJoin(roots, "_", [](std::string* out, const auto* root) { absl::StrAppend(out, root->name()); }))); - subgraphs_.push_back( - Subgraph{/* .name = */ std::move(name), - /* .instructions = */ {instructions.begin(), instructions.end()}, - /* .roots = */ std::move(roots), - /* .index_ranges = */ std::move(ranges), - /* .root_indexing = */ std::move(root_indexing)}); + subgraphs_.push_back(Subgraph{ + /* .name = */ std::move(name), + /* .instructions = */ {instructions.begin(), instructions.end()}, + /* .roots = */ std::move(roots), + /* .index_ranges = */ std::move(ranges), + /* .root_indexing = */ std::move(root_indexing)}); } for (const auto& subgraph : subgraphs_) { From a978b1f7f70d49f1426fe46b107fdcc3618e3085 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 19:48:57 -0700 Subject: [PATCH 07/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 4510ff80fac64..043596a51fef9 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -742,7 +742,7 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( } } } - "" + entries { key: "sm_86" value { From d7fe81dc9cf909a6a8d70e2be8cfffca4063493e Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 19:49:37 -0700 Subject: [PATCH 08/25] . --- xla/stream_executor/cuda/cuda_dnn.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xla/stream_executor/cuda/cuda_dnn.cc b/xla/stream_executor/cuda/cuda_dnn.cc index 58e4b261ebe00..46a275720d2a9 100644 --- a/xla/stream_executor/cuda/cuda_dnn.cc +++ b/xla/stream_executor/cuda/cuda_dnn.cc @@ -1749,8 +1749,8 @@ absl::Status CheckAndFetchProjectionWeights( int64_t size = dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type); #endif // CUDNN_VERSION >= 8100 - dnn::RnnDescriptor::ParamsRegion region = { - static_cast(offset), size}; + dnn::RnnDescriptor::ParamsRegion region = {static_cast(offset), + size}; weights->push_back(region); } return absl::OkStatus(); @@ -1891,8 +1891,8 @@ absl::StatusOr CudnnRnnParamsDescriptor::Create( /*nbDims=*/&n_dims, /*filterDimA=*/dims)); int64_t size = dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type); - dnn::RnnDescriptor::ParamsRegion region = { - static_cast(offset), size}; + dnn::RnnDescriptor::ParamsRegion region = {static_cast(offset), + size}; (type == 0 ? weights : biases).push_back(region); } #endif // CUDNN_VERSION >= 8100 From fc40d919619330bce596555613e425cb6267eea4 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 20:12:22 -0700 Subject: [PATCH 09/25] . --- xla/pjrt/gpu/se_gpu_pjrt_compiler.cc | 14 +++++++++----- xla/service/gpu/kernels/BUILD | 25 ++++++++++++++++++++----- xla/tsl/framework/BUILD | 5 ++++- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc index 22de6c126af4a..ebb125b18a6a1 100644 --- a/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc +++ b/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc @@ -199,13 +199,17 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options, #endif } +#if TENSORFLOW_USE_ROCM STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, { PjRtRegisterCompiler( -#if TENSORFLOW_USE_ROCM - RocmName(), + RocmName(), + std::make_unique()); +}); #else - CudaName(), -#endif - std::make_unique()); +STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, { + PjRtRegisterCompiler( + CudaName(), + std::make_unique()); }); +#endif } // namespace xla diff --git a/xla/service/gpu/kernels/BUILD b/xla/service/gpu/kernels/BUILD index 9d04094c5fd7c..86e8d499ae3f7 100644 --- a/xla/service/gpu/kernels/BUILD +++ b/xla/service/gpu/kernels/BUILD @@ -328,7 +328,10 @@ cc_library( cuda_library( name = "cutlass_gemm_adaptor", hdrs = if_cuda_is_configured(["cutlass_gemm_adaptor.cu.h"]), - copts = ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang + copts = select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-unknown-attributes"], # __grid_constant__ is not supported by clang + }), deps = if_cuda_is_configured([ ":cutlass_gemm", "@cutlass_archive//:cutlass", @@ -370,7 +373,10 @@ cc_library( cuda_library( name = "cutlass_gemm_kernel_bf16xbf16_to_bf16", srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16.cu.cc"]), - copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"], + copts = ["-mllvm", "-unroll-threshold=100000"] + select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-unknown-attributes"], + }), deps = if_cuda_is_configured([ ":cutlass_gemm_adaptor", "@cutlass_archive//:cutlass", @@ -381,7 +387,10 @@ cuda_library( cuda_library( name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80", srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm80.cu.cc"]), - copts = ["-Wno-unknown-attributes -mllvm -unroll-threshold=100000"], + copts = ["-mllvm", "-unroll-threshold=100000"] + select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-unknown-attributes"], + }), deps = if_cuda_is_configured([ ":cutlass_gemm_adaptor", "@cutlass_archive//:cutlass", @@ -392,7 +401,10 @@ cuda_library( cuda_library( name = "cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90", srcs = if_cuda_is_configured(["cutlass_gemm_kernel_bf16xbf16_to_bf16_sm90.cu.cc"]), - copts = ["-Wno-ctad-maybe-unsupported -Wno-unknown-attributes -mllvm -unroll-threshold=100000"], + copts = ["-mllvm", "-unroll-threshold=100000"] + select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-ctad-maybe-unsupported", "-Wno-unknown-attributes"], + }), deps = if_cuda_is_configured([ ":cutlass_gemm_adaptor", ":cutlass_gemm_epilogue", @@ -404,7 +416,10 @@ cuda_library( cuda_library( name = "cutlass_gemm_kernel_f32xf32_to_f32", srcs = if_cuda_is_configured(["cutlass_gemm_kernel_f32xf32_to_f32.cu.cc"]), - copts = ["-Wno-unknown-attributes"], + copts = select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-unknown-attributes"], + }), deps = if_cuda_is_configured([ ":cutlass_gemm_adaptor", "@cutlass_archive//:cutlass", diff --git a/xla/tsl/framework/BUILD b/xla/tsl/framework/BUILD index 8fa1ca738fabc..871fccb119549 100644 --- a/xla/tsl/framework/BUILD +++ b/xla/tsl/framework/BUILD @@ -358,7 +358,10 @@ cc_library( hdrs = [ "cancellation.h", ], - copts = ["-Wno-thread-safety-precise"], + copts = select({ + "@xla//xla/tsl:windows": [], + "//conditions:default": ["-Wno-thread-safety-precise"], + }), visibility = ["//visibility:public"], deps = [ "@com_google_absl//absl/memory", From 326aec3fd73a67ca3c667cfeb5c88a8ffa52eb3d Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 20:16:45 -0700 Subject: [PATCH 10/25] . --- xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc b/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc index ae39cfbe293d1..dd067f614d3d1 100644 --- a/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc +++ b/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc @@ -101,7 +101,7 @@ KernelArgsPacking ArgsPacking(int32_t m, int32_t n, int32_t k, // object constructed in the storage. For now we ignore it, and it's textbook // definition of UB, but for CUTLASS kernels we use today it's perfectly safe. struct Params { - alignas(128) std::byte storage[1024]; + alignas(64) std::byte storage[1024]; }; return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed { From a7603b7e1be990ff012440c74bd2c2ecbc2b1e2f Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 20:19:41 -0700 Subject: [PATCH 11/25] . --- xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc b/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc index dd067f614d3d1..81b2dbd5df7f1 100644 --- a/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc +++ b/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc @@ -101,7 +101,11 @@ KernelArgsPacking ArgsPacking(int32_t m, int32_t n, int32_t k, // object constructed in the storage. For now we ignore it, and it's textbook // definition of UB, but for CUTLASS kernels we use today it's perfectly safe. struct Params { +#if defined(_MSC_VER) alignas(64) std::byte storage[1024]; +#else + alignas(128) std::byte storage[1024]; +#endif }; return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed { From edcc97a67016584c285d84ac732952c572283119 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 20:29:51 -0700 Subject: [PATCH 12/25] . --- .../spmd/shardy/mhlo_round_trip/mhlo_import.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/xla/service/spmd/shardy/mhlo_round_trip/mhlo_import.cc b/xla/service/spmd/shardy/mhlo_round_trip/mhlo_import.cc index f30815c641692..f72cc4a885c7b 100644 --- a/xla/service/spmd/shardy/mhlo_round_trip/mhlo_import.cc +++ b/xla/service/spmd/shardy/mhlo_round_trip/mhlo_import.cc @@ -246,10 +246,10 @@ SmallVector getOrderedSubDimsFromIotaTileAssignment( tileDimIndex--; } subDims.push_back(SubDimInfo{ - .tileDimIndex = tileDimIndex, - .tileSubDimIndex = subDim++, - .reshapeDimIndex = iota.transpose_perm()[transPermIndex], - .size = axisSize, + /* .tileDimIndex = */ tileDimIndex, + /* .tileSubDimIndex = */ subDim++, + /* .reshapeDimIndex = */ iota.transpose_perm()[transPermIndex], + /* .size = */ axisSize, }); accTileSize *= axisSize; accDeviceSize *= axisSize; @@ -296,8 +296,10 @@ AnalyzeTileAssignmentResult analyzeTileAssignment( for (SubDimInfo subDimInfo : subDims) { mesh.push_back(subDimInfo.size); } - return AnalyzeTileAssignmentResult{.subDims = std::move(subDims), - .localMesh = std::move(mesh)}; + return AnalyzeTileAssignmentResult{ + /* .subDims = */ std::move(subDims), + /* .localMesh = */ std::move(mesh), + }; } // Collect shardings with the attr name kXlaShardingAttr in the `moduleOp`. From cec244808a8df163f9a803db450ca2bebdda9315 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 20:38:23 -0700 Subject: [PATCH 13/25] . --- xla/pjrt/gpu/se_gpu_pjrt_compiler.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc index ebb125b18a6a1..ea9541ce8a03b 100644 --- a/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc +++ b/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc @@ -201,15 +201,13 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options, #if TENSORFLOW_USE_ROCM STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, { - PjRtRegisterCompiler( - RocmName(), - std::make_unique()); + PjRtRegisterCompiler(RocmName(), + std::make_unique()); }); #else STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, { - PjRtRegisterCompiler( - CudaName(), - std::make_unique()); + PjRtRegisterCompiler(CudaName(), + std::make_unique()); }); #endif } // namespace xla From df3eb2215eea9076cb352378c5745e113df7cc7d Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 23:46:49 -0700 Subject: [PATCH 14/25] . --- xla/hlo/evaluator/hlo_evaluator.cc | 2 -- xla/hlo/evaluator/hlo_evaluator.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/xla/hlo/evaluator/hlo_evaluator.cc b/xla/hlo/evaluator/hlo_evaluator.cc index 9b51dca772101..8eb6caf24ab88 100644 --- a/xla/hlo/evaluator/hlo_evaluator.cc +++ b/xla/hlo/evaluator/hlo_evaluator.cc @@ -535,8 +535,6 @@ std::optional EvaluateWhileLoopParamInitValue( namespace internal { -constexpr absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl"; - std::optional ParseEvalErrorDetail(const absl::Status& error) { auto error_detail = error.GetPayload(kEvalErrorDetailUrl); if (!error_detail.has_value() || error_detail->empty()) { diff --git a/xla/hlo/evaluator/hlo_evaluator.h b/xla/hlo/evaluator/hlo_evaluator.h index 2f91c39c857c9..37dbb9efda855 100644 --- a/xla/hlo/evaluator/hlo_evaluator.h +++ b/xla/hlo/evaluator/hlo_evaluator.h @@ -530,7 +530,7 @@ enum class EvalErrorDetail : uint32_t { kDynamicValueDependence = 0, }; -extern const absl::string_view kEvalErrorDetailUrl; +extern const absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl"; std::optional ParseEvalErrorDetail(const absl::Status& error); From 8997345fd1e1aa6f55e445615460124c6e14417c Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Mon, 29 Jul 2024 23:47:16 -0700 Subject: [PATCH 15/25] . --- xla/service/gpu/kernels/cutlass_gemm_adaptor.cu.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xla/service/gpu/kernels/cutlass_gemm_adaptor.cu.h b/xla/service/gpu/kernels/cutlass_gemm_adaptor.cu.h index b8171d615dcfe..53a6ac6dc6cad 100644 --- a/xla/service/gpu/kernels/cutlass_gemm_adaptor.cu.h +++ b/xla/service/gpu/kernels/cutlass_gemm_adaptor.cu.h @@ -199,8 +199,9 @@ namespace adaptor_3x { template static std::optional ClusterDim() { typename Traits::Kernel::DispatchPolicy::ClusterShape cluster; - return Dim3{cute::get<0>(cluster), cute::get<1>(cluster), - cute::get<2>(cluster)}; + return Dim3{static_cast(cute::get<0>(cluster)), + static_cast(cute::get<1>(cluster)), + static_cast(cute::get<2>(cluster))}; } template From 219a9f1bff7fb12c3407ab2e47512560001900fe Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 11:57:43 -0700 Subject: [PATCH 16/25] . --- .../tsl/third_party/gpus/cuda/build_defs.bzl.tpl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl index 955103611afe1..bc865cecb3240 100644 --- a/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/tsl/third_party/gpus/cuda/build_defs.bzl.tpl @@ -104,16 +104,9 @@ def if_cuda_newer_than(wanted_ver, if_true, if_false = []): wanted_major = int(wanted_ver.split('_')[0]) wanted_minor = int(wanted_ver.split('_')[1]) - # Strip "64_" which appears in the CUDA version on Windows. - configured_version = "%{cuda_version}".rsplit("_", 1)[-1] - configured_version_parts = configured_version.split('.') - - # On Windows, the major and minor versions are concatenated without a period and the minor only contains one digit. - if len(configured_version_parts) == 1: - configured_version_parts = [configured_version[0:-1], configured_version[-1:]] - - configured_major = int(configured_version_parts[0]) - configured_minor = int(configured_version_parts[1]) + configured_version = "%{cuda_version}" + configured_major = int(configured_version.split('.')[0]) + configured_minor = int(configured_version.split('.')[1]) if %{cuda_is_configured} and (wanted_major, wanted_minor) <= (configured_major, configured_minor): return select({"//conditions:default": if_true}) From 73f3cd7e0135ec05c97595f795ec318fb635bd32 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:18:45 -0700 Subject: [PATCH 17/25] . --- xla/hlo/evaluator/hlo_evaluator.cc | 4 ++++ xla/hlo/evaluator/hlo_evaluator.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/xla/hlo/evaluator/hlo_evaluator.cc b/xla/hlo/evaluator/hlo_evaluator.cc index 8eb6caf24ab88..761006071dd1f 100644 --- a/xla/hlo/evaluator/hlo_evaluator.cc +++ b/xla/hlo/evaluator/hlo_evaluator.cc @@ -535,6 +535,10 @@ std::optional EvaluateWhileLoopParamInitValue( namespace internal { +#if !defined(_MSC_VER) +constexpr absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl"; +#endif + std::optional ParseEvalErrorDetail(const absl::Status& error) { auto error_detail = error.GetPayload(kEvalErrorDetailUrl); if (!error_detail.has_value() || error_detail->empty()) { diff --git a/xla/hlo/evaluator/hlo_evaluator.h b/xla/hlo/evaluator/hlo_evaluator.h index 37dbb9efda855..0eab57a0d68de 100644 --- a/xla/hlo/evaluator/hlo_evaluator.h +++ b/xla/hlo/evaluator/hlo_evaluator.h @@ -530,7 +530,11 @@ enum class EvalErrorDetail : uint32_t { kDynamicValueDependence = 0, }; +#if defined(_MSC_VER) extern const absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl"; +#else +extern const absl::string_view kEvalErrorDetailUrl; +#endif std::optional ParseEvalErrorDetail(const absl::Status& error); From d266259d4f467011bcd754bbdea14cc10723a01d Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:49:17 -0700 Subject: [PATCH 18/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 043596a51fef9..08caad39413e8 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -741,8 +741,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 15 } } - } - + })pb" + R"pb( entries { key: "sm_86" value { @@ -1307,8 +1307,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 97 } } - } - + })pb" + R"pb( entries { key: "sm_80" # "NVIDIA A100-SXM4-40GB" value { @@ -1607,8 +1607,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 6054 } } - } - + })pb" + R"pb( entries { key: "sm_70" # "Tesla V100-SXM2-16GB" value { @@ -2341,8 +2341,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 15 } } - } - + })pb" + R"pb( entries { key: "sm_60" # "Tesla P100-SXM2-16GB" value { @@ -3117,8 +3117,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 20 } } - } - + })pb" + R"pb( entries { key: "sm_75" value { From d5c3cce7dec3d80e43462af8e6a5de5804341526 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:50:58 -0700 Subject: [PATCH 19/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 7441 +++++++++--------- 1 file changed, 3721 insertions(+), 3720 deletions(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 08caad39413e8..1d18bff8ff793 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -1,3720 +1,3721 @@ -/* Copyright 2023 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ -#define XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ - -namespace xla { -namespace gpu { - -// The data below is obtained with -// xla/service/gpu/model:hlo_op_profiler_run - -constexpr char kDeviceHloOpProfiles[] = R"pb( - entries { - key: "sm_90" # "NVIDIA H100 80GB HBM3" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 356 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 297 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 300 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 304 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 629 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 997 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 102 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 182 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 245 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 993 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 502 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 451 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 43 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 178 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 978 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 166 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 229 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 958 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 467 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 19 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 510 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 586 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 558 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 712 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1259 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 277 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 554 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 332 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 930 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 2415 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 641 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 2055 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 756 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3148 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 2324 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 4344 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2379 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 6462 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 498 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 5532 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1750 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1342 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1275 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2455 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 2403 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 5500 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 1999 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 6636 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 4613 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 13131 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2280 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 39 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 8363 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { - key: "sm_86" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 370 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 392 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 367 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 918 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 601 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 388 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 302 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 399 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 115 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 838 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 604 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 691 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 108 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 212 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 482 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 975 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 867 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 86 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 381 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 244 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 176 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 486 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6339 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1717 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1652 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1900 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 608 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 2073 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2412 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 698 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 986 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1609 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3747 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 2016 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5511 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1360 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1400 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 950 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 842 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2383 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 3193 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 5353 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 687 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3351 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 6613 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 4028 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 4161 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 7599 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 6962 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 11318 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 5878 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 15606 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 9939 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 39027 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 7941 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 270 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 18205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 97 - } - } - })pb" - R"pb( - entries { - key: "sm_80" # "NVIDIA A100-SXM4-40GB" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 468 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1094 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 391 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 454 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 908 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 744 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 1195 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 321 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 346 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 124 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 499 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 259 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 504 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1221 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 1638 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 572 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 699 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1223 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 329 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 597 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 397 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 733 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1080 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 831 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 1861 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 1037 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 1029 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 6618 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 4131 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 2309 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2371 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 2405 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 3945 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 2284 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 5304 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 3618 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 13564 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 3037 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 6054 - } - } - })pb" - R"pb( - entries { - key: "sm_70" # "Tesla V100-SXM2-16GB" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 345 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 287 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 113 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 128 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 241 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 232 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 449 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 73 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 709 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 205 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 180 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 269 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 21 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 673 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 624 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 358 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 410 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 318 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 263 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 618 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 324 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 973 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 501 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2099 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 780 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 722 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 703 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 758 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 654 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 6282 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1924 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 8151 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 480 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 42 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8105 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1808 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1487 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1334 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1618 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 7261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 2013 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8237 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 6343 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 15355 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2423 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 45 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 9810 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { - key: "sm_60" # "Tesla P100-SXM2-16GB" - value { - entries { - instruction { - opcode: "add" - shape { element_type: S8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 444 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1018 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 82 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 1569 - } - entries { - instruction { - opcode: "add" - shape { element_type: U8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 299 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "add" - shape { element_type: U16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 307 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: U32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 888 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 1548 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 233 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 532 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 142 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 325 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 497 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 458 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 675 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 68 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1012 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 494 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 109 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 337 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 328 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 663 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 35 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 988 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 645 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1427 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 405 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 544 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 441 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 784 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 355 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1640 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1169 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 565 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2682 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1128 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1021 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 991 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 1107 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 994 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2158 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1139 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2934 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1883 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 16282 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 760 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8335 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 4302 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 3665 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 3656 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2057 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1806 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 6135 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 4169 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8595 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 5294 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 22278 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 3194 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 17893 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 20 - } - } - })pb" - R"pb( - entries { - key: "sm_75" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 360 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 357 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 339 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 296 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 979 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 495 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 334 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 290 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 118 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 812 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 515 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 792 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 132 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 342 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 794 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 123 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 175 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1120 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 783 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 83 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 319 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 218 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 181 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 717 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 167 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1085 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6494 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1800 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1630 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1929 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 596 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1774 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2430 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 705 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 984 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1535 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3744 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 1915 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5538 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1702 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1503 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1474 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 835 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2232 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1632 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2989 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2263 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 4847 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3219 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 6474 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 4962 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 4037 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 7286 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 6848 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 10748 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 5391 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 15981 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 9653 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 38206 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 8040 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 273 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 18550 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 97 - } - } - } -)pb"; - -} // namespace gpu -} // namespace xla - -#endif // XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ +/* Copyright 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ +#define XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ + +namespace xla { +namespace gpu { + +// The data below is obtained with +// xla/service/gpu/model:hlo_op_profiler_run + +constexpr char kDeviceHloOpProfiles[] = + R"pb( + entries { + key: "sm_90" # "NVIDIA H100 80GB HBM3" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 356 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 297 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 300 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 304 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 629 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 997 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 102 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 182 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 245 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 993 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 502 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 451 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 43 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 178 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 978 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 166 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 229 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 958 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 467 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 19 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 510 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 586 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 558 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 712 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1259 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 277 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 554 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 332 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 930 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 2415 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 641 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 2055 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 756 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3148 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 2324 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 4344 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2379 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6462 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 498 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 5532 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 1750 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 1342 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 1275 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2455 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 2403 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 5500 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 1999 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 6636 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 4613 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13131 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 2280 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 39 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 8363 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 15 + } + } + })pb" + R"pb( + entries { + key: "sm_86" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 370 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 392 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 367 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 918 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 601 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 388 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 302 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 399 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 115 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 838 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 604 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 691 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 108 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 212 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 482 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 975 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 867 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 86 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 381 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 244 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 176 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 486 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6339 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1717 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1652 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1900 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 608 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 2073 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2412 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 698 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1789 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 986 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1609 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3747 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 2016 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5511 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1360 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1400 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 950 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 842 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2383 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 3193 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 5353 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 687 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3351 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 6613 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 4028 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 4161 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 7599 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 6962 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 11318 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 5878 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 15606 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 9939 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 39027 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 7941 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 270 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 18205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 97 + } + } + })pb" + R"pb( + entries { + key: "sm_80" # "NVIDIA A100-SXM4-40GB" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 468 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1094 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 391 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 454 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 908 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 744 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 1195 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 321 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 346 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 124 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 499 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 259 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 504 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1221 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 1638 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 572 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 699 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1223 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 329 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 597 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 397 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 733 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 1080 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 831 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 1861 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 1037 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 1029 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6618 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 4131 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 2309 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2371 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 2405 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 3945 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 2284 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 5304 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 3618 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13564 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 3037 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 6054 + } + } + })pb" + R"pb( + entries { + key: "sm_70" # "Tesla V100-SXM2-16GB" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 345 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 287 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 113 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 128 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 241 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 232 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 449 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 73 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 709 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 205 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 180 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 269 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 21 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 673 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 624 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 358 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 410 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 318 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 263 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 618 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 324 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 973 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 501 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2099 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 780 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 722 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 703 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 758 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 654 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3261 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 789 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 6282 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 1924 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 8151 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 480 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 42 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 8105 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 1808 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 1487 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 1334 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 1618 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 7261 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 2013 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 8237 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 6343 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 15355 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 2423 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 45 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 9810 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 15 + } + } + })pb" + R"pb( + entries { + key: "sm_60" # "Tesla P100-SXM2-16GB" + value { + entries { + instruction { + opcode: "add" + shape { element_type: S8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 444 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1018 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 82 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 1569 + } + entries { + instruction { + opcode: "add" + shape { element_type: U8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 299 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "add" + shape { element_type: U16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 307 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: U32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 888 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 1548 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 233 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 532 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 142 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 325 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 497 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 458 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 675 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 68 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1012 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 494 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 109 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 337 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 328 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 663 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 35 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 988 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 645 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1427 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 405 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 544 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 441 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 784 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 355 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1640 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 1169 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 565 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2682 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1128 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1021 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 991 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 1107 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 994 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2158 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1139 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2934 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 1883 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 16282 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 760 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 65 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 8335 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 4302 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 3665 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 3656 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2057 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 1806 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 6135 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 4169 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 8595 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 5294 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 22278 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 3194 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 65 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 17893 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 20 + } + } + })pb" + R"pb( + entries { + key: "sm_75" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 360 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 357 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 339 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 296 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 979 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 495 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 334 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 290 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 118 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 812 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 515 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 792 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 132 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 342 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 794 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 123 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 175 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1120 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 783 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 83 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 319 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 218 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 181 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 717 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 167 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1085 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6494 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1800 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1630 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1929 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 596 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1774 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2430 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 705 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 984 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1535 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3744 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 1915 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5538 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1702 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1503 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1474 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 835 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2232 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1632 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2989 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2263 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 4847 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3219 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 6474 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 4962 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 4037 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 7286 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 6848 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 10748 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 5391 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 15981 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 9653 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 38206 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 8040 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 273 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 18550 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 97 + } + } + } + )pb"; + +} // namespace gpu +} // namespace xla + +#endif // XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ From d5840041014526901ff36efc7bd61051bf92989c Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:53:09 -0700 Subject: [PATCH 20/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 5030 +++++++++--------- 1 file changed, 2515 insertions(+), 2515 deletions(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 1d18bff8ff793..92d5979c0f530 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -24,330 +24,959 @@ namespace gpu { constexpr char kDeviceHloOpProfiles[] = R"pb( - entries { - key: "sm_90" # "NVIDIA H100 80GB HBM3" - value { + entries { key: "sm_90" # "NVIDIA H100 80GB HBM3" + + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 356 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 297 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 300 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 304 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 629 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 997 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 102 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 182 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 245 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 993 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 502 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 451 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 43 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 178 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 978 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 166 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 229 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 958 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 467 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 19 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 510 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 586 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 558 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 712 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1259 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 277 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 554 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 332 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 930 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 2415 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 641 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 2055 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 756 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3148 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 2324 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 4344 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2379 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 7 + })pb" + R"pb( + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6462 + } entries { instruction { opcode: "divide" - shape { element_type: S8 } + shape { element_type: C64 } } - clock_cycles: 356 + clock_cycles: 498 } entries { instruction { opcode: "multiply" - shape { element_type: S8 } + shape { element_type: C64 } } - clock_cycles: 7 + clock_cycles: 79 } entries { instruction { opcode: "power" - shape { element_type: S8 } + shape { element_type: C64 } } - clock_cycles: 122 + clock_cycles: 5532 } entries { instruction { - opcode: "divide" - shape { element_type: S16 } + opcode: "subtract" + shape { element_type: C64 } } - clock_cycles: 364 + clock_cycles: 7 } entries { instruction { - opcode: "multiply" - shape { element_type: S16 } + opcode: "cosine" + shape { element_type: C128 } } - clock_cycles: 7 + clock_cycles: 1750 } entries { instruction { - opcode: "power" - shape { element_type: S16 } + opcode: "exponential" + shape { element_type: C128 } } - clock_cycles: 122 + clock_cycles: 1342 } entries { instruction { - opcode: "divide" - shape { element_type: S32 } + opcode: "exponential-minus-one" + shape { element_type: C128 } } - clock_cycles: 297 + clock_cycles: 1275 } entries { instruction { - opcode: "multiply" - shape { element_type: S32 } + opcode: "log" + shape { element_type: C128 } } - clock_cycles: 3 + clock_cycles: 2455 } entries { instruction { - opcode: "power" - shape { element_type: S32 } + opcode: "log-plus-one" + shape { element_type: C128 } } - clock_cycles: 71 + clock_cycles: 2403 } entries { instruction { - opcode: "divide" - shape { element_type: S64 } + opcode: "rsqrt" + shape { element_type: C128 } } - clock_cycles: 685 + clock_cycles: 5500 } entries { instruction { - opcode: "multiply" - shape { element_type: S64 } + opcode: "sine" + shape { element_type: C128 } } - clock_cycles: 11 + clock_cycles: 1999 } entries { instruction { - opcode: "power" - shape { element_type: S64 } + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 6636 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 4613 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } } - clock_cycles: 253 + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13131 } entries { instruction { opcode: "divide" - shape { element_type: U8 } + shape { element_type: C128 } } - clock_cycles: 300 + clock_cycles: 2280 } entries { instruction { opcode: "multiply" - shape { element_type: U8 } + shape { element_type: C128 } } - clock_cycles: 7 + clock_cycles: 39 } entries { instruction { opcode: "power" - shape { element_type: U8 } + shape { element_type: C128 } + } + clock_cycles: 8363 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } } - clock_cycles: 122 + clock_cycles: 15 } + } + })pb" + R"pb( + entries { + key: "sm_86" + value { entries { instruction { opcode: "divide" - shape { element_type: U16 } + shape { element_type: S8 } } - clock_cycles: 304 + clock_cycles: 370 } entries { instruction { - opcode: "multiply" - shape { element_type: U16 } + opcode: "power" + shape { element_type: S8 } } - clock_cycles: 7 + clock_cycles: 392 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 367 } entries { instruction { opcode: "power" - shape { element_type: U16 } + shape { element_type: S16 } } - clock_cycles: 126 + clock_cycles: 396 } entries { instruction { opcode: "divide" - shape { element_type: U32 } + shape { element_type: S32 } } - clock_cycles: 122 + clock_cycles: 306 } entries { instruction { - opcode: "multiply" - shape { element_type: U32 } + opcode: "divide" + shape { element_type: S64 } } - clock_cycles: 3 + clock_cycles: 918 } entries { instruction { opcode: "power" - shape { element_type: U32 } + shape { element_type: S64 } } - clock_cycles: 71 + clock_cycles: 601 } entries { instruction { opcode: "divide" - shape { element_type: U64 } + shape { element_type: U8 } } - clock_cycles: 629 + clock_cycles: 306 } entries { instruction { - opcode: "multiply" - shape { element_type: U64 } + opcode: "power" + shape { element_type: U8 } } - clock_cycles: 11 + clock_cycles: 388 } entries { instruction { - opcode: "power" - shape { element_type: U64 } + opcode: "divide" + shape { element_type: U16 } } - clock_cycles: 253 + clock_cycles: 302 } entries { instruction { - opcode: "cbrt" - shape { element_type: F16 } + opcode: "power" + shape { element_type: U16 } } - clock_cycles: 201 + clock_cycles: 399 } entries { instruction { - opcode: "cosine" - shape { element_type: F16 } + opcode: "divide" + shape { element_type: U32 } } - clock_cycles: 997 + clock_cycles: 115 } entries { instruction { - opcode: "exponential" - shape { element_type: F16 } + opcode: "divide" + shape { element_type: U64 } } - clock_cycles: 102 + clock_cycles: 838 } entries { instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } + opcode: "power" + shape { element_type: U64 } } - clock_cycles: 217 + clock_cycles: 604 } entries { instruction { - opcode: "log" + opcode: "cbrt" shape { element_type: F16 } } - clock_cycles: 182 + clock_cycles: 925 } entries { instruction { - opcode: "log-plus-one" + opcode: "cosine" shape { element_type: F16 } } - clock_cycles: 245 + clock_cycles: 691 } entries { instruction { - opcode: "rsqrt" + opcode: "exponential" shape { element_type: F16 } } - clock_cycles: 95 + clock_cycles: 108 } entries { instruction { - opcode: "sine" + opcode: "exponential-minus-one" shape { element_type: F16 } } - clock_cycles: 993 + clock_cycles: 396 } entries { instruction { - opcode: "sqrt" + opcode: "log" shape { element_type: F16 } } - clock_cycles: 95 + clock_cycles: 266 } entries { instruction { - opcode: "tanh" + opcode: "log-plus-one" shape { element_type: F16 } } - clock_cycles: 502 + clock_cycles: 284 } entries { instruction { - opcode: "add" + opcode: "logistic" shape { element_type: F16 } } - clock_cycles: 7 + clock_cycles: 226 } entries { instruction { - opcode: "atan2" + opcode: "rsqrt" shape { element_type: F16 } } - clock_cycles: 451 + clock_cycles: 97 } entries { instruction { - opcode: "divide" + opcode: "sqrt" shape { element_type: F16 } } - clock_cycles: 43 + clock_cycles: 97 } entries { instruction { - opcode: "multiply" + opcode: "tanh" shape { element_type: F16 } } - clock_cycles: 7 + clock_cycles: 212 } entries { instruction { - opcode: "power" + opcode: "atan2" shape { element_type: F16 } } - clock_cycles: 526 + clock_cycles: 482 } entries { instruction { - opcode: "subtract" + opcode: "power" shape { element_type: F16 } } - clock_cycles: 7 + clock_cycles: 975 } entries { instruction { opcode: "cbrt" shape { element_type: F32 } } - clock_cycles: 178 + clock_cycles: 867 } entries { instruction { opcode: "cosine" shape { element_type: F32 } } - clock_cycles: 978 + clock_cycles: 662 } entries { instruction { opcode: "exponential" shape { element_type: F32 } } - clock_cycles: 79 + clock_cycles: 86 } entries { instruction { opcode: "exponential-minus-one" shape { element_type: F32 } } - clock_cycles: 190 + clock_cycles: 381 } entries { instruction { opcode: "log" shape { element_type: F32 } } - clock_cycles: 166 + clock_cycles: 244 } entries { instruction { opcode: "log-plus-one" shape { element_type: F32 } } - clock_cycles: 229 + clock_cycles: 262 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 176 } entries { instruction { @@ -361,7 +990,7 @@ constexpr char kDeviceHloOpProfiles[] = opcode: "sine" shape { element_type: F32 } } - clock_cycles: 958 + clock_cycles: 662 } entries { instruction { @@ -375,3237 +1004,2608 @@ constexpr char kDeviceHloOpProfiles[] = opcode: "tanh" shape { element_type: F32 } } - clock_cycles: 467 + clock_cycles: 190 } entries { instruction { - opcode: "add" + opcode: "atan2" shape { element_type: F32 } } - clock_cycles: 7 + clock_cycles: 486 } entries { instruction { - opcode: "atan2" + opcode: "power" shape { element_type: F32 } } - clock_cycles: 431 + clock_cycles: 925 } entries { instruction { - opcode: "divide" - shape { element_type: F32 } + opcode: "cbrt" + shape { element_type: F64 } } - clock_cycles: 19 + clock_cycles: 6339 } entries { instruction { - opcode: "multiply" - shape { element_type: F32 } + opcode: "cosine" + shape { element_type: F64 } } - clock_cycles: 3 + clock_cycles: 1717 } entries { instruction { - opcode: "power" - shape { element_type: F32 } + opcode: "exponential" + shape { element_type: F64 } } - clock_cycles: 510 + clock_cycles: 1652 } entries { instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 586 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 558 - } - entries { - instruction { - opcode: "exponential" + opcode: "exponential-minus-one" shape { element_type: F64 } } - clock_cycles: 376 + clock_cycles: 1900 } entries { instruction { - opcode: "exponential-minus-one" + opcode: "log" shape { element_type: F64 } } - clock_cycles: 712 + clock_cycles: 608 } entries { instruction { - opcode: "log" + opcode: "log-plus-one" shape { element_type: F64 } } - clock_cycles: 815 + clock_cycles: 2073 } entries { instruction { - opcode: "log-plus-one" + opcode: "logistic" shape { element_type: F64 } } - clock_cycles: 1259 + clock_cycles: 2412 } entries { instruction { opcode: "rsqrt" shape { element_type: F64 } } - clock_cycles: 277 + clock_cycles: 698 } entries { instruction { opcode: "sine" shape { element_type: F64 } } - clock_cycles: 554 + clock_cycles: 1789 } entries { instruction { opcode: "sqrt" shape { element_type: F64 } } - clock_cycles: 332 + clock_cycles: 986 } entries { instruction { opcode: "tanh" shape { element_type: F64 } } - clock_cycles: 431 + clock_cycles: 1609 } entries { instruction { opcode: "add" shape { element_type: F64 } } - clock_cycles: 15 + clock_cycles: 97 } entries { instruction { opcode: "atan2" shape { element_type: F64 } } - clock_cycles: 930 + clock_cycles: 3747 } entries { instruction { opcode: "divide" shape { element_type: F64 } } - clock_cycles: 526 + clock_cycles: 2016 } entries { instruction { opcode: "multiply" shape { element_type: F64 } } - clock_cycles: 15 + clock_cycles: 97 } entries { instruction { opcode: "power" shape { element_type: F64 } } - clock_cycles: 2205 + clock_cycles: 5511 } entries { instruction { opcode: "subtract" shape { element_type: F64 } } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 2415 + clock_cycles: 97 } entries { instruction { opcode: "exponential" shape { element_type: C64 } } - clock_cycles: 641 + clock_cycles: 1360 } entries { instruction { opcode: "exponential-minus-one" shape { element_type: C64 } } - clock_cycles: 2055 + clock_cycles: 1400 } entries { instruction { opcode: "log" shape { element_type: C64 } } - clock_cycles: 756 + clock_cycles: 950 } entries { instruction { opcode: "log-plus-one" shape { element_type: C64 } } - clock_cycles: 633 + clock_cycles: 842 } entries { instruction { opcode: "rsqrt" shape { element_type: C64 } } - clock_cycles: 3148 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 2324 + clock_cycles: 2383 } entries { instruction { opcode: "sqrt" shape { element_type: C64 } } - clock_cycles: 4344 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2379 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 7 + clock_cycles: 3193 } entries { instruction { opcode: "atan2" shape { element_type: C64 } } - clock_cycles: 6462 + clock_cycles: 5353 } entries { instruction { opcode: "divide" shape { element_type: C64 } } - clock_cycles: 498 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 79 + clock_cycles: 687 } entries { instruction { opcode: "power" shape { element_type: C64 } } - clock_cycles: 5532 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 7 + clock_cycles: 3351 } entries { instruction { opcode: "cosine" shape { element_type: C128 } } - clock_cycles: 1750 + clock_cycles: 6613 } entries { instruction { opcode: "exponential" shape { element_type: C128 } } - clock_cycles: 1342 + clock_cycles: 4028 } entries { instruction { opcode: "exponential-minus-one" shape { element_type: C128 } } - clock_cycles: 1275 + clock_cycles: 4161 } entries { instruction { opcode: "log" shape { element_type: C128 } } - clock_cycles: 2455 + clock_cycles: 7599 } entries { instruction { opcode: "log-plus-one" shape { element_type: C128 } } - clock_cycles: 2403 + clock_cycles: 6962 } entries { instruction { opcode: "rsqrt" shape { element_type: C128 } } - clock_cycles: 5500 + clock_cycles: 11318 } entries { instruction { opcode: "sine" shape { element_type: C128 } } - clock_cycles: 1999 + clock_cycles: 5878 } entries { instruction { opcode: "sqrt" shape { element_type: C128 } } - clock_cycles: 6636 + clock_cycles: 15606 } entries { instruction { opcode: "tanh" shape { element_type: C128 } } - clock_cycles: 4613 + clock_cycles: 9939 } entries { instruction { opcode: "add" shape { element_type: C128 } } - clock_cycles: 15 + clock_cycles: 97 } entries { instruction { opcode: "atan2" shape { element_type: C128 } } - clock_cycles: 13131 + clock_cycles: 39027 } entries { instruction { opcode: "divide" shape { element_type: C128 } } - clock_cycles: 2280 + clock_cycles: 7941 } entries { instruction { opcode: "multiply" shape { element_type: C128 } } - clock_cycles: 39 + clock_cycles: 270 } entries { instruction { opcode: "power" shape { element_type: C128 } } - clock_cycles: 8363 + clock_cycles: 18205 } entries { instruction { opcode: "subtract" shape { element_type: C128 } } - clock_cycles: 15 + clock_cycles: 97 } } })pb" R"pb( entries { - key: "sm_86" + key: "sm_80" # "NVIDIA A100-SXM4-40GB" + value { entries { instruction { opcode: "divide" shape { element_type: S8 } } - clock_cycles: 370 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 392 + clock_cycles: 417 } entries { instruction { opcode: "divide" shape { element_type: S16 } } - clock_cycles: 367 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 306 + clock_cycles: 468 } entries { instruction { opcode: "divide" shape { element_type: S64 } } - clock_cycles: 918 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 601 + clock_cycles: 1094 } entries { instruction { opcode: "divide" shape { element_type: U8 } } - clock_cycles: 306 + clock_cycles: 420 } entries { instruction { opcode: "power" shape { element_type: U8 } } - clock_cycles: 388 + clock_cycles: 417 } entries { instruction { opcode: "divide" shape { element_type: U16 } } - clock_cycles: 302 + clock_cycles: 391 } entries { instruction { opcode: "power" shape { element_type: U16 } } - clock_cycles: 399 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 115 + clock_cycles: 454 } entries { instruction { opcode: "divide" shape { element_type: U64 } } - clock_cycles: 838 + clock_cycles: 908 } entries { instruction { opcode: "power" shape { element_type: U64 } } - clock_cycles: 604 + clock_cycles: 744 } entries { instruction { opcode: "cbrt" shape { element_type: F16 } } - clock_cycles: 925 + clock_cycles: 1195 } entries { instruction { - opcode: "cosine" + opcode: "log" shape { element_type: F16 } } - clock_cycles: 691 + clock_cycles: 321 } entries { instruction { - opcode: "exponential" + opcode: "log-plus-one" shape { element_type: F16 } } - clock_cycles: 108 + clock_cycles: 346 } entries { instruction { - opcode: "exponential-minus-one" + opcode: "sqrt" shape { element_type: F16 } } - clock_cycles: 396 + clock_cycles: 124 } entries { instruction { - opcode: "log" + opcode: "tanh" shape { element_type: F16 } } - clock_cycles: 266 + clock_cycles: 499 } entries { instruction { - opcode: "log-plus-one" - shape { element_type: F16 } + opcode: "log" + shape { element_type: F32 } } - clock_cycles: 284 + clock_cycles: 259 } entries { instruction { - opcode: "logistic" - shape { element_type: F16 } + opcode: "tanh" + shape { element_type: F32 } } - clock_cycles: 226 + clock_cycles: 504 } entries { instruction { - opcode: "rsqrt" - shape { element_type: F16 } + opcode: "power" + shape { element_type: F32 } } - clock_cycles: 97 + clock_cycles: 1221 } entries { instruction { - opcode: "sqrt" - shape { element_type: F16 } + opcode: "cbrt" + shape { element_type: F64 } } - clock_cycles: 97 + clock_cycles: 1638 } entries { instruction { - opcode: "tanh" - shape { element_type: F16 } + opcode: "exponential-minus-one" + shape { element_type: F64 } } - clock_cycles: 212 + clock_cycles: 572 } entries { instruction { - opcode: "atan2" - shape { element_type: F16 } + opcode: "log" + shape { element_type: F64 } } - clock_cycles: 482 + clock_cycles: 699 } entries { instruction { - opcode: "power" - shape { element_type: F16 } + opcode: "log-plus-one" + shape { element_type: F64 } } - clock_cycles: 975 + clock_cycles: 1223 } entries { instruction { - opcode: "cbrt" - shape { element_type: F32 } + opcode: "rsqrt" + shape { element_type: F64 } } - clock_cycles: 867 + clock_cycles: 329 } entries { instruction { - opcode: "cosine" - shape { element_type: F32 } + opcode: "sine" + shape { element_type: F64 } } - clock_cycles: 662 + clock_cycles: 597 } entries { instruction { - opcode: "exponential" - shape { element_type: F32 } + opcode: "sqrt" + shape { element_type: F64 } } - clock_cycles: 86 + clock_cycles: 397 } entries { instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 381 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 244 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } + opcode: "tanh" + shape { element_type: F64 } } - clock_cycles: 262 + clock_cycles: 733 } entries { instruction { - opcode: "logistic" - shape { element_type: F32 } + opcode: "atan2" + shape { element_type: F64 } } - clock_cycles: 176 + clock_cycles: 1080 } entries { instruction { - opcode: "rsqrt" - shape { element_type: F32 } + opcode: "divide" + shape { element_type: F64 } } - clock_cycles: 75 + clock_cycles: 831 } entries { instruction { - opcode: "sine" - shape { element_type: F32 } + opcode: "power" + shape { element_type: F64 } } - clock_cycles: 662 + clock_cycles: 1861 } entries { instruction { - opcode: "sqrt" - shape { element_type: F32 } + opcode: "log" + shape { element_type: C64 } } - clock_cycles: 75 + clock_cycles: 1037 } entries { instruction { - opcode: "tanh" - shape { element_type: F32 } + opcode: "log-plus-one" + shape { element_type: C64 } } - clock_cycles: 190 + clock_cycles: 1029 } entries { instruction { opcode: "atan2" - shape { element_type: F32 } + shape { element_type: C64 } } - clock_cycles: 486 + clock_cycles: 6618 } entries { instruction { opcode: "power" - shape { element_type: F32 } + shape { element_type: C64 } } - clock_cycles: 925 + clock_cycles: 4131 } entries { instruction { - opcode: "cbrt" - shape { element_type: F64 } + opcode: "cosine" + shape { element_type: C128 } } - clock_cycles: 6339 + clock_cycles: 2309 } entries { instruction { - opcode: "cosine" - shape { element_type: F64 } + opcode: "log" + shape { element_type: C128 } } - clock_cycles: 1717 + clock_cycles: 2371 } entries { instruction { - opcode: "exponential" - shape { element_type: F64 } + opcode: "log-plus-one" + shape { element_type: C128 } } - clock_cycles: 1652 + clock_cycles: 2405 } entries { instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } + opcode: "rsqrt" + shape { element_type: C128 } } - clock_cycles: 1900 + clock_cycles: 3945 } entries { instruction { - opcode: "log" - shape { element_type: F64 } + opcode: "sine" + shape { element_type: C128 } } - clock_cycles: 608 + clock_cycles: 2284 } entries { instruction { - opcode: "log-plus-one" - shape { element_type: F64 } + opcode: "sqrt" + shape { element_type: C128 } } - clock_cycles: 2073 + clock_cycles: 5304 } entries { instruction { - opcode: "logistic" - shape { element_type: F64 } + opcode: "tanh" + shape { element_type: C128 } } - clock_cycles: 2412 + clock_cycles: 3618 } entries { instruction { - opcode: "rsqrt" - shape { element_type: F64 } + opcode: "atan2" + shape { element_type: C128 } } - clock_cycles: 698 + clock_cycles: 13564 } entries { instruction { - opcode: "sine" - shape { element_type: F64 } + opcode: "divide" + shape { element_type: C128 } } - clock_cycles: 1789 + clock_cycles: 3037 } entries { instruction { - opcode: "sqrt" - shape { element_type: F64 } + opcode: "power" + shape { element_type: C128 } } - clock_cycles: 986 + clock_cycles: 6054 } + } + })pb" + R"pb( + entries { key: "sm_70" # "Tesla V100-SXM2-16GB" + + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 345 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 287 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 113 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 128 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 241 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 232 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 449 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 73 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 709 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 205 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 180 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 269 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 21 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 673 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 624 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 358 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 410 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 318 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 263 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 618 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 324 + })pb" + R"pb( entries { instruction { opcode: "tanh" shape { element_type: F64 } } - clock_cycles: 1609 + clock_cycles: 406 } entries { instruction { opcode: "add" shape { element_type: F64 } } - clock_cycles: 97 + clock_cycles: 15 } entries { instruction { opcode: "atan2" shape { element_type: F64 } } - clock_cycles: 3747 + clock_cycles: 973 } entries { instruction { opcode: "divide" shape { element_type: F64 } } - clock_cycles: 2016 + clock_cycles: 501 } entries { instruction { opcode: "multiply" shape { element_type: F64 } } - clock_cycles: 97 + clock_cycles: 15 } entries { instruction { opcode: "power" shape { element_type: F64 } } - clock_cycles: 5511 + clock_cycles: 2099 } entries { instruction { opcode: "subtract" shape { element_type: F64 } } - clock_cycles: 97 + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 780 } entries { instruction { opcode: "exponential" shape { element_type: C64 } } - clock_cycles: 1360 + clock_cycles: 722 } entries { instruction { opcode: "exponential-minus-one" shape { element_type: C64 } } - clock_cycles: 1400 + clock_cycles: 703 } entries { instruction { opcode: "log" shape { element_type: C64 } } - clock_cycles: 950 + clock_cycles: 758 } entries { instruction { opcode: "log-plus-one" shape { element_type: C64 } } - clock_cycles: 842 + clock_cycles: 654 } entries { instruction { opcode: "rsqrt" shape { element_type: C64 } } - clock_cycles: 2383 + clock_cycles: 3261 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 789 } entries { instruction { opcode: "sqrt" shape { element_type: C64 } } - clock_cycles: 3193 + clock_cycles: 6282 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 1924 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 12 } entries { instruction { opcode: "atan2" shape { element_type: C64 } } - clock_cycles: 5353 + clock_cycles: 8151 } entries { instruction { opcode: "divide" shape { element_type: C64 } } - clock_cycles: 687 + clock_cycles: 480 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 42 } entries { instruction { opcode: "power" shape { element_type: C64 } } - clock_cycles: 3351 + clock_cycles: 8105 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 12 } entries { instruction { opcode: "cosine" shape { element_type: C128 } } - clock_cycles: 6613 + clock_cycles: 1808 } entries { instruction { opcode: "exponential" shape { element_type: C128 } } - clock_cycles: 4028 + clock_cycles: 1487 } entries { instruction { opcode: "exponential-minus-one" shape { element_type: C128 } } - clock_cycles: 4161 + clock_cycles: 1334 } entries { instruction { opcode: "log" shape { element_type: C128 } } - clock_cycles: 7599 + clock_cycles: 1805 } entries { instruction { opcode: "log-plus-one" shape { element_type: C128 } } - clock_cycles: 6962 + clock_cycles: 1618 } entries { instruction { opcode: "rsqrt" shape { element_type: C128 } } - clock_cycles: 11318 + clock_cycles: 7261 } entries { instruction { opcode: "sine" shape { element_type: C128 } } - clock_cycles: 5878 + clock_cycles: 2013 } entries { instruction { opcode: "sqrt" shape { element_type: C128 } } - clock_cycles: 15606 + clock_cycles: 8237 } entries { instruction { opcode: "tanh" shape { element_type: C128 } } - clock_cycles: 9939 + clock_cycles: 6343 } entries { instruction { opcode: "add" shape { element_type: C128 } } - clock_cycles: 97 + clock_cycles: 15 } entries { instruction { opcode: "atan2" shape { element_type: C128 } } - clock_cycles: 39027 + clock_cycles: 15355 } entries { instruction { opcode: "divide" shape { element_type: C128 } } - clock_cycles: 7941 + clock_cycles: 2423 } entries { instruction { opcode: "multiply" shape { element_type: C128 } } - clock_cycles: 270 + clock_cycles: 45 } entries { instruction { opcode: "power" shape { element_type: C128 } } - clock_cycles: 18205 + clock_cycles: 9810 } entries { instruction { opcode: "subtract" shape { element_type: C128 } } - clock_cycles: 97 + clock_cycles: 15 } } - })pb" + })pb" + R"pb( + entries { key: "sm_60" # "Tesla P100-SXM2-16GB" + + value { entries { + instruction { + opcode: "add" + shape { element_type: S8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 444 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1018 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 82 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 1569 + } + entries { + instruction { + opcode: "add" + shape { element_type: U8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 299 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "add" + shape { element_type: U16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 307 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: U32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 888 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 1548 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 233 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 532 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 142 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 325 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 497 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 458 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 675 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 68 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1012 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 494 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 109 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 337 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 328 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 663 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 35 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 988 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 645 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1427 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 405 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 544 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 441 + })pb" R"pb( - entries { - key: "sm_80" # "NVIDIA A100-SXM4-40GB" - value { entries { instruction { - opcode: "divide" - shape { element_type: S8 } + opcode: "log-plus-one" + shape { element_type: F64 } } - clock_cycles: 417 + clock_cycles: 784 } entries { instruction { - opcode: "divide" - shape { element_type: S16 } + opcode: "rsqrt" + shape { element_type: F64 } } - clock_cycles: 468 + clock_cycles: 355 } entries { instruction { - opcode: "divide" - shape { element_type: S64 } + opcode: "sine" + shape { element_type: F64 } } - clock_cycles: 1094 + clock_cycles: 1640 } entries { instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } + opcode: "sqrt" + shape { element_type: F64 } } clock_cycles: 417 } entries { instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 391 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 454 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 908 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 744 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } + opcode: "tanh" + shape { element_type: F64 } } - clock_cycles: 1195 + clock_cycles: 473 } entries { instruction { - opcode: "log" - shape { element_type: F16 } + opcode: "add" + shape { element_type: F64 } } - clock_cycles: 321 + clock_cycles: 14 } entries { instruction { - opcode: "log-plus-one" - shape { element_type: F16 } + opcode: "atan2" + shape { element_type: F64 } } - clock_cycles: 346 + clock_cycles: 1169 } entries { instruction { - opcode: "sqrt" - shape { element_type: F16 } + opcode: "divide" + shape { element_type: F64 } } - clock_cycles: 124 + clock_cycles: 565 } entries { instruction { - opcode: "tanh" - shape { element_type: F16 } + opcode: "multiply" + shape { element_type: F64 } } - clock_cycles: 499 + clock_cycles: 14 } entries { instruction { - opcode: "log" - shape { element_type: F32 } + opcode: "power" + shape { element_type: F64 } } - clock_cycles: 259 + clock_cycles: 2682 } entries { instruction { - opcode: "tanh" - shape { element_type: F32 } + opcode: "subtract" + shape { element_type: F64 } } - clock_cycles: 504 + clock_cycles: 14 } entries { instruction { - opcode: "power" - shape { element_type: F32 } + opcode: "cosine" + shape { element_type: C64 } } - clock_cycles: 1221 + clock_cycles: 1128 } entries { instruction { - opcode: "cbrt" - shape { element_type: F64 } + opcode: "exponential" + shape { element_type: C64 } } - clock_cycles: 1638 + clock_cycles: 1021 } entries { instruction { opcode: "exponential-minus-one" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 572 + clock_cycles: 991 } entries { instruction { opcode: "log" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 699 + clock_cycles: 1107 } entries { instruction { opcode: "log-plus-one" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 1223 + clock_cycles: 994 } entries { instruction { opcode: "rsqrt" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 329 + clock_cycles: 2158 } entries { instruction { opcode: "sine" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 597 + clock_cycles: 1139 } entries { instruction { opcode: "sqrt" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 397 + clock_cycles: 2934 } entries { instruction { opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 733 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } + shape { element_type: C64 } } - clock_cycles: 1080 + clock_cycles: 1883 } entries { instruction { - opcode: "divide" - shape { element_type: F64 } + opcode: "add" + shape { element_type: C64 } } - clock_cycles: 831 + clock_cycles: 20 } entries { instruction { - opcode: "power" - shape { element_type: F64 } + opcode: "atan2" + shape { element_type: C64 } } - clock_cycles: 1861 + clock_cycles: 16282 } entries { instruction { - opcode: "log" + opcode: "divide" shape { element_type: C64 } } - clock_cycles: 1037 + clock_cycles: 760 } entries { instruction { - opcode: "log-plus-one" + opcode: "multiply" shape { element_type: C64 } } - clock_cycles: 1029 + clock_cycles: 65 } entries { instruction { - opcode: "atan2" + opcode: "power" shape { element_type: C64 } } - clock_cycles: 6618 + clock_cycles: 8335 } entries { instruction { - opcode: "power" + opcode: "subtract" shape { element_type: C64 } } - clock_cycles: 4131 + clock_cycles: 20 } entries { instruction { opcode: "cosine" shape { element_type: C128 } } - clock_cycles: 2309 + clock_cycles: 4302 } entries { instruction { - opcode: "log" + opcode: "exponential" shape { element_type: C128 } } - clock_cycles: 2371 + clock_cycles: 3665 } entries { instruction { - opcode: "log-plus-one" + opcode: "exponential-minus-one" shape { element_type: C128 } } - clock_cycles: 2405 + clock_cycles: 3656 } entries { instruction { - opcode: "rsqrt" + opcode: "log" shape { element_type: C128 } } - clock_cycles: 3945 + clock_cycles: 2057 } entries { instruction { - opcode: "sine" + opcode: "log-plus-one" shape { element_type: C128 } } - clock_cycles: 2284 + clock_cycles: 1806 } entries { instruction { - opcode: "sqrt" + opcode: "rsqrt" shape { element_type: C128 } } - clock_cycles: 5304 + clock_cycles: 6135 } entries { instruction { - opcode: "tanh" + opcode: "sine" shape { element_type: C128 } } - clock_cycles: 3618 + clock_cycles: 4169 } entries { instruction { - opcode: "atan2" + opcode: "sqrt" shape { element_type: C128 } } - clock_cycles: 13564 + clock_cycles: 8595 } entries { instruction { - opcode: "divide" + opcode: "tanh" shape { element_type: C128 } } - clock_cycles: 3037 + clock_cycles: 5294 } entries { instruction { - opcode: "power" + opcode: "add" shape { element_type: C128 } } - clock_cycles: 6054 - } - } - })pb" - R"pb( - entries { - key: "sm_70" # "Tesla V100-SXM2-16GB" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 345 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 9 + clock_cycles: 20 } entries { instruction { - opcode: "power" - shape { element_type: S16 } + opcode: "atan2" + shape { element_type: C128 } } - clock_cycles: 183 + clock_cycles: 22278 } entries { instruction { opcode: "divide" - shape { element_type: S32 } + shape { element_type: C128 } } - clock_cycles: 287 + clock_cycles: 3194 } entries { instruction { opcode: "multiply" - shape { element_type: S32 } + shape { element_type: C128 } } - clock_cycles: 3 + clock_cycles: 65 } entries { instruction { opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } + shape { element_type: C128 } } - clock_cycles: 685 + clock_cycles: 17893 } entries { instruction { - opcode: "multiply" - shape { element_type: S64 } + opcode: "subtract" + shape { element_type: C128 } } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 113 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 128 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 241 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 232 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 449 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 73 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 709 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 205 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 180 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 269 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 21 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 673 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 624 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 358 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 410 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 318 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 263 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 618 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 324 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 973 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 501 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2099 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 780 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 722 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 703 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 758 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 654 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 6282 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1924 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 8151 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 480 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 42 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8105 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1808 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1487 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1334 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1618 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 7261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 2013 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8237 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 6343 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 15355 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2423 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 45 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 9810 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { - key: "sm_60" # "Tesla P100-SXM2-16GB" - value { - entries { - instruction { - opcode: "add" - shape { element_type: S8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 444 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1018 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 82 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 1569 - } - entries { - instruction { - opcode: "add" - shape { element_type: U8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 299 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "add" - shape { element_type: U16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 307 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: U32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 888 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 1548 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 233 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 532 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 142 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 325 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 497 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 458 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 675 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 68 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1012 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 494 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 109 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 337 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 328 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 663 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 35 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 988 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 645 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1427 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 405 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 544 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 441 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 784 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 355 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1640 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1169 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 565 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2682 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1128 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1021 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 991 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 1107 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 994 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2158 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1139 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2934 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1883 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 16282 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 760 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8335 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 4302 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 3665 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 3656 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2057 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1806 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 6135 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 4169 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8595 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 5294 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 22278 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 3194 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 17893 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 20 - } - } - })pb" - R"pb( - entries { - key: "sm_75" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 360 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 357 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 339 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 296 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 979 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 495 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 334 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 290 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 118 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 812 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 515 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 792 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 132 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 342 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 794 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 123 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 175 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1120 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 783 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 83 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 319 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 218 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 181 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 717 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 167 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1085 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6494 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1800 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1630 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1929 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 596 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1774 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2430 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 705 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 984 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1535 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3744 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 1915 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5538 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1702 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1503 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1474 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 835 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2232 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1632 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2989 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2263 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 4847 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3219 + clock_cycles: 20 } + } + })pb" + R"pb( + entries { key: "sm_75" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 360 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 357 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 339 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 296 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 979 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 495 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 334 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 290 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 118 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 812 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 515 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 792 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 132 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 342 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 794 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 123 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 175 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1120 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 783 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 83 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 319 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 218 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 181 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 717 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 167 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1085 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6494 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1800 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1630 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1929 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 596 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1774 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2430 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 705 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 984 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1535 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3744 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 1915 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5538 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1702 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1503 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1474 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 835 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2232 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1632 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2989 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2263 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 4847 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3219 + })pb" + R"pb( entries { instruction { opcode: "cosine" @@ -3712,7 +3712,7 @@ constexpr char kDeviceHloOpProfiles[] = clock_cycles: 97 } } - } + } )pb"; } // namespace gpu From 091793fc2e8b05d06ee5272a1287b285356921b6 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:54:16 -0700 Subject: [PATCH 21/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 7446 +++++++++--------- 1 file changed, 3725 insertions(+), 3721 deletions(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 92d5979c0f530..1131831933c9b 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -1,3721 +1,3725 @@ -/* Copyright 2023 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ -#define XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ - -namespace xla { -namespace gpu { - -// The data below is obtained with -// xla/service/gpu/model:hlo_op_profiler_run - -constexpr char kDeviceHloOpProfiles[] = - R"pb( - entries { key: "sm_90" # "NVIDIA H100 80GB HBM3" - - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 356 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 297 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 300 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 304 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 629 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 997 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 102 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 182 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 245 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 993 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 502 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 451 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 43 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 178 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 978 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 166 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 229 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 958 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 467 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 19 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 510 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 586 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 558 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 712 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1259 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 277 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 554 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 332 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 930 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 2415 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 641 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 2055 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 756 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3148 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 2324 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 4344 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2379 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 7 - })pb" - R"pb( - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 6462 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 498 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 5532 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1750 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1342 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1275 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2455 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 2403 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 5500 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 1999 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 6636 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 4613 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 13131 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2280 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 39 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 8363 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { - key: "sm_86" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 370 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 392 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 367 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 918 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 601 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 388 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 302 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 399 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 115 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 838 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 604 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 691 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 108 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 212 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 482 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 975 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 867 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 86 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 381 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 244 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 176 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 486 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6339 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1717 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1652 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1900 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 608 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 2073 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2412 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 698 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 986 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1609 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3747 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 2016 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5511 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1360 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1400 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 950 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 842 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2383 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 3193 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 5353 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 687 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3351 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 6613 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 4028 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 4161 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 7599 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 6962 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 11318 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 5878 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 15606 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 9939 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 39027 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 7941 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 270 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 18205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 97 - } - } - })pb" - R"pb( - entries { - key: "sm_80" # "NVIDIA A100-SXM4-40GB" - - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 468 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1094 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 391 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 454 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 908 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 744 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 1195 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 321 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 346 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 124 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 499 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 259 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 504 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1221 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 1638 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 572 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 699 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1223 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 329 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 597 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 397 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 733 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1080 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 831 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 1861 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 1037 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 1029 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 6618 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 4131 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 2309 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2371 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 2405 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 3945 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 2284 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 5304 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 3618 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 13564 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 3037 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 6054 - } - } - })pb" - R"pb( - entries { key: "sm_70" # "Tesla V100-SXM2-16GB" - - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 345 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 287 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 113 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 128 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 241 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 232 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 449 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 73 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 709 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 205 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 180 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 269 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 21 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 673 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 624 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 358 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 410 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 318 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 263 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 618 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 324 - })pb" - R"pb( - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 973 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 501 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2099 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 780 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 722 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 703 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 758 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 654 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 6282 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1924 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 8151 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 480 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 42 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8105 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1808 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1487 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1334 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1618 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 7261 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 2013 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8237 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 6343 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 15355 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2423 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 45 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 9810 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { key: "sm_60" # "Tesla P100-SXM2-16GB" - - value { entries { - instruction { - opcode: "add" - shape { element_type: S8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 444 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1018 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 82 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 1569 - } - entries { - instruction { - opcode: "add" - shape { element_type: U8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 299 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "add" - shape { element_type: U16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 307 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: U32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 888 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 1548 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 233 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 532 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 142 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 325 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 497 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 458 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 675 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 68 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1012 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 494 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 109 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 337 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 328 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 663 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 35 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 988 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 645 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1427 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 405 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 544 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 441 - })pb" - R"pb( - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 784 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 355 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1640 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1169 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 565 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2682 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1128 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1021 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 991 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 1107 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 994 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2158 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1139 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2934 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 1883 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 16282 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 760 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C64 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 8335 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 4302 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 3665 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 3656 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2057 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 1806 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 6135 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 4169 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 8595 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 5294 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 20 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 22278 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 3194 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 65 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 17893 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 20 - } - } - })pb" - R"pb( - entries { key: "sm_75" - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 360 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 357 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 339 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 296 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 979 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 495 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 334 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 290 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 118 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 812 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 515 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 792 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 132 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 342 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 794 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 123 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 175 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1120 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 783 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 83 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 319 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 218 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 181 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 717 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 167 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1085 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6494 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1800 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1630 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1929 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 596 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1774 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2430 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 705 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 984 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1535 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3744 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 1915 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5538 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1702 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1503 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1474 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 835 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2232 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1632 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2989 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2263 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 4847 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3219 - })pb" - R"pb( - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 6474 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 4962 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 4037 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 7286 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 6848 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 10748 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 5391 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 15981 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 9653 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 38206 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 8040 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 273 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 18550 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 97 - } - } - } - )pb"; - -} // namespace gpu -} // namespace xla - -#endif // XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ +/* Copyright 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ +#define XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ + +namespace xla { +namespace gpu { + +// The data below is obtained with +// xla/service/gpu/model:hlo_op_profiler_run + +constexpr char kDeviceHloOpProfiles[] = + R"pb( + entries { key: "sm_90" # "NVIDIA H100 80GB HBM3" + + + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 356 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 297 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 300 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 304 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 629 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 997 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 102 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 182 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 245 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 993 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 502 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 451 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 43 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 178 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 978 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 166 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 229 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 958 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 467 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 19 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 510 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 586 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 558 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 712 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1259 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 277 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 554 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 332 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 930 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 2415 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 641 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 2055 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 756 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3148 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 2324 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 4344 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2379 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 7 + })pb" + R"pb( + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6462 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 498 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 5532 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 1750 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 1342 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 1275 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2455 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 2403 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 5500 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 1999 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 6636 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 4613 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13131 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 2280 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 39 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 8363 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 15 + } + } + })pb" + R"pb( + entries { + key: "sm_86" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 370 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 392 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 367 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 918 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 601 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 388 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 302 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 399 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 115 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 838 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 604 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 691 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 108 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 212 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 482 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 975 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 867 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 86 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 381 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 244 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 176 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 486 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6339 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1717 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1652 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1900 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 608 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 2073 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2412 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 698 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1789 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 986 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1609 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3747 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 2016 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5511 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1360 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1400 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 950 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 842 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2383 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 3193 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 5353 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 687 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3351 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 6613 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 4028 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 4161 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 7599 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 6962 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 11318 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 5878 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 15606 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 9939 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 39027 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 7941 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 270 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 18205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 97 + } + } + })pb" + R"pb( + entries { + key: "sm_80" # "NVIDIA A100-SXM4-40GB" + + + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 468 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1094 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 391 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 454 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 908 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 744 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 1195 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 321 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 346 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 124 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 499 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 259 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 504 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1221 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 1638 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 572 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 699 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1223 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 329 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 597 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 397 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 733 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 1080 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 831 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 1861 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 1037 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 1029 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6618 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 4131 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 2309 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2371 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 2405 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 3945 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 2284 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 5304 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 3618 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13564 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 3037 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 6054 + } + } + })pb" + R"pb( + entries { key: "sm_70" # "Tesla V100-SXM2-16GB" + + + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 345 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 287 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 113 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 128 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 241 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 232 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 449 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 73 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 709 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 205 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 180 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 269 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 21 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 673 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 624 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 358 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 410 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 318 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 263 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 618 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 324 + })pb" + R"pb( + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 973 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 501 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2099 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 780 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 722 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 703 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 758 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 654 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3261 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 789 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 6282 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 1924 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 8151 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 480 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 42 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 8105 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 1808 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 1487 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 1334 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 1618 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 7261 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 2013 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 8237 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 6343 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 15355 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 2423 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 45 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 9810 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 15 + } + } + })pb" + R"pb( + entries { key: "sm_60" # "Tesla P100-SXM2-16GB" + + + value { entries { + instruction { + opcode: "add" + shape { element_type: S8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 444 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1018 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 82 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 1569 + } + entries { + instruction { + opcode: "add" + shape { element_type: U8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 299 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "add" + shape { element_type: U16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 307 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: U32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 888 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 1548 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 233 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 532 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 142 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 325 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 497 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 458 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 675 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 68 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1012 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 494 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 109 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 337 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 328 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 663 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 35 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 988 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 645 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1427 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 405 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 544 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 441 + })pb" + R"pb( + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 784 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 355 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1640 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 1169 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 565 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2682 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1128 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1021 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 991 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 1107 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 994 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2158 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1139 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2934 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 1883 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 16282 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 760 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C64 } + } + clock_cycles: 65 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 8335 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C64 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 4302 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 3665 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 3656 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2057 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 1806 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 6135 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 4169 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 8595 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 5294 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 20 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 22278 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 3194 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 65 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 17893 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 20 + } + } + })pb" + R"pb( + entries { key: "sm_75" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 360 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 357 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 339 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 296 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 979 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 495 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 334 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 290 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 118 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 812 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 515 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 792 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 132 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 342 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 794 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 123 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 175 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1120 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 783 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 83 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 319 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 218 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 181 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 717 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 167 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1085 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6494 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1800 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1630 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1929 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 596 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1774 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2430 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 705 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 984 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1535 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3744 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 1915 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5538 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1702 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1503 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1474 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 835 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2232 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1632 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2989 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2263 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 4847 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3219 + })pb" + R"pb( + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 6474 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 4962 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 4037 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 7286 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 6848 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 10748 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 5391 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 15981 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 9653 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 38206 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 8040 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 273 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 18550 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 97 + } + } + } + )pb"; + +} // namespace gpu +} // namespace xla + +#endif // XLA_SERVICE_GPU_MODEL_HLO_OP_PROFILES_DATA_H_ From 60371493299b680b147798198fdf384eb86ee952 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 13:55:12 -0700 Subject: [PATCH 22/25] . --- xla/service/gpu/model/hlo_op_profiles_data.h | 5854 +++++++++--------- 1 file changed, 2925 insertions(+), 2929 deletions(-) diff --git a/xla/service/gpu/model/hlo_op_profiles_data.h b/xla/service/gpu/model/hlo_op_profiles_data.h index 1131831933c9b..7f9ffed616863 100644 --- a/xla/service/gpu/model/hlo_op_profiles_data.h +++ b/xla/service/gpu/model/hlo_op_profiles_data.h @@ -24,583 +24,582 @@ namespace gpu { constexpr char kDeviceHloOpProfiles[] = R"pb( - entries { key: "sm_90" # "NVIDIA H100 80GB HBM3" + entries { key: "sm_90" # "NVIDIA H100 80GB HBM3" - - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 356 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 297 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 300 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 304 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 629 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 253 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 997 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 102 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 182 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 245 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 993 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 502 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 451 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 43 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 178 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 978 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 166 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 229 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 958 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 467 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 19 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 510 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 586 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 558 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 712 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1259 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 277 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 554 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 332 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 431 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 930 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 526 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 2205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 2415 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 641 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 2055 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 756 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 3148 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 2324 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 4344 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2379 - } - entries { - instruction { - opcode: "add" - shape { element_type: C64 } - } - clock_cycles: 7 - })pb" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 356 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 297 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 300 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 304 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 629 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 253 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 997 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 102 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 182 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 245 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 993 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 502 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 451 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 43 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 178 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 978 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 166 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 229 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 958 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 467 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 19 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 510 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 7 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 586 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 558 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 712 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1259 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 277 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 554 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 332 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 431 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 930 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 526 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 2205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 15 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 2415 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 641 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 2055 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 756 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 3148 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 2324 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 4344 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2379 + } + entries { + instruction { + opcode: "add" + shape { element_type: C64 } + } + clock_cycles: 7 + })pb" R"pb( entries { instruction { @@ -625,1466 +624,1464 @@ constexpr char kDeviceHloOpProfiles[] = } entries { instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 5532 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C64 } - } - clock_cycles: 7 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 1750 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 1342 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 1275 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 2455 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 2403 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 5500 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 1999 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 6636 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 4613 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 15 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 13131 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 2280 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 39 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 8363 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 15 - } - } - })pb" - R"pb( - entries { - key: "sm_86" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 370 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 392 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 367 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 918 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 601 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 306 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 388 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 302 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 399 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 115 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 838 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 604 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 691 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 108 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 396 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 212 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 482 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 975 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 867 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 86 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 381 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 244 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 176 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 662 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 75 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 190 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 486 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 925 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6339 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1717 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1652 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1900 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 608 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 2073 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2412 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 698 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1789 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 986 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1609 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3747 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 2016 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5511 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1360 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1400 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 950 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 842 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2383 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 3193 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 5353 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C64 } - } - clock_cycles: 687 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3351 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C128 } - } - clock_cycles: 6613 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C128 } - } - clock_cycles: 4028 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C128 } - } - clock_cycles: 4161 - } - entries { - instruction { - opcode: "log" - shape { element_type: C128 } - } - clock_cycles: 7599 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C128 } - } - clock_cycles: 6962 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C128 } - } - clock_cycles: 11318 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C128 } - } - clock_cycles: 5878 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C128 } - } - clock_cycles: 15606 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C128 } - } - clock_cycles: 9939 - } - entries { - instruction { - opcode: "add" - shape { element_type: C128 } - } - clock_cycles: 97 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C128 } - } - clock_cycles: 39027 - } - entries { - instruction { - opcode: "divide" - shape { element_type: C128 } - } - clock_cycles: 7941 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: C128 } - } - clock_cycles: 270 - } - entries { - instruction { - opcode: "power" - shape { element_type: C128 } - } - clock_cycles: 18205 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: C128 } - } - clock_cycles: 97 - } - } - })pb" - R"pb( - entries { - key: "sm_80" # "NVIDIA A100-SXM4-40GB" - - - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 468 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1094 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 391 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 454 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 908 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 744 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 1195 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 321 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 346 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 124 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 499 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 259 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 504 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1221 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 1638 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 572 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 699 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1223 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 329 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 597 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 397 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 733 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 1080 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 831 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 1861 - } - entries { - instruction { - opcode: "log" + opcode: "power" shape { element_type: C64 } } - clock_cycles: 1037 + clock_cycles: 5532 } entries { instruction { - opcode: "log-plus-one" + opcode: "subtract" shape { element_type: C64 } } - clock_cycles: 1029 + clock_cycles: 7 } entries { instruction { - opcode: "atan2" - shape { element_type: C64 } + opcode: "cosine" + shape { element_type: C128 } } - clock_cycles: 6618 + clock_cycles: 1750 } entries { instruction { - opcode: "power" - shape { element_type: C64 } + opcode: "exponential" + shape { element_type: C128 } } - clock_cycles: 4131 + clock_cycles: 1342 } entries { instruction { - opcode: "cosine" + opcode: "exponential-minus-one" shape { element_type: C128 } } - clock_cycles: 2309 + clock_cycles: 1275 } entries { instruction { opcode: "log" shape { element_type: C128 } } - clock_cycles: 2371 + clock_cycles: 2455 } entries { instruction { opcode: "log-plus-one" shape { element_type: C128 } } - clock_cycles: 2405 + clock_cycles: 2403 } entries { instruction { opcode: "rsqrt" shape { element_type: C128 } } - clock_cycles: 3945 + clock_cycles: 5500 } entries { instruction { opcode: "sine" shape { element_type: C128 } } - clock_cycles: 2284 + clock_cycles: 1999 } entries { instruction { opcode: "sqrt" shape { element_type: C128 } } - clock_cycles: 5304 + clock_cycles: 6636 } entries { instruction { opcode: "tanh" shape { element_type: C128 } } - clock_cycles: 3618 + clock_cycles: 4613 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 15 } entries { instruction { opcode: "atan2" shape { element_type: C128 } } - clock_cycles: 13564 + clock_cycles: 13131 } entries { instruction { opcode: "divide" shape { element_type: C128 } } - clock_cycles: 3037 + clock_cycles: 2280 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 39 } entries { instruction { opcode: "power" shape { element_type: C128 } } - clock_cycles: 6054 + clock_cycles: 8363 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 15 } } - })pb" + })pb" + R"pb( + entries { + key: "sm_86" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 370 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 392 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 367 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 918 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 601 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 306 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 388 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 302 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 399 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 115 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 838 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 604 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 691 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 108 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 396 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 212 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 482 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 975 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 867 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 86 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 381 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 244 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 176 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 662 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 75 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 190 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 486 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 925 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6339 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1717 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1652 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1900 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 608 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 2073 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2412 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 698 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1789 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 986 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1609 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3747 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 2016 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5511 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1360 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1400 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 950 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 842 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2383 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 3193 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 5353 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C64 } + } + clock_cycles: 687 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3351 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 6613 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C128 } + } + clock_cycles: 4028 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C128 } + } + clock_cycles: 4161 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 7599 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 6962 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 11318 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 5878 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 15606 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 9939 + } + entries { + instruction { + opcode: "add" + shape { element_type: C128 } + } + clock_cycles: 97 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 39027 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 7941 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: C128 } + } + clock_cycles: 270 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 18205 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: C128 } + } + clock_cycles: 97 + } + } + })pb" R"pb( - entries { key: "sm_70" # "Tesla V100-SXM2-16GB" + entries { + key: "sm_80" # "NVIDIA A100-SXM4-40GB" + value { + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 468 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1094 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 391 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 454 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 908 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 744 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 1195 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 321 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 346 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 124 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 499 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 259 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 504 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1221 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 1638 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 572 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 699 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1223 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 329 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 597 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 397 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 733 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 1080 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 831 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 1861 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 1037 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 1029 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 6618 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 4131 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C128 } + } + clock_cycles: 2309 + } + entries { + instruction { + opcode: "log" + shape { element_type: C128 } + } + clock_cycles: 2371 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C128 } + } + clock_cycles: 2405 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C128 } + } + clock_cycles: 3945 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C128 } + } + clock_cycles: 2284 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C128 } + } + clock_cycles: 5304 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C128 } + } + clock_cycles: 3618 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C128 } + } + clock_cycles: 13564 + } + entries { + instruction { + opcode: "divide" + shape { element_type: C128 } + } + clock_cycles: 3037 + } + entries { + instruction { + opcode: "power" + shape { element_type: C128 } + } + clock_cycles: 6054 + } + } + })pb" + R"pb( + entries { key: "sm_70" # "Tesla V100-SXM2-16GB" - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 345 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 287 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 685 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 183 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 113 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 104 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 3 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 12 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 376 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 226 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 128 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 241 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 232 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 266 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 425 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 122 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 449 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 73 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 709 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 9 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 205 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 180 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 217 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 76 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 269 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 406 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 21 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 673 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 6 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 599 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 624 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 358 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 410 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 318 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 633 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 263 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 618 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 324 - })pb" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 345 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 287 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 685 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 183 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 113 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 104 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 3 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 12 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 376 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 226 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 128 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 241 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 232 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 266 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 425 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 122 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 449 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 73 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 709 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 9 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 205 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 180 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 217 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 76 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 269 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 406 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 21 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 673 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 6 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 599 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 624 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 358 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 410 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 318 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 633 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 263 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 618 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 324 + })pb" R"pb( entries { instruction { @@ -2346,494 +2343,493 @@ constexpr char kDeviceHloOpProfiles[] = clock_cycles: 15 } } - })pb" + })pb" R"pb( - entries { key: "sm_60" # "Tesla P100-SXM2-16GB" - + entries { key: "sm_60" # "Tesla P100-SXM2-16GB" - value { entries { - instruction { - opcode: "add" - shape { element_type: S8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: S32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 444 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: S32 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "add" - shape { element_type: S64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1018 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: S64 } - } - clock_cycles: 82 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 1569 - } - entries { - instruction { - opcode: "add" - shape { element_type: U8 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 299 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U8 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "add" - shape { element_type: U16 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 307 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U16 } - } - clock_cycles: 5 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 216 - } - entries { - instruction { - opcode: "add" - shape { element_type: U32 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 189 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U32 } - } - clock_cycles: 14 - } - entries { - instruction { - opcode: "power" - shape { element_type: U32 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "add" - shape { element_type: U64 } - } - clock_cycles: 2 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 888 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: U64 } - } - clock_cycles: 79 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 1548 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 233 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 532 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 142 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 364 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 325 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 373 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 497 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 100 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 458 - } - entries { - instruction { - opcode: "add" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 675 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 68 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1012 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F16 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 213 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 494 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 109 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 337 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 284 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 328 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 473 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 71 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 426 - } - entries { - instruction { - opcode: "add" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 663 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F32 } - } - clock_cycles: 35 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 988 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F32 } - } - clock_cycles: 11 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 645 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1427 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 405 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 544 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 441 - })pb" + value { entries { + instruction { + opcode: "add" + shape { element_type: S8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: S32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 444 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: S32 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "add" + shape { element_type: S64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1018 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: S64 } + } + clock_cycles: 82 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 1569 + } + entries { + instruction { + opcode: "add" + shape { element_type: U8 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 299 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U8 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "add" + shape { element_type: U16 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 307 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U16 } + } + clock_cycles: 5 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 216 + } + entries { + instruction { + opcode: "add" + shape { element_type: U32 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 189 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U32 } + } + clock_cycles: 14 + } + entries { + instruction { + opcode: "power" + shape { element_type: U32 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "add" + shape { element_type: U64 } + } + clock_cycles: 2 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 888 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: U64 } + } + clock_cycles: 79 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 1548 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 233 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 532 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 142 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 364 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 325 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 373 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 497 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 100 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 458 + } + entries { + instruction { + opcode: "add" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 675 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 68 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1012 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F16 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 213 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 494 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 109 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 337 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 284 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 328 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 473 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 71 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 426 + } + entries { + instruction { + opcode: "add" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 663 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F32 } + } + clock_cycles: 35 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 988 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F32 } + } + clock_cycles: 11 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 645 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1427 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 405 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 544 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 441 + })pb" R"pb( entries { instruction { @@ -3123,492 +3119,492 @@ constexpr char kDeviceHloOpProfiles[] = clock_cycles: 20 } } - })pb" + })pb" R"pb( - entries { key: "sm_75" - value { entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 360 - } - entries { - instruction { - opcode: "power" - shape { element_type: S8 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 357 - } - entries { - instruction { - opcode: "power" - shape { element_type: S16 } - } - clock_cycles: 339 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S32 } - } - clock_cycles: 296 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 979 - } - entries { - instruction { - opcode: "power" - shape { element_type: S64 } - } - clock_cycles: 495 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 293 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 334 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 290 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 336 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U32 } - } - clock_cycles: 118 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 812 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 515 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 792 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F16 } - } - clock_cycles: 815 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F16 } - } - clock_cycles: 132 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F16 } - } - clock_cycles: 342 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 239 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F16 } - } - clock_cycles: 262 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F16 } - } - clock_cycles: 126 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F16 } - } - clock_cycles: 794 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 123 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 175 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F16 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F16 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "power" - shape { element_type: F16 } - } - clock_cycles: 1120 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F32 } - } - clock_cycles: 783 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F32 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F32 } - } - clock_cycles: 83 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F32 } - } - clock_cycles: 319 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 201 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F32 } - } - clock_cycles: 218 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F32 } - } - clock_cycles: 181 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F32 } - } - clock_cycles: 717 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F32 } - } - clock_cycles: 74 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 167 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F32 } - } - clock_cycles: 414 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1085 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 6494 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: F64 } - } - clock_cycles: 1800 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: F64 } - } - clock_cycles: 1630 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 1929 - } - entries { - instruction { - opcode: "log" - shape { element_type: F64 } - } - clock_cycles: 596 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F64 } - } - clock_cycles: 1774 - } - entries { - instruction { - opcode: "logistic" - shape { element_type: F64 } - } - clock_cycles: 2430 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: F64 } - } - clock_cycles: 705 - } - entries { - instruction { - opcode: "sine" - shape { element_type: F64 } - } - clock_cycles: 1805 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F64 } - } - clock_cycles: 984 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F64 } - } - clock_cycles: 1535 - } - entries { - instruction { - opcode: "add" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: F64 } - } - clock_cycles: 3744 - } - entries { - instruction { - opcode: "divide" - shape { element_type: F64 } - } - clock_cycles: 1915 - } - entries { - instruction { - opcode: "multiply" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "power" - shape { element_type: F64 } - } - clock_cycles: 5538 - } - entries { - instruction { - opcode: "subtract" - shape { element_type: F64 } - } - clock_cycles: 95 - } - entries { - instruction { - opcode: "cosine" - shape { element_type: C64 } - } - clock_cycles: 1702 - } - entries { - instruction { - opcode: "exponential" - shape { element_type: C64 } - } - clock_cycles: 1503 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: C64 } - } - clock_cycles: 1474 - } - entries { - instruction { - opcode: "log" - shape { element_type: C64 } - } - clock_cycles: 835 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: C64 } - } - clock_cycles: 737 - } - entries { - instruction { - opcode: "rsqrt" - shape { element_type: C64 } - } - clock_cycles: 2232 - } - entries { - instruction { - opcode: "sine" - shape { element_type: C64 } - } - clock_cycles: 1632 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: C64 } - } - clock_cycles: 2989 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: C64 } - } - clock_cycles: 2263 - } - entries { - instruction { - opcode: "atan2" - shape { element_type: C64 } - } - clock_cycles: 4847 - } - entries { - instruction { - opcode: "power" - shape { element_type: C64 } - } - clock_cycles: 3219 - })pb" + entries { key: "sm_75" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 360 + } + entries { + instruction { + opcode: "power" + shape { element_type: S8 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 357 + } + entries { + instruction { + opcode: "power" + shape { element_type: S16 } + } + clock_cycles: 339 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S32 } + } + clock_cycles: 296 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 979 + } + entries { + instruction { + opcode: "power" + shape { element_type: S64 } + } + clock_cycles: 495 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 293 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 334 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 290 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 336 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U32 } + } + clock_cycles: 118 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 812 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 515 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 792 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F16 } + } + clock_cycles: 815 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F16 } + } + clock_cycles: 132 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F16 } + } + clock_cycles: 342 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 239 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F16 } + } + clock_cycles: 262 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F16 } + } + clock_cycles: 126 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F16 } + } + clock_cycles: 794 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 123 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 175 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F16 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F16 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "power" + shape { element_type: F16 } + } + clock_cycles: 1120 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F32 } + } + clock_cycles: 783 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F32 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F32 } + } + clock_cycles: 83 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F32 } + } + clock_cycles: 319 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 201 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F32 } + } + clock_cycles: 218 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F32 } + } + clock_cycles: 181 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F32 } + } + clock_cycles: 717 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F32 } + } + clock_cycles: 74 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 167 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F32 } + } + clock_cycles: 414 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1085 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 6494 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: F64 } + } + clock_cycles: 1800 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: F64 } + } + clock_cycles: 1630 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 1929 + } + entries { + instruction { + opcode: "log" + shape { element_type: F64 } + } + clock_cycles: 596 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F64 } + } + clock_cycles: 1774 + } + entries { + instruction { + opcode: "logistic" + shape { element_type: F64 } + } + clock_cycles: 2430 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: F64 } + } + clock_cycles: 705 + } + entries { + instruction { + opcode: "sine" + shape { element_type: F64 } + } + clock_cycles: 1805 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F64 } + } + clock_cycles: 984 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F64 } + } + clock_cycles: 1535 + } + entries { + instruction { + opcode: "add" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: F64 } + } + clock_cycles: 3744 + } + entries { + instruction { + opcode: "divide" + shape { element_type: F64 } + } + clock_cycles: 1915 + } + entries { + instruction { + opcode: "multiply" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "power" + shape { element_type: F64 } + } + clock_cycles: 5538 + } + entries { + instruction { + opcode: "subtract" + shape { element_type: F64 } + } + clock_cycles: 95 + } + entries { + instruction { + opcode: "cosine" + shape { element_type: C64 } + } + clock_cycles: 1702 + } + entries { + instruction { + opcode: "exponential" + shape { element_type: C64 } + } + clock_cycles: 1503 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: C64 } + } + clock_cycles: 1474 + } + entries { + instruction { + opcode: "log" + shape { element_type: C64 } + } + clock_cycles: 835 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: C64 } + } + clock_cycles: 737 + } + entries { + instruction { + opcode: "rsqrt" + shape { element_type: C64 } + } + clock_cycles: 2232 + } + entries { + instruction { + opcode: "sine" + shape { element_type: C64 } + } + clock_cycles: 1632 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: C64 } + } + clock_cycles: 2989 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: C64 } + } + clock_cycles: 2263 + } + entries { + instruction { + opcode: "atan2" + shape { element_type: C64 } + } + clock_cycles: 4847 + } + entries { + instruction { + opcode: "power" + shape { element_type: C64 } + } + clock_cycles: 3219 + })pb" R"pb( entries { instruction { @@ -3716,7 +3712,7 @@ constexpr char kDeviceHloOpProfiles[] = clock_cycles: 97 } } - } + } )pb"; } // namespace gpu From 866e013f3c2417cd6f03a935cff92fe3866dfe74 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 18:21:14 -0700 Subject: [PATCH 23/25] . --- xla/service/gpu/fusions/mlir/erase_dead_functions.cc | 3 +-- xla/service/gpu/fusions/mlir/merge_pointers_to_same_slice.cc | 3 +-- xla/service/gpu/fusions/mlir/optimize_loops.cc | 3 +-- xla/service/gpu/fusions/mlir/unswitch_loops.cc | 3 +-- xla/service/gpu/fusions/mlir/vectorize_loads_stores.cc | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/xla/service/gpu/fusions/mlir/erase_dead_functions.cc b/xla/service/gpu/fusions/mlir/erase_dead_functions.cc index 012201a76fe9c..3c8a4d4dfb901 100644 --- a/xla/service/gpu/fusions/mlir/erase_dead_functions.cc +++ b/xla/service/gpu/fusions/mlir/erase_dead_functions.cc @@ -77,8 +77,7 @@ class EraseDeadFunctionsPass } // namespace -std::unique_ptr> -CreateEraseDeadFunctionsPass() { +std::unique_ptr CreateEraseDeadFunctionsPass() { return std::make_unique(); } diff --git a/xla/service/gpu/fusions/mlir/merge_pointers_to_same_slice.cc b/xla/service/gpu/fusions/mlir/merge_pointers_to_same_slice.cc index c1899d27c6887..c9bcc4cd347f4 100644 --- a/xla/service/gpu/fusions/mlir/merge_pointers_to_same_slice.cc +++ b/xla/service/gpu/fusions/mlir/merge_pointers_to_same_slice.cc @@ -108,8 +108,7 @@ void MergePointersToSameSlicePass::runOnOperation() { } // namespace -std::unique_ptr> -CreateMergePointersToSameSlicePass() { +std::unique_ptr CreateMergePointersToSameSlicePass() { return std::make_unique(); } diff --git a/xla/service/gpu/fusions/mlir/optimize_loops.cc b/xla/service/gpu/fusions/mlir/optimize_loops.cc index 6d5456f015032..f7c9829c66194 100644 --- a/xla/service/gpu/fusions/mlir/optimize_loops.cc +++ b/xla/service/gpu/fusions/mlir/optimize_loops.cc @@ -306,8 +306,7 @@ class OptimizeLoopsPass } // namespace -std::unique_ptr> -CreateOptimizeLoopsPass() { +std::unique_ptr CreateOptimizeLoopsPass() { return std::make_unique(); } diff --git a/xla/service/gpu/fusions/mlir/unswitch_loops.cc b/xla/service/gpu/fusions/mlir/unswitch_loops.cc index 7d963f31292b4..1775c0ed0e134 100644 --- a/xla/service/gpu/fusions/mlir/unswitch_loops.cc +++ b/xla/service/gpu/fusions/mlir/unswitch_loops.cc @@ -97,8 +97,7 @@ void UnswitchLoopsPass::runOnOperation() { } // namespace -std::unique_ptr> -CreateUnswitchLoopsPass() { +std::unique_ptr CreateUnswitchLoopsPass() { return std::make_unique(); } diff --git a/xla/service/gpu/fusions/mlir/vectorize_loads_stores.cc b/xla/service/gpu/fusions/mlir/vectorize_loads_stores.cc index 00079845867fb..9b37ec69eff56 100644 --- a/xla/service/gpu/fusions/mlir/vectorize_loads_stores.cc +++ b/xla/service/gpu/fusions/mlir/vectorize_loads_stores.cc @@ -350,8 +350,7 @@ class VectorizeLoadsAndStoresPass } // namespace -std::unique_ptr> -CreateVectorizeLoadsAndStoresPass() { +std::unique_ptr CreateVectorizeLoadsAndStoresPass() { return std::make_unique(); } From e289dfffdbe7641c94360b566e16a4c66c6401c1 Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 21:45:31 -0700 Subject: [PATCH 24/25] . --- xla/stream_executor/cuda/cuda_executor.cc | 5 +---- xla/stream_executor/gpu/BUILD | 1 + xla/stream_executor/gpu/gpu_executor.h | 6 ++++-- xla/stream_executor/rocm/rocm_executor.cc | 4 ---- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/xla/stream_executor/cuda/cuda_executor.cc b/xla/stream_executor/cuda/cuda_executor.cc index 7f478df047be8..4981fa1c5ec32 100644 --- a/xla/stream_executor/cuda/cuda_executor.cc +++ b/xla/stream_executor/cuda/cuda_executor.cc @@ -69,6 +69,7 @@ limitations under the License. #include "xla/stream_executor/gpu/gpu_command_buffer.h" #include "xla/stream_executor/gpu/gpu_driver.h" #include "xla/stream_executor/gpu/gpu_event.h" +#include "xla/stream_executor/gpu/gpu_executor.h" #include "xla/stream_executor/gpu/gpu_kernel.h" #include "xla/stream_executor/gpu/gpu_runtime.h" #include "xla/stream_executor/gpu/gpu_semaphore.h" @@ -766,10 +767,6 @@ absl::StatusOr> GpuExecutor::CreateGpuEvent( return std::move(gpu_event); } -absl::StatusOr> GpuExecutor::CreateEvent() { - return CreateGpuEvent(/*allow_timing=*/false); -} - absl::StatusOr> GpuExecutor::CreateStream( std::optional> priority) { auto gpu_stream = std::make_unique(this); diff --git a/xla/stream_executor/gpu/BUILD b/xla/stream_executor/gpu/BUILD index 1da212056f868..e6c7700910c12 100644 --- a/xla/stream_executor/gpu/BUILD +++ b/xla/stream_executor/gpu/BUILD @@ -209,6 +209,7 @@ gpu_only_cc_library( deps = [ ":gpu_collectives_header", ":gpu_driver_header", + ":gpu_event_header", ":gpu_types_header", "//xla/stream_executor:blas", "//xla/stream_executor:command_buffer", diff --git a/xla/stream_executor/gpu/gpu_executor.h b/xla/stream_executor/gpu/gpu_executor.h index 13b9b944d1beb..ed739c618b383 100644 --- a/xla/stream_executor/gpu/gpu_executor.h +++ b/xla/stream_executor/gpu/gpu_executor.h @@ -48,6 +48,7 @@ limitations under the License. #include "xla/stream_executor/fft.h" #include "xla/stream_executor/gpu/gpu_collectives.h" #include "xla/stream_executor/gpu/gpu_driver.h" +#include "xla/stream_executor/gpu/gpu_event.h" #include "xla/stream_executor/gpu/gpu_types.h" #include "xla/stream_executor/host_memory_allocation.h" #include "xla/stream_executor/kernel.h" @@ -66,7 +67,6 @@ class StreamExecutor; namespace gpu { -class GpuEvent; class GpuKernel; class GpuCommandBuffer; class GpuStream; @@ -234,7 +234,9 @@ class GpuExecutor : public StreamExecutorCommon { dnn::DnnSupport* AsDnn() override; - absl::StatusOr> CreateEvent() override; + absl::StatusOr> CreateEvent() override { + return CreateGpuEvent(/*allow_timing=*/false); + }; absl::StatusOr> CreateStream( std::optional> priority = diff --git a/xla/stream_executor/rocm/rocm_executor.cc b/xla/stream_executor/rocm/rocm_executor.cc index 19a367a37ec27..949055c2f85c8 100644 --- a/xla/stream_executor/rocm/rocm_executor.cc +++ b/xla/stream_executor/rocm/rocm_executor.cc @@ -644,10 +644,6 @@ absl::StatusOr> GpuExecutor::CreateGpuEvent( return std::move(gpu_event); } -absl::StatusOr> GpuExecutor::CreateEvent() { - return CreateGpuEvent(/*allow_timing=*/false); -} - absl::StatusOr> GpuExecutor::CreateStream( std::optional> priority) { auto gpu_stream = std::make_unique(this); From 57c98d8f3a8dffc85f31e2c94e6d75a93d2e2b3b Mon Sep 17 00:00:00 2001 From: eaplatanios Date: Tue, 30 Jul 2024 21:46:53 -0700 Subject: [PATCH 25/25] . --- xla/stream_executor/cuda/cuda_executor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/xla/stream_executor/cuda/cuda_executor.cc b/xla/stream_executor/cuda/cuda_executor.cc index 4981fa1c5ec32..df77f6d914451 100644 --- a/xla/stream_executor/cuda/cuda_executor.cc +++ b/xla/stream_executor/cuda/cuda_executor.cc @@ -69,7 +69,6 @@ limitations under the License. #include "xla/stream_executor/gpu/gpu_command_buffer.h" #include "xla/stream_executor/gpu/gpu_driver.h" #include "xla/stream_executor/gpu/gpu_event.h" -#include "xla/stream_executor/gpu/gpu_executor.h" #include "xla/stream_executor/gpu/gpu_kernel.h" #include "xla/stream_executor/gpu/gpu_runtime.h" #include "xla/stream_executor/gpu/gpu_semaphore.h"