From 58481d606c19f4e46c1cd7dbfd4aba819ae024d3 Mon Sep 17 00:00:00 2001
From: "Chereshnev, Eugene" <eugene.chereshnev@intel.com>
Date: Tue, 1 Nov 2022 13:13:07 -0700
Subject: [PATCH] gpu: compute: remove lazy initialization for
 mayiuse_ngen_kernels()

- Most models rely on nGEN kernels, so the overhead of this check cannot
  be avoided anyway.
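- Lazy initialization prevents "full" engine serialization: if the check
  has not been performed yet, the serialized object is incomplete (the
  mayiuse_ngen_kernels() result is not serialized). This commit removes
  the lazy initialization and performs the check once, while the device
  info is initialized. As a consequence, the binary_kernels status is now
  reported in the verbose engine info line.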
---
 src/gpu/compute/compute_engine.hpp  |  4 ++--
 src/gpu/compute/device_info.cpp     | 24 ------------------------
 src/gpu/compute/device_info.hpp     |  6 ++----
 src/gpu/ocl/ocl_gpu_device_info.cpp |  3 ++-
 src/gpu/ocl/ocl_gpu_hw_info.cpp     | 11 +++++++++--
 src/gpu/ocl/ocl_gpu_hw_info.hpp     |  8 +++++---
 src/gpu/ocl/verbose.hpp             |  8 +++++---
 src/sycl/sycl_device_info.cpp       |  5 +++--
 src/sycl/verbose.hpp                |  7 ++++---
 9 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/src/gpu/compute/compute_engine.hpp b/src/gpu/compute/compute_engine.hpp
index b77dc2d2165..152af230204 100644
--- a/src/gpu/compute/compute_engine.hpp
+++ b/src/gpu/compute/compute_engine.hpp
@@ -125,8 +125,8 @@ class compute_engine_t : public engine_t {
     bool is_xe_hpc() const {
         return device_info_->gpu_arch() == gpu_arch_t::xe_hpc;
     }
-    bool mayiuse_ngen_kernels() {
-        return device_info_->mayiuse_ngen_kernels(this);
+    bool mayiuse_ngen_kernels() const {
+        return device_info_->mayiuse_ngen_kernels();
     }
     bool mayiuse_non_uniform_work_groups() const {
         return device_info_->mayiuse_non_uniform_work_groups();
diff --git a/src/gpu/compute/device_info.cpp b/src/gpu/compute/device_info.cpp
index 7236cea1dfc..dab3c5df27b 100644
--- a/src/gpu/compute/device_info.cpp
+++ b/src/gpu/compute/device_info.cpp
@@ -20,9 +20,6 @@
 
 #include "gpu/compute/device_info.hpp"
 
-#include "common/verbose.hpp"
-#include "gpu/jit/binary_format.hpp"
-
 #ifdef DNNL_WITH_SYCL
 #include "sycl/sycl_engine_base.hpp"
 #endif
@@ -59,25 +56,6 @@ uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch) {
     return extensions;
 }
 
-bool device_info_t::mayiuse_ngen_kernels(engine_t *engine) {
-    static std::mutex m;
-    std::lock_guard<std::mutex> guard(m);
-
-    if (checked_ngen_kernels_) return mayiuse_ngen_kernels_;
-
-    auto status
-            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels_, engine);
-    if (status != status::success) mayiuse_ngen_kernels_ = false;
-
-    if (get_verbose())
-        printf("onednn_verbose,info,gpu,binary_kernels:%s\n",
-                mayiuse_ngen_kernels_ ? "enabled" : "disabled");
-
-    checked_ngen_kernels_ = true;
-
-    return mayiuse_ngen_kernels_;
-}
-
 bool device_info_t::mayiuse_sub_group(int size) const {
     switch (gpu_arch()) {
         case gpu_arch_t::xe_hpc: return utils::one_of(size, 16, 32);
@@ -221,7 +199,6 @@ status_t device_info_t::init_serialized_device_info(
     serialized_device_info_.write(&llc_cache_size_);
     serialized_device_info_.write(&extensions_);
     serialized_device_info_.write(&mayiuse_ngen_kernels_);
-    serialized_device_info_.write(&checked_ngen_kernels_);
     serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);
 
     const size_t name_size = name_.size();
@@ -257,7 +234,6 @@ status_t device_info_t::init_from_cache_blob(
     DESERIALIZE(llc_cache_size_, size_t);
     DESERIALIZE(extensions_, uint64_t);
     DESERIALIZE(mayiuse_ngen_kernels_, bool);
-    DESERIALIZE(checked_ngen_kernels_, bool);
     DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
 #undef DESERIALIZE
 
diff --git a/src/gpu/compute/device_info.hpp b/src/gpu/compute/device_info.hpp
index 56ce64c55e6..b5f912bbd0c 100644
--- a/src/gpu/compute/device_info.hpp
+++ b/src/gpu/compute/device_info.hpp
@@ -239,7 +239,7 @@ struct device_info_t {
     }
     const std::string &name() const { return name_; }
 
-    bool mayiuse_ngen_kernels(engine_t *engine);
+    bool mayiuse_ngen_kernels() const { return mayiuse_ngen_kernels_; }
 
     bool mayiuse_non_uniform_work_groups() const {
         return mayiuse_non_uniform_work_groups_;
@@ -272,6 +272,7 @@ struct device_info_t {
 
     compute::gpu_arch_t gpu_arch_ = compute::gpu_arch_t::unknown;
     int stepping_id_ = 0;
+    bool mayiuse_ngen_kernels_ = false;
 
     std::string name_;
     runtime_version_t runtime_version_;
@@ -295,9 +296,6 @@ struct device_info_t {
             const std::vector<uint8_t> &cache_blob = {});
     status_t init_from_cache_blob(const std::vector<uint8_t> &cache_blob);
 
-    bool mayiuse_ngen_kernels_ = false;
-    bool checked_ngen_kernels_ = false;
-
     bool mayiuse_non_uniform_work_groups_ = false;
 
     serialization_stream_t serialized_device_info_;
diff --git a/src/gpu/ocl/ocl_gpu_device_info.cpp b/src/gpu/ocl/ocl_gpu_device_info.cpp
index 04beb719d58..dba90db72a4 100644
--- a/src/gpu/ocl/ocl_gpu_device_info.cpp
+++ b/src/gpu/ocl/ocl_gpu_device_info.cpp
@@ -39,7 +39,8 @@ status_t ocl_gpu_device_info_t::init_arch(engine_t *engine) {
             = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
     OCL_CHECK(err);
 
-    init_gpu_hw_info(device, context, gpu_arch_, stepping_id_);
+    init_gpu_hw_info(engine, device, context, gpu_arch_, stepping_id_,
+            mayiuse_ngen_kernels_);
 
     err = clReleaseContext(context);
     OCL_CHECK(err);
diff --git a/src/gpu/ocl/ocl_gpu_hw_info.cpp b/src/gpu/ocl/ocl_gpu_hw_info.cpp
index 75cda22957a..b45d49c6724 100644
--- a/src/gpu/ocl/ocl_gpu_hw_info.cpp
+++ b/src/gpu/ocl/ocl_gpu_hw_info.cpp
@@ -15,6 +15,8 @@
 *******************************************************************************/
 
 #include "gpu/ocl/ocl_gpu_hw_info.hpp"
+
+#include "gpu/jit/binary_format.hpp"
 #include "gpu/jit/jit_generator.hpp"
 #include "gpu/jit/ngen_type_bridge.hpp"
 
@@ -23,8 +25,9 @@ namespace impl {
 namespace gpu {
 namespace ocl {
 
-void init_gpu_hw_info(cl_device_id device, cl_context context,
-        compute::gpu_arch_t &gpu_arch, int &stepping_id) {
+void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context,
+        compute::gpu_arch_t &gpu_arch, int &stepping_id,
+        bool &mayiuse_ngen_kernels) {
     using namespace ngen;
 
     HW hw = HW::Unknown;
@@ -32,6 +35,10 @@ void init_gpu_hw_info(cl_device_id device, cl_context context,
             context, device, hw, stepping_id);
 
     gpu_arch = jit::convert_ngen_arch_to_dnnl(hw);
+
+    auto status
+            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
+    if (status != status::success) mayiuse_ngen_kernels = false;
 }
 
 } // namespace ocl
diff --git a/src/gpu/ocl/ocl_gpu_hw_info.hpp b/src/gpu/ocl/ocl_gpu_hw_info.hpp
index 9f42443a342..be1bd25d242 100644
--- a/src/gpu/ocl/ocl_gpu_hw_info.hpp
+++ b/src/gpu/ocl/ocl_gpu_hw_info.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
+* Copyright 2020-2022 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 
 #include <CL/cl.h>
 
+#include "common/c_types_map.hpp"
 #include "gpu/compute/device_info.hpp"
 
 namespace dnnl {
@@ -26,8 +27,9 @@ namespace impl {
 namespace gpu {
 namespace ocl {
 
-void init_gpu_hw_info(cl_device_id device, cl_context context,
-        compute::gpu_arch_t &gpu_arch, int &stepping_id);
+void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context,
+        compute::gpu_arch_t &gpu_arch, int &stepping_id,
+        bool &mayiuse_ngen_kernels);
 
 } // namespace ocl
 } // namespace gpu
diff --git a/src/gpu/ocl/verbose.hpp b/src/gpu/ocl/verbose.hpp
index e6f02650abf..257be97cdd3 100644
--- a/src/gpu/ocl/verbose.hpp
+++ b/src/gpu/ocl/verbose.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2021 Intel Corporation
+* Copyright 2019-2022 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -39,8 +39,10 @@ void print_verbose_header() {
         auto s_name = dev_info ? dev_info->name() : "unknown";
         auto s_ver = dev_info ? dev_info->runtime_version().str() : "unknown";
 
-        printf("onednn_verbose,info,gpu,engine,%d,name:%s,driver_version:%s\n",
-                (int)i, s_name.c_str(), s_ver.c_str());
+        printf("onednn_verbose,info,gpu,engine,%d,name:%s,driver_version:%s,"
+               "binary_kernels:%s\n", (int)i, s_name.c_str(), s_ver.c_str(),
+                (dev_info && dev_info->mayiuse_ngen_kernels())
+                        ? "enabled" : "disabled"); // dev_info may be null
     }
 }
 
diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp
index e2b9f9d6096..44ff2155a38 100644
--- a/src/sycl/sycl_device_info.cpp
+++ b/src/sycl/sycl_device_info.cpp
@@ -50,8 +50,8 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) {
                 clCreateContext(nullptr, 1, &ocl_dev, nullptr, nullptr, &err));
         OCL_CHECK(err);
 
-        gpu::ocl::init_gpu_hw_info(
-                ocl_dev_wrapper, ocl_ctx_wrapper, gpu_arch_, stepping_id_);
+        gpu::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper, ocl_ctx_wrapper,
+                gpu_arch_, stepping_id_, mayiuse_ngen_kernels_);
     } else if (be == backend_t::level0) {
         // TODO: add support for L0 binary ngen check
         // XXX: query from ocl_engine for now
@@ -68,6 +68,7 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) {
         auto *dev_info = compute_engine->device_info();
         gpu_arch_ = dev_info->gpu_arch();
         stepping_id_ = dev_info->stepping_id();
+        mayiuse_ngen_kernels_ = dev_info->mayiuse_ngen_kernels();
     } else {
         assert(!"not_expected");
     }
diff --git a/src/sycl/verbose.hpp b/src/sycl/verbose.hpp
index cb5ddae702a..0ae4eb503bc 100644
--- a/src/sycl/verbose.hpp
+++ b/src/sycl/verbose.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2021 Intel Corporation
+* Copyright 2019-2022 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,9 +41,10 @@ void print_verbose_header(engine_kind_t kind) {
         auto s_ver = dev_info ? dev_info->runtime_version().str() : "unknown";
 
         printf("onednn_verbose,info,%s,engine,%d,backend:%s,name:%s,driver_"
-               "version:%s\n",
+               "version:%s,binary_kernels:%s\n",
                 s_engine_kind, (int)i, s_backend.c_str(), s_name.c_str(),
-                s_ver.c_str());
+                s_ver.c_str(), (dev_info && dev_info->mayiuse_ngen_kernels())
+                        ? "enabled" : "disabled"); // dev_info may be null
     }
 }