diff --git a/CMakeLists.txt b/CMakeLists.txt index 24821d52..a5a8feb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.17) -project(cudnn_frontend VERSION 1.0.3) +project(cudnn_frontend VERSION 1.1.0) option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON) option(CUDNN_FRONTEND_BUILD_UNIT_TESTS "Defines if unittests are built or not." OFF) diff --git a/README.FE.1.0.md b/README.FE.1.0.md index 337851bd..196c5b12 100644 --- a/README.FE.1.0.md +++ b/README.FE.1.0.md @@ -9,8 +9,8 @@ 6. [Miscellaneous](#Miscellaneous) ## Introduction -FE v1.0 API is aimed to extend functionality and usage exposed by the [cuDNN C backend API](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnn-backend-api). Both C++ and python APIs are provided with both having functional parity. -For a general introduction to FE, please first refer README.md +FE v1.0 API is aimed to extend functionality and usage exposed by the [cuDNN C backend API](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnn-backend-api). Both C++ and python APIs are provided, and both have functional parity. +For a general introduction to FE, please start with README.md. ## Workflow The steps involved in building and running a cudnn graph are as follows: @@ -97,6 +97,14 @@ This method internally queries the heuristics for engine configs for the given h cudnn_frontend::error_t cudnn_frontend::graph::Graph::get_execution_plans(std::vector) ``` +### Get execution plan count +This method returns the number of execution plans returned by cudnn heuristics. Each plan gets an index from 0 to #plans-1, with 0 having top priority. + +``` +int64_t +cudnn_frontend::graph::Graph::get_execution_plan_count() const; +``` + ### Check graph support This method guarantees that executing the graph using plans queried will succeed. ``` cudnn_frontend::error_t cudnn_frontend::graph::Graph::check_support(cudnnHandle_t handle); ``` ### Build plans -This method builds one or all the engine configs that was queries during the create_execution_plan phase. +This function builds the execution plans queried with the `create_execution_plans(...)` API. + +There are two flavours of this API: + +Use this method to build execution plans according to a policy. It is suitable when trusting cudnn heuristics to return the most suitable execution plan with top priority. +``` +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::build_plans( + cudnnHandle_t const &handle, + cudnn_frontend::BuildPlanPolicy_t const policy, + bool const do_multithreaded_builds +); +``` + +Use this method to build an individual plan by index. The main use case is to build execution plans in parallel when autotuning. +The valid range of plan indices can be queried with the `get_execution_plan_count()` API. ``` -cudnn_frontend::error_t cudnn_frontend::graph::Graph::build_plans(cudnnHandle_t const &handle, - cudnn_frontend::BuildPlanPolicy_t const policy, - bool const do_multithreaded_builds); +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::build_plan_at_index( + cudnnHandle_t const &handle, + int64_t plan_index +); ``` + + ### Filter plans (optional) Users can filter out plans against numerical, behavioral notes, or plans that do not provide desired functional correctness.
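As an editorial aside, not part of the patch above: a minimal sketch of how the plan-query and plan-build calls documented in this README.FE.1.0.md hunk might be combined. The helper name `build_all_candidate_plans` and the assumption that the graph has already gone through `validate()`, `build_operation_graph(...)` and `create_execution_plans(...)` are illustrative only.

```cpp
// Illustrative sketch only -- not part of this patch. Assumes `graph` has already been
// validated and lowered (validate(), build_operation_graph(handle),
// create_execution_plans(...)) and that `handle` is a valid cudnnHandle_t.
#include <cstdint>
#include <cudnn_frontend.h>

namespace fe = cudnn_frontend;

// Hypothetical helper: build every candidate plan individually, e.g. ahead of autotuning.
inline bool build_all_candidate_plans(fe::graph::Graph &graph, cudnnHandle_t handle) {
    // Bail out early if none of the queried engine configs can run this graph.
    if (graph.check_support(handle).is_bad()) {
        return false;
    }

    // Flavour 1: trust heuristics and build only the top-priority plan.
    //   graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE);

    // Flavour 2: build each plan by index; indices run from 0 to get_execution_plan_count() - 1.
    for (int64_t i = 0; i < graph.get_execution_plan_count(); ++i) {
        if (graph.build_plan_at_index(handle, i).is_bad()) {
            return false;
        }
    }
    return true;
}
```

Either flavour leaves the graph ready for the execute APIs described in the next hunk.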
@@ -139,18 +166,40 @@ cudnn_frontend::graph::Graph::autotune(cudnnHandle_t handle, ### Execute Executing graph requires device pointers to all input output tensors and a user allocated device workspace pointer. +Two flavours of execute exist, corresponding to the two flavours of the `build_plans(...)` API. + +This API uses the candidate execution plan that has already been set. The candidate execution plan gets set internally either: +- when `BuildPlanPolicy_t::HEURISTICS_CHOICE` is used, or +- as the last plan that got built. + ``` cudnn_frontend::error_t -cudnn_frontend::graph::Graph::execute(cudnnHandle_t handle, - std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, - void* workspace); +cudnn_frontend::graph::Graph::execute( + cudnnHandle_t handle, + std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, + void* workspace +); +``` + +The execute API also takes a plan index to target a specific plan. This may be used when autotuning, in conjunction with the `build_plan_at_index(...)` API. +``` +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::execute( + cudnnHandle_t handle, + std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, + void* workspace, + int64_t plan_index +); ``` ### Miscellaneous APIs Get workspace to execute the current selected execution plan. +It can also take in a plan index to query the workspace for. This may be used when autotuning, in conjunction with the `build_plan_at_index(...)` API. + `int64_t get_workspace_size() const` +`int64_t get_workspace_size_plan_index(int64_t plan_index) const` Get workspace to run autotune on all plans.
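Another editorial aside, not part of the patch: an autotuning-style loop that combines the plan-index overload of `execute(...)` with the per-index workspace query listed above. The helper name `pick_fastest_plan`, the CUDA-event timing, and the assumption that the handle runs on the default stream are illustrative; `get_workspace_size_plan_index` is taken from the Miscellaneous APIs text above and is an assumption about the exact method name.

```cpp
// Illustrative autotuning-style loop -- not part of this patch. Assumes every candidate
// plan was already built with build_plan_at_index(...), `handle` is a cudnnHandle_t bound
// to the default stream, and `variant_pack` maps the graph's Tensor_attributes to device
// pointers. The per-index workspace query name follows the Miscellaneous APIs text above.
#include <cstdint>
#include <limits>
#include <memory>
#include <unordered_map>

#include <cuda_runtime.h>
#include <cudnn_frontend.h>

namespace fe = cudnn_frontend;

inline int64_t pick_fastest_plan(
    fe::graph::Graph &graph,
    cudnnHandle_t handle,
    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void *> const &variant_pack) {
    int64_t best_index = -1;
    float best_ms = std::numeric_limits<float>::max();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int64_t i = 0; i < graph.get_execution_plan_count(); ++i) {
        // Workspace requirements can differ per plan, so query and allocate per index.
        void *workspace = nullptr;
        cudaMalloc(&workspace, static_cast<size_t>(graph.get_workspace_size_plan_index(i)));

        cudaEventRecord(start);
        auto status = graph.execute(handle, variant_pack, workspace, i);  // plan-index overload
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        if (!status.is_bad() && ms < best_ms) {
            best_ms = ms;
            best_index = i;
        }
        cudaFree(workspace);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return best_index;  // -1 if no plan executed successfully
}
```

The winning index can then be reused with the plan-index overload of `execute(...)` for subsequent launches.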
@@ -167,8 +216,7 @@ Samples are meant to illustrate FE v1.0 API usage to users. - `samples/cpp` contains samples that use C++ API. - `samples/python` contains samples that use python API. -C++ samples are written using [Catch2](https://github.com/catchorg/Catch2) test framework. -Python samples are written using [pytest](https://github.com/pytest-dev/pytest) and [pytorch](https://pytorch.org), with both requiring external installation. +Python samples are jupyter notebooks with a step-by-step guide on using the FE v1 API. ## Operations diff --git a/README.md b/README.md index af1082b3..573e1d32 100644 --- a/README.md +++ b/README.md @@ -31,56 +31,63 @@ cudnn can be installed from Minimum python version needed 3.6 The python binding compilation requires development package which can be installed by running `apt-get install python-dev`. -To run the python samples, additionally, you will need the following python packages +To run the python samples, additionally, you will need the following python packages: - pytest -- pytorch-cuda=12.1 (or pytorch-cuda=11.8) -- torchvision -- torchaudio -- pytorch +- torch +- jupyter + + +### Python API +Install FE python API by running: +``` +pip install git+https://github.com/NVIDIA/cudnn-frontend.git +``` + +The above command picks cuda and cudnn from the default system paths. + +To provide a custom CUDA installation path, use the environment variable: `CUDAToolkit_ROOT`. +To provide a custom CUDNN installation path, use the environment variable: `CUDNN_PATH`. + + +To test whether the installation is successful, run: +``` +pytest tests/python_fe +``` + +NOTE: Only v1.0 API is exposed via python bindings. ### C++ API -C++ API is header only library. The following compilation steps are only required for building the samples and python bindings. +The C++ API is a header-only library. + +The root CMakeLists.txt can be used as a reference to include the cudnn_frontend in your project's build system. -The CMakeLists.txt can be used reference to include the cudnn_frontend in your project. +#### Building samples +The following compilation steps are only required for building the samples and/or python bindings. -Provide CUDA according to: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html +Provide the CUDA installation path according to: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html + +Provide the CUDNN installation path using the CUDNN_PATH env variable or cmake parameter. CUDNN_PATH has the cudnn installation: - Headers are in CUDNN_PATH/include. - Libraries are in CUDNN_PATH/lib or CUDNN_PATH/lib64 or CUDNN_PATH/lib/x64. -From project Root, - +For an out-of-source build, ``` -mkdir build; cd build +mkdir build +cd build cmake -DCUDNN_PATH=/path/to/cudnn -DCUDAToolkit_ROOT=/path/to/cuda ../ cmake --build . -j16 bin/samples ``` -Skip building samples by providing `CUDNN_FRONTEND_BUILD_SAMPLES=OFF` as cmake parameter. -Skip building python bindings by providing `CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS=OFF` as cmake parameter. - -In case, you have a stale cmake cache and want to update the cudnn/cuda paths, please delete the cmake cache (or build directory and redo the above steps). - -### Python API -Install FE python API by running: -pip install git+https://github.com/NVIDIA/cudnn-frontend.git - -Incase of custom installation of CUDA and CUDNN, the default path can be overriden by: +To skip building samples, use `-DCUDNN_FRONTEND_BUILD_SAMPLES=OFF`. -`CUDAToolkit_ROOT=/path/to/cuda CUDNN_PATH=/path/to/cudnn pip install /path/to/cudnn_frontend`. +To skip building python bindings, use `-DCUDNN_FRONTEND_BUILD_PYTHON_BINDINGS=OFF`. -To provide a custom CUDA, export environment variable: `CUDAToolkit_ROOT`. -To provide a custom CUDNN, export environment variable: `CUDNN_PATH`. - -``` - pytest samples/python -``` - -NOTE: Only v1.0 API is exposed via python bindings. +In case you have a stale cmake cache and want to update the cudnn/cuda paths, please delete the cmake cache (or the build directory) and redo the above steps. ## Debugging For initial debugging, we recommend turning on the cudnn FE logging and checking for warnings and errors. @@ -108,4 +115,5 @@ No external contribution to this repository is accepted. Please create an issue ## Feedback Support, resources, and information about cuDNN can be found online at https://developer.nvidia.com/cudnn. + Also, bugs and RFEs can be reported in the issues section. diff --git a/docs/operations/Attention.md b/docs/operations/Attention.md index 092ef3a1..78f9f617 100644 --- a/docs/operations/Attention.md +++ b/docs/operations/Attention.md @@ -27,6 +27,7 @@ using the FlashAttention-2 algorithm as described in the paper [FlashAttention-2 - To use an user-provided dropout mask, users must provide: - `dropout mask` that matches the attention weights' dimensions, indicating which weights to drop. - `dropout scale` used to adjust the scale of the remaining weights accordingly, such as $1 / (1 - \text{dropout probability})$. +- Ragged tensor: allows the query, key, value, and output tensors to be [ragged tensors](https://www.tensorflow.org/guide/ragged_tensor), which are tensors with nested variable length lists as inner dimensions. Users must pass another tensor called the ragged offset tensor using the `Tensor_attributes.set_ragged_offset()` method as specified in the tensors section below. When multiple masking options are enabled, they are applied in the listed order above. @@ -43,6 +44,7 @@ The dimensions that are passed as 1 will apply a broadcasted mask over attention - (Optional) When philox RNG dropout mask is enabled, the RNG seed and offset tensors should have size $(1, 1, 1, 1)$ with int32 or int64 datatype as either a CPU or GPU tensor.
- (Optional) When a user provided dropout mask is enabled, a dropout mask tensor should have shape $(1, 1, S_{q}, S_{kv})$, $(1, H_{q}, S_{q}, S_{kv})$, $(B, 1, S_{q}, S_{kv})$, or $(B, H_{q}, S_{q}, S_{kv})$ with input/output datatype. The dimensions that are passed as 1 will apply a broadcasted mask over attention weights. +- (Optional) When query, key, value, and output tensors are ragged tensors, the ragged offset tensor must be a tensor of size $(B + 1, 1, 1, 1)$ that contains the nested tensor's offset in terms of number of elements (not bytes). The last value of the offset tensor specifies the offset of the past-the-end element of the ragged tensor. Where, @@ -96,7 +98,7 @@ SDPA_attributes & set_bias(std::shared_ptr value); SDPA_attributes& -set_alibi_mask(bool const value) +set_alibi_mask(bool const value); SDPA_attributes& set_padding_mask(bool const value); @@ -120,7 +122,7 @@ set_dropout(std::shared_ptr mask, std::shared_ptr scale); SDPA_attributes & -set_compute_data_type(DataType_t value) +set_compute_data_type(DataType_t value); ``` **Python API:** @@ -153,7 +155,7 @@ This operation computes gradient tensors for scaled dot product attention using #### Configurable Options: -All the options mentioned in the forward operation, including GQA and MQA, are applicable in the backward operation as well. +All the options mentioned in the forward operation, including ragged tensors and GQA/MQA, are applicable in the backward operation as well. #### Tensors: @@ -181,19 +183,19 @@ The `options` parameter of type `SDPA_backward_attributes` is used to control th ```cpp SDPA_backward_attributes& -set_attn_scale(std::shared_ptr value) +set_attn_scale(std::shared_ptr value); SDPA_backward_attributes& set_attn_scale(float const value); SDPA_backward_attributes& -set_bias(std::shared_ptr value) +set_bias(std::shared_ptr value); SDPA_backward_attributes& -set_dbias(std::shared_ptr value) +set_dbias(std::shared_ptr value); SDPA_backward_attributes& -set_alibi_mask(bool const value) +set_alibi_mask(bool const value); SDPA_backward_attributes& set_padding_mask(bool const value); @@ -205,20 +207,20 @@ SDPA_backward_attributes& set_seq_len_kv(std::shared_ptr value); SDPA_backward_attributes& -set_causal_mask(bool const value) +set_causal_mask(bool const value); SDPA_backward_attributes& set_dropout(float const probability, std::shared_ptr seed, - std::shared_ptr offset) + std::shared_ptr offset); SDPA_backward_attributes& set_dropout(std::shared_ptr mask, std::shared_ptr scale, - std::shared_ptr scale_inv) + std::shared_ptr scale_inv); SDPA_backward_attributes& -set_compute_data_type(DataType_t const value) +set_compute_data_type(DataType_t const value); ``` Python API: diff --git a/include/cudnn_frontend.h b/include/cudnn_frontend.h index d2946bb5..0f0d5a66 100644 --- a/include/cudnn_frontend.h +++ b/include/cudnn_frontend.h @@ -121,10 +121,11 @@ #include "cudnn_frontend_Resample.h" #include "cudnn_frontend/graph_interface.h" +#include "cudnn_frontend/utils/serialize.h" #define CUDNN_FRONTEND_MAJOR_VERSION 1 -#define CUDNN_FRONTEND_MINOR_VERSION 0 -#define CUDNN_FRONTEND_PATCH_VERSION 3 +#define CUDNN_FRONTEND_MINOR_VERSION 1 +#define CUDNN_FRONTEND_PATCH_VERSION 0 #define CUDNN_FRONTEND_VERSION \ ((CUDNN_FRONTEND_MAJOR_VERSION * 10000) + (CUDNN_FRONTEND_MINOR_VERSION * 100) + CUDNN_FRONTEND_PATCH_VERSION) diff --git a/include/cudnn_frontend/cudnn_interface.h b/include/cudnn_frontend/cudnn_interface.h index 8c12cdc4..6fe8bcf2 100644 --- a/include/cudnn_frontend/cudnn_interface.h +++ 
b/include/cudnn_frontend/cudnn_interface.h @@ -42,48 +42,59 @@ class ICudnn { // TODO: Always returns OK. Can the status and error message be accessed from tensor descriptor? error_t create_cudnn_tensor(std::shared_ptr const& props, - int64_t& uid, - std::unordered_map>& tensors) const { + uid_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const { // Check whether tensor already created - // TODO: Do not reply on uid being 0? - if (props->get_uid() == 0) { - // Make sure no other tensor somehow already has claimed uid. - RETURN_CUDNN_FRONTEND_ERROR_IF(tensors.find(uid) != tensors.end(), - error_code_t::ATTRIBUTE_NOT_SET, - "Trying to assign same uid to possibily two different tensors."); + // Make sure no other tensor somehow already has claimed uid. + + auto tensor_uid = props->has_uid() ? props->get_uid() : uid; + if (tensors.find(tensor_uid) != tensors.end()) { + getLogger() << "[cudnn_frontend] INFO: Shared Tensor" << uid << " already created." << std::endl; + return {error_code_t::OK, ""}; + } + + if (props->has_uid() == false) { props->set_uid(uid); - uid++; - - auto&& tensor_builder = cudnn_frontend::TensorBuilder(); - - tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) - .setStrides(props->get_stride().size(), props->get_stride().data()) - .setId(props->get_uid()) - .setAlignment(16) - .setDataType(props->get_data_type()) - .setVirtual(props->get_is_virtual()) - .setByValue(props->get_is_pass_by_value()) - .setReorderType(props->get_reordering_type()); - - if (auto ragged_offset_props = props->get_ragged_offset()) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, uid, tensors)); - tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); - } + do { + uid++; + } while (invalid_uids.find(uid) != invalid_uids.end()); + } + + auto&& tensor_builder = cudnn_frontend::TensorBuilder(); + tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) + .setStrides(props->get_stride().size(), props->get_stride().data()) + .setId(props->get_uid()) + .setAlignment(16) + .setDataType(props->get_data_type()) + .setVirtual(props->get_is_virtual()) + .setByValue(props->get_is_pass_by_value()) + .setReorderType(props->get_reordering_type()); + + if (auto ragged_offset_props = props->get_ragged_offset()) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, uid, tensors, invalid_uids)); + tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); + } + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto tensor = tensor_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF( + tensor.get_status() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, tensor.get_error()); + tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); +#else + // build() can throw + // wrap in try catch + try { auto tensor = tensor_builder.build(); tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); - - } else { - // Make sure tensor's uid is present in backend tensor registry. 
+ } catch (cudnn_frontend::cudnnException& e) { RETURN_CUDNN_FRONTEND_ERROR_IF( - tensors.find(props->get_uid()) == tensors.end(), - error_code_t::ATTRIBUTE_NOT_SET, - "Backend tensor already not found for non-zero Id: " + std::to_string(props->get_uid())); - - getLogger() << "[cudnn_frontend] INFO: Backend tensor already created for Id: " + - std::to_string(props->get_uid()) - << std::endl; + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } +#endif return {error_code_t::OK, ""}; } @@ -94,26 +105,50 @@ class ICudnn { for (std::shared_ptr operation : operations) { cudnn_operations.push_back(operation.get()); } - auto cudnn_operation_graph = cudnn_frontend::OperationGraphBuilder() - .setHandle(handle) - .setOperationGraph(cudnn_operations.size(), cudnn_operations.data()) - .build(); + auto&& cudnn_operation_graph_builder = cudnn_frontend::OperationGraphBuilder(); + cudnn_operation_graph_builder.setHandle(handle).setOperationGraph(cudnn_operations.size(), + cudnn_operations.data()); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(cudnn_operation_graph.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + cudnn_operation_graph.get_error()); operation_graphs.push_back(std::make_shared(std::move(cudnn_operation_graph))); - getLogger() << "[cudnn_frontend] INFO: Successfully built Operation Graphs." << std::endl; - - return {error_code_t::OK, ""}; +#else + // build() can throw + // wrap in try catch + try { + auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + operation_graphs.push_back(std::make_shared(std::move(cudnn_operation_graph))); + } catch (cudnn_frontend::cudnnException& e) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); + } +#endif + return {error_code_t::OK, "Successfully built Operation Graphs."}; } public: - int64_t - get_cudnn_workspace_size_node() const { - int64_t current_workspace_size = 0; + error_t + get_cudnn_workspace_size_node(int64_t const plan_index, int64_t& cudnn_workspace_size) const { for (auto const& execution_plan_list : plans) { - current_workspace_size = - std::max(current_workspace_size, execution_plan_list.get_best_candidate()->getWorkspaceSize()); + int64_t candidate = plan_index != -1 ? plan_index : execution_plan_list.candidate; + RETURN_CUDNN_FRONTEND_ERROR_IF( + (candidate < 0) && (static_cast(execution_plan_list.execution_plans.size()) <= candidate), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index is invalid."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(!(execution_plan_list.execution_plans[candidate]), + error_code_t::GRAPH_EXECUTION_FAILED, + "No candidate plan found for graph to query worksapce for."); + cudnn_workspace_size = + std::max(cudnn_workspace_size, execution_plan_list.execution_plans[candidate]->getWorkspaceSize()); } - return current_workspace_size; + return {error_code_t::OK, ""}; } int64_t @@ -126,22 +161,18 @@ class ICudnn { } error_t - execute_cudnn_plans(cudnnHandle_t handle, - std::unordered_map const& tensor_uid_to_pointer_map, - void* workspace_ptr) const { - getLogger() << "[cudnn_frontend] INFO: Executing " << plans.size() << " Plans." 
<< std::endl; + execute_cudnn_plans_with_uid(cudnnHandle_t handle, + std::unordered_map const& tensor_uid_to_pointer_map, + void* workspace_ptr, + int64_t plan_index = -1) const { + getLogger() << "[cudnn_frontend] INFO: Executing " << plans.size() << " plans." << std::endl; + // Go over each plan list for (size_t i = 0; i < plans.size(); ++i) { - auto const& execution_plan = plans[i].get_best_candidate(); - RETURN_CUDNN_FRONTEND_ERROR_IF( - execution_plan == nullptr, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); - auto const& variant_pack_uid = variant_pack_uids[i]; - - getLogger() << "[cudnn_frontend] INFO: Executing " << execution_plan->getTag() << "..." << std::endl; - + // Make sure device pointer is provided for all uids expected for this plan std::vector device_ptrs; std::vector uids; - for (auto const& uid : variant_pack_uid) { + for (auto const& uid : variant_pack_uids[i]) { auto search = tensor_uid_to_pointer_map.find(uid); RETURN_CUDNN_FRONTEND_ERROR_IF(search == tensor_uid_to_pointer_map.end(), error_code_t::INVALID_VARIANT_PACK, @@ -149,25 +180,19 @@ class ICudnn { device_ptrs.push_back(tensor_uid_to_pointer_map.at(uid)); uids.push_back(uid); } - auto variant_pack = VariantPackBuilder() - .setDataPointers(device_ptrs.size(), device_ptrs.data()) - .setUids(uids.size(), uids.data()) - .setWorkspacePointer(workspace_ptr) - .build(); - if (variant_pack.get_status() != CUDNN_STATUS_SUCCESS) { - std::string message = "[cudnn_frontend] ERROR: Variant pack creation failed with " + - std::string(variant_pack.get_error()); - return {error_code_t::INVALID_VARIANT_PACK, message}; - } - getLogger() << "[cudnn_frontend] INFO: Built variant pack for " << execution_plan->getTag() << "..." - << std::endl; - auto status = cudnnBackendExecute(handle, execution_plan->get_raw_desc(), variant_pack.get_raw_desc()); - if (status != CUDNN_STATUS_SUCCESS) { - std::string message = "[cudnn_frontend] ERROR: Graph execution failed."; - return {error_code_t::GRAPH_EXECUTION_FAILED, message}; - } - getLogger() << "[cudnn_frontend] INFO: Executed " << execution_plan->getTag() << "." << std::endl; + int64_t candidate = plan_index != -1 ? 
plan_index : plans[i].candidate; + RETURN_CUDNN_FRONTEND_ERROR_IF( + (candidate < 0) && (static_cast(plans[i].execution_plans.size()) <= candidate), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index is invalid."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(!(plans[i].execution_plans[candidate]), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index does not correspond to a valid plan."); + + CHECK_CUDNN_FRONTEND_ERROR( + detail::execute(handle, plans[i].execution_plans[candidate].get(), device_ptrs, uids, workspace_ptr)); } return {error_code_t::OK, ""}; diff --git a/include/cudnn_frontend/graph_interface.h b/include/cudnn_frontend/graph_interface.h index 10b444de..13f40d18 100644 --- a/include/cudnn_frontend/graph_interface.h +++ b/include/cudnn_frontend/graph_interface.h @@ -28,11 +28,16 @@ class Graph : public INode { private: std::unordered_set> tensors; + void + add_to_tensor_map(std::shared_ptr tensor) { + tensors.emplace(tensor); + } + std::shared_ptr output_tensor(std::string const &name) { auto tensor = std::make_shared(); tensor->set_name(name).set_is_virtual(true); - tensors.emplace(tensor); + add_to_tensor_map(tensor); return tensor; } @@ -187,6 +192,9 @@ class Graph : public INode { error_t create_execution_plans(std::vector const &mode); + int64_t + get_execution_plan_count() const; + error_t check_support(cudnnHandle_t h) { for (auto &plan_list : plans) { @@ -200,6 +208,9 @@ class Graph : public INode { BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, bool const do_multithreaded_builds = false); + error_t + build_plan_at_index(cudnnHandle_t const &handle, int64_t index); + Graph & deselect_workspace_greater_than(int64_t const workspace) { for (auto &plan_list : plans) { @@ -210,16 +221,10 @@ class Graph : public INode { Graph & deselect_behavior_notes(std::vector const ¬es) { - std::vector backend_notes; - for (auto ¬e : notes) { - cudnnBackendBehaviorNote_t backend_note; - detail::convert_to_cudnn_type(note, backend_note); - backend_notes.push_back(backend_note); - } for (auto &plan_list : plans) { - auto status = plan_list.filter_out_behavior_notes(backend_notes); + auto status = plan_list.deselect_behavior_notes(notes); if (status.is_bad()) { - getLogger() << "[cudnn_frontend] ERROR: Filtering by behavioural notes failed." << std::endl; + getLogger() << status.get_message() << std::endl; } } return *this; @@ -227,33 +232,83 @@ class Graph : public INode { Graph & deselect_numeric_notes(std::vector const ¬es) { - std::vector backend_notes; - for (auto ¬e : notes) { - cudnnBackendNumericalNote_t backend_note; - detail::convert_to_cudnn_type(note, backend_note); - backend_notes.push_back(backend_note); - } for (auto &plan_list : plans) { - auto status = plan_list.filter_out_numeric_notes(backend_notes); + auto status = plan_list.deselect_numeric_notes(notes); if (status.is_bad()) { - getLogger() << "[cudnn_frontend] ERROR: Filtering by numerical notes failed." << std::endl; + getLogger() << status.get_message() << std::endl; } } return *this; } + using INode::deserialize; + using INode::serialize; + + virtual void + serialize(json &j) const override final { + // Different from serialization of other INodes. + // Go over each subnode and serialize them. + j["nodes"]; + for (auto const &sub_node : sub_nodes) { + json j_sub_node; + sub_node->serialize(j_sub_node); + j["nodes"].push_back(j_sub_node); + } + }; + + // TODO: temparorily placed in graphs class. This function needs to be a free standing function. 
error_t - autotune(cudnnHandle_t handle, - std::unordered_map, void *> variants, - void *workspace, - void *user_impl = nullptr) { - for (auto &plan_list : plans) { - CHECK_CUDNN_FRONTEND_ERROR(plan_list.autotune(handle, variants, workspace, user_impl)); + deserialize(const json &j) { + if (j.contains("nodes") && j["nodes"].is_array()) { + for (const auto &j_sub_node : j["nodes"]) { + if (j_sub_node.contains("tag") && j_sub_node["tag"].is_string()) { + auto tag = j_sub_node["tag"].get(); + if (tag == "CONV_FPROP") { + auto conv_fprop_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(conv_fprop_attributes), detail::Context())); + } else if (tag == "POINTWISE") { + auto pointwise_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(pointwise_attributes), detail::Context())); + } else if (tag == "REDUCTION") { + auto reduction_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(reduction_attributes), detail::Context())); + } else if (tag == "SDPA_FWD") { + auto sdpa_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_attributes), detail::Context())); + } else if (tag == "SDPA_BWD") { + auto sdpa_bwd_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_bwd_attributes), detail::Context())); + } + } + } } + return {error_code_t::OK, ""}; } + + std::string + print(void) const { + std::stringstream ss; + json j = *this; + ss << j.dump(4); + return ss.str(); + } }; +inline int64_t +Graph::get_execution_plan_count() const { + int64_t plan_count = 0; + for (auto &plan_list : plans) { + plan_count += plan_list.execution_plans.size(); + } + return plan_count; +} + inline error_t Graph::create_execution_plans(std::vector const &mode) { std::unordered_map op_graph_to_configs; @@ -276,6 +331,14 @@ Graph::create_execution_plans(std::vector const &mode) { return {error_code_t::OK, ""}; } +inline error_t +Graph::build_plan_at_index(cudnnHandle_t const &handle, int64_t plan_index) { + for (auto i = 0u; i < plans.size(); i++) { + CHECK_CUDNN_FRONTEND_ERROR(plans[i].build_plan_at_index(handle, plan_index)); + } + return {error_code_t::OK, ""}; +} + inline error_t Graph::build_plans(cudnnHandle_t const &handle, BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { for (auto &plan_list : plans) { @@ -305,7 +368,7 @@ Graph::set_compute_data_type(DataType_t const type) { inline std::shared_ptr Graph::tensor(Tensor_attributes const &tensor) { auto tensor_ptr = std::make_shared(tensor); - tensors.emplace(tensor_ptr); + add_to_tensor_map(tensor_ptr); return tensor_ptr; } @@ -320,12 +383,11 @@ Graph::tensor_like(std::shared_ptr const &tensor, std::string // reset the uid of the cloned tensor // uids are not meant to be copied by tensor_like // When lowering to cudnn backend, both tensors involved here will get unique uids. - tensor_ptr->set_uid(0); + tensor_ptr->clear_uid(); // reset the name too. Defaults to empty string. 
tensor_ptr->set_name(name); - tensors.emplace(tensor_ptr); return tensor_ptr; } @@ -755,4 +817,10 @@ Graph::sdpa_backward(std::shared_ptr q, return {dQ, dK, dV}; } +static inline std::ostream & +operator<<(std::ostream &os, Graph const &graph) { + os << graph.print(); + return os; +} + } // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/graph_properties.h b/include/cudnn_frontend/graph_properties.h index 9eba9373..cf323431 100644 --- a/include/cudnn_frontend/graph_properties.h +++ b/include/cudnn_frontend/graph_properties.h @@ -28,6 +28,7 @@ class Tensor_attributes { bool is_pass_by_value = false; TensorReordering_t reordering_type = TensorReordering_t::NONE; int64_t uid = 0; + bool uid_assigned = false; std::shared_ptr ragged_offset; @@ -68,10 +69,9 @@ class Tensor_attributes { stride, is_virtual, is_pass_by_value, - reordering_type - /* uid */ // Not serializing uid is intentional. FE graphs do no need a uid. uid is - // only meant to act as a bridge between backend and frontend tensors. - ) + reordering_type, + uid, + uid_assigned) Tensor_attributes() = default; @@ -167,14 +167,27 @@ class Tensor_attributes { return uid; } + int64_t + has_uid() const { + return uid_assigned; + } + + auto + clear_uid(void) -> Tensor_attributes& { + uid = 0; + uid_assigned = false; + return *this; + } + auto set_uid(int64_t value) -> Tensor_attributes& { - uid = value; + uid = value; + uid_assigned = true; return *this; } auto - set_ragged_offset(std::shared_ptr value) -> Tensor_attributes& { + set_ragged_offset(std::shared_ptr const& value) -> Tensor_attributes& { ragged_offset = value; return *this; } @@ -314,6 +327,45 @@ class Attributes { } return {error_code_t::OK, ""}; } + + error_t + get_prefilled_uids(std::unordered_set& pre_assigned_uids) const { + auto derived = static_cast(this); + + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + for (auto& [name, tensor] : derived->outputs) { + (void)name; + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || + std::is_same_v) { + for (auto& tensor : derived->peer_stats) { + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + } + + return {error_code_t::OK, ""}; + } }; class BN_finalize_attributes : public Attributes { @@ -321,6 +373,7 @@ class BN_finalize_attributes : public Attributes { friend class BatchNormFinalizeNode; friend class Graph; + public: enum class input_names { SUM, SQ_SUM, @@ -332,13 +385,11 @@ class BN_finalize_attributes : public Attributes { PREV_RUNNING_VAR, MOMENTUM }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { EQ_SCALE, EQ_BIAS, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; - std::unordered_map> outputs; - public: NLOHMANN_DEFINE_TYPE_INTRUSIVE(BN_finalize_attributes, name, inputs, outputs) + std::map> outputs; BN_finalize_attributes& set_previous_running_stats(std::shared_ptr& mean, @@ -356,13 +407,12 @@ class 
Genstats_attributes : public Attributes { friend class GenstatsNode; friend class Graph; + public: enum class input_names { X }; - std::unordered_map> inputs; + std::map> inputs; enum class output_names { SUM, SQ_SUM }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Genstats_attributes, name, inputs, outputs) }; @@ -371,27 +421,51 @@ class Conv_fprop_attributes : public Attributes { friend class ConvolutionNode; friend class Graph; - enum class input_names { X, W }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_fprop_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { X, W }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_fprop_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) + + std::vector + get_pre_padding() const { + return pre_padding; + } std::vector - get_padding() const { - return padding; + get_post_padding() const { + return post_padding; } Conv_fprop_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } @@ -423,16 +497,14 @@ class Batchnorm_backward_attributes : public Attributes> inputs; + std::map> inputs; // Only special case where one of the inputs is a vector. 
std::vector> peer_stats; - enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - - public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_backward_attributes, name, inputs, outputs) + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_backward_attributes, name, inputs, peer_stats, outputs) + std::map> outputs; Batchnorm_backward_attributes& set_saved_mean_and_inv_variance(std::shared_ptr mean, @@ -454,14 +526,12 @@ class DBN_weight_attributes : public Attributes { friend class DBNWeightNode; friend class Graph; + public: enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { DSCALE, DBIAS, EQ_BIAS, EQ_SCALE_DY, EQ_SCALE_X }; - std::unordered_map> outputs; - - public: NLOHMANN_DEFINE_TYPE_INTRUSIVE(DBN_weight_attributes, name, inputs, outputs) + std::map> outputs; }; class Conv_dgrad_attributes : public Attributes { @@ -469,27 +539,51 @@ class Conv_dgrad_attributes : public Attributes { friend class DgradNode; friend class Graph; - enum class input_names { DY, W }; - std::unordered_map> inputs; - - enum class output_names { DX }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_dgrad_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { DY, W }; + std::map> inputs; + enum class output_names { DX }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_dgrad_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) + + std::vector + get_pre_padding() const { + return pre_padding; + } std::vector - get_padding() const { - return padding; + get_post_padding() const { + return post_padding; } Conv_dgrad_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } @@ -521,15 +615,13 @@ class Matmul_attributes : public Attributes { friend class MatmulNode; friend class INode; - enum class input_names { A, B, M_override, N_override, K_override }; - std::unordered_map> inputs; - - enum class output_names { C }; - std::unordered_map> outputs; - double padding_value = 0.0; public: + enum class input_names { A, B, M_override, N_override, K_override }; + std::map> inputs; + enum class output_names { C }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Matmul_attributes, name, inputs, outputs) Matmul_attributes& @@ -563,18 +655,16 @@ class Pointwise_attributes : public Attributes { friend class SoftmaxNode; friend class INode; - enum class input_names { IN_0, IN_1, IN_2 }; - std::unordered_map> inputs; - - enum class output_names { OUT_0 }; - std::unordered_map> outputs; - PointwiseMode_t mode = PointwiseMode_t::NOT_SET; std::optional axis; std::optional relu_lower_clip_slope; public: + enum class input_names { IN_0, IN_1, IN_2 }; + std::map> inputs; + enum class output_names { OUT_0 }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Pointwise_attributes, name, inputs, outputs, mode, axis) Pointwise_attributes& @@ -606,13 +696,11 @@ class Instancenorm_backward_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - 
- public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_backward_attributes, name, inputs, outputs) Instancenorm_backward_attributes& @@ -629,13 +717,11 @@ class Layernorm_backward_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_backward_attributes, name, inputs, outputs) Layernorm_backward_attributes& @@ -652,15 +738,13 @@ class Layernorm_attributes : public Attributes { friend class LayerNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, MEAN, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_attributes, name, inputs, outputs, forward_phase) Layernorm_attributes& @@ -681,15 +765,13 @@ class Instancenorm_attributes : public Attributes { friend class InstanceNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, MEAN, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_attributes, name, inputs, outputs, forward_phase) Instancenorm_attributes& @@ -710,15 +792,13 @@ class Batchnorm_attributes : public Attributes { friend class BatchNormNode; friend class Graph; + public: enum class input_names { X, SCALE, BIAS, PREV_RUNNING_MEAN, PREV_RUNNING_VAR, EPSILON, MOMENTUM }; - std::unordered_map> inputs; + std::map> inputs; // Only special case where one of the inputs is a vector. 
std::vector> peer_stats; - enum class output_names { Y, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_attributes, name, inputs, peer_stats, outputs) Batchnorm_attributes& @@ -749,13 +829,11 @@ class Batchnorm_inference_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { Y }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_inference_attributes, name, inputs, outputs) }; @@ -764,15 +842,13 @@ class Reduction_attributes : public Attributes { friend class ReductionNode; friend class INode; - enum class input_names { X }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - std::optional mode; public: + enum class input_names { X }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reduction_attributes, name, inputs, outputs, mode) std::optional @@ -792,12 +868,6 @@ class Rng_attributes : public Attributes { friend class RngNode; friend class INode; - enum class input_names { Seed, Offset }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - RngDistribution_t distribution = RngDistribution_t::NOT_SET; std::vector dim = {}; std::vector stride = {}; @@ -805,6 +875,10 @@ class Rng_attributes : public Attributes { std::optional bernoulli_probability; public: + enum class input_names { Seed, Offset }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rng_attributes, name, inputs, @@ -876,16 +950,14 @@ class Reshape_attributes : public Attributes { friend class ReshapeNode; friend class INode; - enum class input_names { X }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - std::vector dim = {}; std::vector stride = {}; public: + enum class input_names { X }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reshape_attributes, name, inputs, outputs, dim, stride) std::vector @@ -916,15 +988,13 @@ class Rmsnorm_attributes : public Attributes { friend class RMSNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_attributes, name, inputs, outputs, forward_phase) Rmsnorm_attributes& @@ -951,14 +1021,13 @@ class Rmsnorm_backward_attributes : public Attributes> inputs; - - enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; std::optional use_dbias; public: + enum class input_names { DY, X, SCALE, INV_VARIANCE }; + std::map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_backward_attributes, name, inputs, outputs) Rmsnorm_backward_attributes& @@ -1090,6 +1159,14 @@ class SDPA_attributes : public Attributes { friend class SDPANode; friend class Graph; + std::optional is_inference; + bool alibi_mask = false; + bool padding_mask = false; + bool causal_mask = false; + std::optional dropout_probability; + std::optional 
attn_scale_value; + + public: enum class input_names { Q, K, @@ -1103,19 +1180,20 @@ class SDPA_attributes : public Attributes { Dropout_mask, Dropout_scale }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { O, Stats, RNG_DUMP }; - std::unordered_map> outputs; - - std::optional is_inference; - bool alibi_mask = false; - bool padding_mask = false; - bool causal_mask = false; - std::optional dropout_probability; - std::optional attn_scale_value; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_attributes, + name, + inputs, + outputs, + is_inference, + alibi_mask, + padding_mask, + causal_mask, + dropout_probability, + attn_scale_value) - public: SDPA_attributes& set_is_inference(bool const value) { is_inference = value; @@ -1200,6 +1278,14 @@ class SDPA_backward_attributes : public Attributes { friend class SDPABackwardNode; friend class Graph; + bool alibi_mask = false; + bool padding_mask = false; + bool causal_mask = false; + + std::optional dropout_probability; + std::optional attn_scale_value; + + public: enum class input_names { Q, K, @@ -1217,19 +1303,19 @@ class SDPA_backward_attributes : public Attributes { Dropout_scale, Dropout_scale_inv }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { dQ, dK, dV, dBias, RNG_DUMP }; - std::unordered_map> outputs; - - bool alibi_mask = false; - bool padding_mask = false; - bool causal_mask = false; - - std::optional dropout_probability; - std::optional attn_scale_value; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_backward_attributes, + name, + inputs, + outputs, + alibi_mask, + padding_mask, + causal_mask, + dropout_probability, + attn_scale_value) - public: SDPA_backward_attributes& set_attn_scale(std::shared_ptr value) { inputs[SDPA_backward_attributes::input_names::Attn_scale] = value; @@ -1320,16 +1406,16 @@ class Softmax_attributes : public Attributes { friend class SoftmaxNode; friend class INode; - enum class input_names { P }; - std::unordered_map> inputs; - - enum class output_names { S, Stats, M, Zinv }; - std::unordered_map> outputs; - std::optional use_stats; std::optional use_M_Zinv; public: + enum class input_names { P }; + std::map> inputs; + enum class output_names { S, Stats, M, Zinv }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Softmax_attributes, name, inputs, outputs, use_stats, use_M_Zinv) + Softmax_attributes& has_stats(bool const value) { use_stats = value; @@ -1368,10 +1454,10 @@ class SDPA_FP8_attributes : public Attributes { ragged_offset_QKV, ragged_offset_O }; - std::unordered_map> inputs; + std::map> inputs; enum class output_names { O, Stats, M, Zinv, AMax_S, AMax_O }; - std::unordered_map> outputs; + std::map> outputs; std::optional is_inference; bool padding_mask = false; @@ -1456,27 +1542,52 @@ class Conv_wgrad_attributes : public Attributes { friend class WgradNode; friend class Graph; - enum class input_names { DY, X }; - std::unordered_map> inputs; - - enum class output_names { DW }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_wgrad_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { DY, X }; + std::map> inputs; + + enum class output_names { DW }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_wgrad_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) std::vector - 
get_padding() const { - return padding; + get_pre_padding() const { + return pre_padding; + } + + std::vector + get_post_padding() const { + return post_padding; } Conv_wgrad_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } diff --git a/include/cudnn_frontend/node/batchnorm.h b/include/cudnn_frontend/node/batchnorm.h index 69caf4c1..c5c3a50a 100644 --- a/include/cudnn_frontend/node/batchnorm.h +++ b/include/cudnn_frontend/node/batchnorm.h @@ -21,6 +21,11 @@ class BatchNormNode : public INode { return Type::BATCHNORM; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm node " << attributes.name << "..." @@ -107,28 +112,29 @@ class BatchNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } // Special case in BN where peer stats is also an input but is not present in inputs map for (auto const& tensor : attributes.peer_stats) { if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -142,66 +148,72 @@ class BatchNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - std::vector peer_stats; - for (auto const& peer_stat : attributes.peer_stats) { - peer_stats.emplace_back(std::move(*(tensors[peer_stat->get_uid()]))); - } - - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - - batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) - .setNormFwdPhase(NormFwdPhase_t::TRAINING); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Batchnorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Batchnorm_attributes::output_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors[MEAN->second->get_uid()]), - *(tensors[INV_VARIANCE->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors[SCALE->second->get_uid()]), - *(tensors[BIAS->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, - Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, - Batchnorm_attributes::input_names::PREV_RUNNING_VAR); - batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors[PREV_RUNNING_MEAN->second->get_uid()]), - *(tensors[PREV_RUNNING_VAR->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, - Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, - Batchnorm_attributes::output_names::NEXT_RUNNING_VAR); - batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors[NEXT_RUNNING_MEAN->second->get_uid()]), - *(tensors[NEXT_RUNNING_VAR->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Batchnorm_attributes::input_names::EPSILON); - batchnorm_operation_builder.setEpsilonTensor(*(tensors[EPSILON->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, Batchnorm_attributes::input_names::MOMENTUM); - batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors[MOMENTUM->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_attributes::output_names::Y); - batchnorm_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); - - batchnorm_operation_builder.setPeerStatTensor(peer_stats); + std::vector peer_stats; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stats.emplace_back(std::move(*(tensors[peer_stat->get_uid()]))); + } + auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + + batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) + .setNormFwdPhase(NormFwdPhase_t::TRAINING); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Batchnorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Batchnorm_attributes::output_names::INV_VARIANCE); + 
batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors[MEAN->second->get_uid()]), + *(tensors[INV_VARIANCE->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors[SCALE->second->get_uid()]), + *(tensors[BIAS->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + Batchnorm_attributes::input_names::PREV_RUNNING_VAR); + batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors[PREV_RUNNING_MEAN->second->get_uid()]), + *(tensors[PREV_RUNNING_VAR->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + Batchnorm_attributes::output_names::NEXT_RUNNING_VAR); + batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors[NEXT_RUNNING_MEAN->second->get_uid()]), + *(tensors[NEXT_RUNNING_VAR->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Batchnorm_attributes::input_names::EPSILON); + batchnorm_operation_builder.setEpsilonTensor(*(tensors[EPSILON->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, Batchnorm_attributes::input_names::MOMENTUM); + batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors[MOMENTUM->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_attributes::output_names::Y); + batchnorm_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); + + batchnorm_operation_builder.setPeerStatTensor(peer_stats); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -213,6 +225,7 @@ class BatchNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BATCHNORM"})"_json); } }; diff --git a/include/cudnn_frontend/node/batchnorm_inference.h b/include/cudnn_frontend/node/batchnorm_inference.h index 243f6913..3ab531d2 100644 --- a/include/cudnn_frontend/node/batchnorm_inference.h +++ b/include/cudnn_frontend/node/batchnorm_inference.h @@ -75,21 +75,27 @@ class BatchnormInferenceNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchnormInferenceNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -103,39 +109,45 @@ class BatchnormInferenceNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchnormInferenceNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) + .setNormFwdPhase(NormFwdPhase_t::INFERENCE); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_inference_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_inference_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Batchnorm_inference_attributes::input_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_inference_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_inference_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_inference_attributes::output_names::Y); + batchnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) - .setNormFwdPhase(NormFwdPhase_t::INFERENCE); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_inference_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_inference_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Batchnorm_inference_attributes::input_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_inference_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_inference_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_inference_attributes::output_names::Y); - batchnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ 
-147,6 +159,7 @@ class BatchnormInferenceNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BATCHNORM_INFERENCE"})"_json); } }; diff --git a/include/cudnn_frontend/node/bn_finalize.h b/include/cudnn_frontend/node/bn_finalize.h index cb73d0fa..1226d088 100644 --- a/include/cudnn_frontend/node/bn_finalize.h +++ b/include/cudnn_frontend/node/bn_finalize.h @@ -22,6 +22,11 @@ class BatchNormFinalizeNode : public INode { return Type::BN_FINALIZE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -74,21 +79,22 @@ class BatchNormFinalizeNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormFinalizeNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -103,68 +109,73 @@ class BatchNormFinalizeNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormFinalizeNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the batchnorm operation. 
+ auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR); + batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT) + .setBNFinalizeMode(CUDNN_BN_FINALIZE_STATISTICS_TRAINING); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SUM, BN_finalize_attributes::input_names::SUM); + batchnorm_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SQ_SUM, BN_finalize_attributes::input_names::SQ_SUM); + batchnorm_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE, BN_finalize_attributes::output_names::EQ_SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, BN_finalize_attributes::output_names::EQ_BIAS); + batchnorm_operation_builder.setEqScaleAndBias(*(tensors.at(EQ_SCALE->second->get_uid())), + *(tensors.at(EQ_BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, BN_finalize_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, BN_finalize_attributes::output_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, BN_finalize_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, BN_finalize_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + BN_finalize_attributes::input_names::PREV_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + BN_finalize_attributes::input_names::PREV_RUNNING_VAR); + batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors.at(PREV_RUNNING_MEAN->second->get_uid())), + *(tensors.at(PREV_RUNNING_VAR->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + BN_finalize_attributes::output_names::NEXT_RUNNING_VAR); + batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors.at(NEXT_RUNNING_MEAN->second->get_uid())), + *(tensors.at(NEXT_RUNNING_VAR->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, BN_finalize_attributes::input_names::EPSILON); + batchnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, BN_finalize_attributes::input_names::MOMENTUM); + batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors.at(MOMENTUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(ACCUM_COUNT, BN_finalize_attributes::input_names::ACCUM_COUNT); + batchnorm_operation_builder.setAccumCountTensor(*(tensors.at(ACCUM_COUNT->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the batchnorm operation. - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR); - batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT) - .setBNFinalizeMode(CUDNN_BN_FINALIZE_STATISTICS_TRAINING); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SUM, BN_finalize_attributes::input_names::SUM); - batchnorm_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SQ_SUM, BN_finalize_attributes::input_names::SQ_SUM); - batchnorm_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE, BN_finalize_attributes::output_names::EQ_SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, BN_finalize_attributes::output_names::EQ_BIAS); - batchnorm_operation_builder.setEqScaleAndBias(*(tensors.at(EQ_SCALE->second->get_uid())), - *(tensors.at(EQ_BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, BN_finalize_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - BN_finalize_attributes::output_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, BN_finalize_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, BN_finalize_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, - BN_finalize_attributes::input_names::PREV_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, - BN_finalize_attributes::input_names::PREV_RUNNING_VAR); - batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors.at(PREV_RUNNING_MEAN->second->get_uid())), - *(tensors.at(PREV_RUNNING_VAR->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, - BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, - BN_finalize_attributes::output_names::NEXT_RUNNING_VAR); - batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors.at(NEXT_RUNNING_MEAN->second->get_uid())), - *(tensors.at(NEXT_RUNNING_VAR->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, BN_finalize_attributes::input_names::EPSILON); - batchnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, BN_finalize_attributes::input_names::MOMENTUM); - batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors.at(MOMENTUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(ACCUM_COUNT, BN_finalize_attributes::input_names::ACCUM_COUNT); - batchnorm_operation_builder.setAccumCountTensor(*(tensors.at(ACCUM_COUNT->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - 
operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -176,6 +187,7 @@ class BatchNormFinalizeNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BN_FINALIZE"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_dgrad.h b/include/cudnn_frontend/node/conv_dgrad.h index 74295421..a597fa9c 100644 --- a/include/cudnn_frontend/node/conv_dgrad.h +++ b/include/cudnn_frontend/node/conv_dgrad.h @@ -31,6 +31,15 @@ class DgradNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_dgrad_attributes::output_names::DX); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -73,21 +82,27 @@ class DgradNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DgradNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -101,44 +116,50 @@ class DgradNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DgradNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // dgrad descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto dgrad_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the dgrad operation. 
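The convolution descriptors built above now take pre- and post-padding independently instead of reusing one padding vector for both sides. A standalone sketch of building such a descriptor (the function name and the asymmetric padding values are only illustrative; the builder calls mirror the ones used in the diff):

```
#include <cudnn_frontend.h>

#include <cstdint>
#include <vector>

// Build a 2D cross-correlation descriptor with asymmetric padding.
auto
make_conv_descriptor() {
    std::vector<int64_t> stride       = {1, 1};
    std::vector<int64_t> pre_padding  = {1, 1};  // top / left
    std::vector<int64_t> post_padding = {2, 2};  // bottom / right, may differ from pre_padding
    std::vector<int64_t> dilation     = {1, 1};
    int64_t const spatial_dim_count   = static_cast<int64_t>(pre_padding.size());

    return cudnn_frontend::ConvDescBuilder()
        .setComputeType(CUDNN_DATA_FLOAT)
        .setMathMode(CUDNN_CROSS_CORRELATION)
        .setSpatialDimCount(spatial_dim_count)
        .setSpatialStride(spatial_dim_count, stride.data())
        .setPrePadding(spatial_dim_count, pre_padding.data())
        .setPostPadding(spatial_dim_count, post_padding.data())
        .setDilation(spatial_dim_count, dilation.data())
        .build();
}
```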
+ auto&& dgrad_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Conv_dgrad_attributes::output_names::DX); + dgrad_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_dgrad_attributes::input_names::W); + dgrad_operation_builder.setwDesc(*(tensors.at(W->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_dgrad_attributes::input_names::DY); + dgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + dgrad_operation_builder.setcDesc(dgrad_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = dgrad_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // dgrad descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto dgrad_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the dgrad operation. 
- auto&& dgrad_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Conv_dgrad_attributes::output_names::DX); - dgrad_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_dgrad_attributes::input_names::W); - dgrad_operation_builder.setwDesc(*(tensors.at(W->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_dgrad_attributes::input_names::DY); - dgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - dgrad_operation_builder.setcDesc(dgrad_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = dgrad_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -150,6 +171,7 @@ class DgradNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "CONV_DGRAD"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_fprop.h b/include/cudnn_frontend/node/conv_fprop.h index 5b20a9db..35dcc231 100644 --- a/include/cudnn_frontend/node/conv_fprop.h +++ b/include/cudnn_frontend/node/conv_fprop.h @@ -21,6 +21,11 @@ class ConvolutionNode : public INode { return Type::CONVOLUTION; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -31,6 +36,15 @@ class ConvolutionNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_fprop_attributes::output_names::Y); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -54,16 +68,17 @@ class ConvolutionNode : public INode { // Only infer dims and strides if user did not set them if (y_tensor_dim.empty()) { y_tensor_dim.resize(x_tensor_dim.size()); - auto const& padding = attributes.get_padding(); - auto const& stride = attributes.get_stride(); - auto const& dilation = attributes.get_dilation(); + auto const& pre_padding = attributes.get_pre_padding(); + auto const& post_padding = attributes.get_post_padding(); + auto const& stride = attributes.get_stride(); + auto const& dilation = attributes.get_dilation(); // N y_tensor_dim[0] = x_tensor_dim[0]; // PQ for (size_t dim = 2; dim < x_tensor_dim.size(); ++dim) { - y_tensor_dim[dim] = - 1 + (x_tensor_dim[dim] - dilation[dim - 2] * (w_tensor_dim[dim] - 1) - 1 + 2 * padding[dim - 2]) / - stride[dim - 2]; + y_tensor_dim[dim] = 1 + (x_tensor_dim[dim] - dilation[dim - 2] * (w_tensor_dim[dim] - 1) - 1 + + pre_padding[dim - 2] + post_padding[dim - 2]) / + 
stride[dim - 2]; } // K y_tensor_dim[1] = w_tensor_dim[0]; @@ -89,21 +104,22 @@ class ConvolutionNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building ConvolutionNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -118,44 +134,50 @@ class ConvolutionNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ConvolutionNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // convolution descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto convolution_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the convolution operation. + auto&& convolution_operation_builder = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_fprop_attributes::input_names::X); + convolution_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_fprop_attributes::input_names::W); + convolution_operation_builder.setwDesc(*(tensors[W->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Conv_fprop_attributes::output_names::Y); + convolution_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); + + convolution_operation_builder.setcDesc(convolution_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
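The inferred output extent above generalizes the usual convolution shape formula to asymmetric padding: per spatial dimension, `y = 1 + (x - dilation * (w - 1) - 1 + pre_pad + post_pad) / stride`. A self-contained helper with one worked value (the concrete numbers are only an example):

```
#include <cassert>
#include <cstdint>

// Output spatial extent of one convolution dimension with asymmetric padding.
int64_t
conv_output_dim(int64_t x, int64_t w, int64_t pre_pad, int64_t post_pad, int64_t stride, int64_t dilation) {
    return 1 + (x - dilation * (w - 1) - 1 + pre_pad + post_pad) / stride;
}

int
main() {
    // 3x3 filter, stride 2, dilation 1, padding 1 on each side of a 224-wide input:
    // 1 + (224 - 1*(3 - 1) - 1 + 1 + 1) / 2 = 1 + 223/2 = 112 (integer division).
    assert(conv_output_dim(224, 3, 1, 1, 2, 1) == 112);
    return 0;
}
```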
+ auto operation = convolution_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // convolution descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto convolution_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the convolution operation. - auto&& convolution_operation_builder = - cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_fprop_attributes::input_names::X); - convolution_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_fprop_attributes::input_names::W); - convolution_operation_builder.setwDesc(*(tensors[W->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Conv_fprop_attributes::output_names::Y); - convolution_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); - - convolution_operation_builder.setcDesc(convolution_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = convolution_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -167,6 +189,7 @@ class ConvolutionNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "CONV_FPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_wgrad.h b/include/cudnn_frontend/node/conv_wgrad.h index 8a6fb384..575be4bd 100644 --- a/include/cudnn_frontend/node/conv_wgrad.h +++ b/include/cudnn_frontend/node/conv_wgrad.h @@ -31,6 +31,15 @@ class WgradNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_wgrad_attributes::output_names::DW); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -73,21 +82,27 @@ class WgradNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t 
+ create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building WgradNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -101,44 +116,50 @@ class WgradNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building WgradNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // wgrad descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto wgrad_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the wgrad operation. + auto&& wgrad_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_wgrad_attributes::input_names::X); + wgrad_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_wgrad_attributes::input_names::DY); + wgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DW, Conv_wgrad_attributes::output_names::DW); + wgrad_operation_builder.setdwDesc(*(tensors.at(DW->second->get_uid()))); + + wgrad_operation_builder.setcDesc(wgrad_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = wgrad_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // wgrad descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto wgrad_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the wgrad operation. 
- auto&& wgrad_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_wgrad_attributes::input_names::X); - wgrad_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_wgrad_attributes::input_names::DY); - wgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DW, Conv_wgrad_attributes::output_names::DW); - wgrad_operation_builder.setdwDesc(*(tensors.at(DW->second->get_uid()))); - - wgrad_operation_builder.setcDesc(wgrad_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = wgrad_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -150,6 +171,7 @@ class WgradNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "CONV_WGRAD"})"_json); } }; diff --git a/include/cudnn_frontend/node/dbn.h b/include/cudnn_frontend/node/dbn.h index 71e3b6df..b2df03db 100644 --- a/include/cudnn_frontend/node/dbn.h +++ b/include/cudnn_frontend/node/dbn.h @@ -22,6 +22,11 @@ class DBNNode : public INode { return Type::DBN; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -87,28 +92,29 @@ class DBNNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } // Special case in BN where peer stats is also an input but is not present in inputs map for (auto const& tensor : attributes.peer_stats) { if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -123,53 +129,59 @@ class DBNNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - std::vector peer_stats; - for (auto const& peer_stat : attributes.peer_stats) { - peer_stats.emplace_back(std::move(*(tensors.at(peer_stat->get_uid())))); - } + std::vector peer_stats; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stats.emplace_back(std::move(*(tensors.at(peer_stat->get_uid())))); + } - // Create the DBN operation. - auto&& DBN_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + // Create the DBN operation. + auto&& DBN_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - DBN_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM); + DBN_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_backward_attributes::input_names::X); - DBN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_backward_attributes::input_names::X); + DBN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Batchnorm_backward_attributes::input_names::DY); - DBN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Batchnorm_backward_attributes::input_names::DY); + DBN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_backward_attributes::input_names::SCALE); - DBN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_backward_attributes::input_names::SCALE); + DBN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Batchnorm_backward_attributes::input_names::INV_VARIANCE); - DBN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Batchnorm_backward_attributes::input_names::INV_VARIANCE); + DBN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Batchnorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Batchnorm_backward_attributes::output_names::DBIAS); - DBN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Batchnorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Batchnorm_backward_attributes::output_names::DBIAS); + DBN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Batchnorm_backward_attributes::output_names::DX); - DBN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Batchnorm_backward_attributes::output_names::DX); + 
DBN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - DBN_operation_builder.setPeerStatTensor(peer_stats); + DBN_operation_builder.setPeerStatTensor(peer_stats); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DBN_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DBN_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -181,6 +193,7 @@ class DBNNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "DBN"})"_json); } }; diff --git a/include/cudnn_frontend/node/dbn_weight.h b/include/cudnn_frontend/node/dbn_weight.h index e4908025..dda9daee 100644 --- a/include/cudnn_frontend/node/dbn_weight.h +++ b/include/cudnn_frontend/node/dbn_weight.h @@ -79,6 +79,11 @@ class DBNWeightNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t post_validate_node() const override final { // Validate outputs @@ -89,21 +94,22 @@ class DBNWeightNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNWeightNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -118,49 +124,55 @@ class DBNWeightNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNWeightNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the batchnorm operation. 
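Tensor creation in each node now threads through the set of UIDs the user pre-assigned on tensor attributes: `collect_pre_assigned_uids()` gathers them via `get_prefilled_uids(...)`, and `create_cudnn_tensors(...)` receives them as `invalid_uids`, presumably so automatically assigned UIDs never collide with user-chosen ones. A standalone sketch of that allocation rule (the helper name and exact skipping logic are illustrative, not the library's implementation):

```
#include <cstdint>
#include <unordered_set>

// Hand out the next UID that is not already claimed by a pre-assigned tensor.
int64_t
next_free_uid(int64_t& uid, std::unordered_set<int64_t> const& invalid_uids) {
    while (invalid_uids.count(uid) != 0) {
        ++uid;  // skip UIDs the user already claimed
    }
    return uid++;  // assign the current value and advance past it
}
// e.g. with invalid_uids = {1, 2} and uid starting at 1, successive calls yield 3, 4, 5, ...
```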
+ auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR); + + batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_DY, DBN_weight_attributes::output_names::EQ_SCALE_DY); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_X, DBN_weight_attributes::output_names::EQ_SCALE_X); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, DBN_weight_attributes::output_names::EQ_BIAS); + batchnorm_operation_builder.setEqScalesAndBias(*(tensors.at(EQ_SCALE_DY->second->get_uid())), + *(tensors.at(EQ_SCALE_X->second->get_uid())), + *(tensors.at(EQ_BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, DBN_weight_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, DBN_weight_attributes::input_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, DBN_weight_attributes::input_names::SCALE); + batchnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, DBN_weight_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, DBN_weight_attributes::input_names::DY); + batchnorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, DBN_weight_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, DBN_weight_attributes::output_names::DBIAS); + batchnorm_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the batchnorm operation. 
- auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR); - - batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_DY, DBN_weight_attributes::output_names::EQ_SCALE_DY); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_X, DBN_weight_attributes::output_names::EQ_SCALE_X); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, DBN_weight_attributes::output_names::EQ_BIAS); - batchnorm_operation_builder.setEqScalesAndBias(*(tensors.at(EQ_SCALE_DY->second->get_uid())), - *(tensors.at(EQ_SCALE_X->second->get_uid())), - *(tensors.at(EQ_BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, DBN_weight_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, DBN_weight_attributes::input_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, DBN_weight_attributes::input_names::SCALE); - batchnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, DBN_weight_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, DBN_weight_attributes::input_names::DY); - batchnorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, DBN_weight_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, DBN_weight_attributes::output_names::DBIAS); - batchnorm_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -172,6 +184,7 @@ class DBNWeightNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "DBN_WEIGHT"})"_json); } }; diff --git a/include/cudnn_frontend/node/dln.h b/include/cudnn_frontend/node/dln.h index b2f38e0f..9d4ebbb9 100644 --- a/include/cudnn_frontend/node/dln.h +++ b/include/cudnn_frontend/node/dln.h @@ -107,6 +107,11 @@ class DLNNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t post_validate_node() const override final { // Validate outputs @@ -117,26 +122,27 @@ class DLNNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DLNNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } if (epsilon) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(epsilon, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(epsilon, uid, tensors, invalid_uids)); } return {error_code_t::OK, ""}; @@ -150,51 +156,56 @@ class DLNNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DLNNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - // Create the DLN operation. - auto&& DLN_op_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + // Create the DLN operation. + auto&& DLN_op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - DLN_op_builder.setNormalizationMode(NormMode_t::LAYER_NORM); + DLN_op_builder.setNormalizationMode(NormMode_t::LAYER_NORM); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_backward_attributes::input_names::X); - DLN_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_backward_attributes::input_names::X); + DLN_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Layernorm_backward_attributes::input_names::DY); - DLN_op_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Layernorm_backward_attributes::input_names::DY); + DLN_op_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_backward_attributes::input_names::SCALE); - DLN_op_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_backward_attributes::input_names::SCALE); + DLN_op_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Layernorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Layernorm_backward_attributes::input_names::INV_VARIANCE); - DLN_op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Layernorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Layernorm_backward_attributes::input_names::INV_VARIANCE); + DLN_op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Layernorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Layernorm_backward_attributes::output_names::DBIAS); - DLN_op_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Layernorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, 
Layernorm_backward_attributes::output_names::DBIAS); + DLN_op_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); - DLN_op_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); + DLN_op_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - if (epsilon) { - DLN_op_builder.setEpsilonTensor(*(tensors.at(epsilon->get_uid()))); - uids_involved_in_operations.insert(epsilon->get_uid()); - } + if (epsilon) { + DLN_op_builder.setEpsilonTensor(*(tensors.at(epsilon->get_uid()))); + uids_involved_in_operations.insert(epsilon->get_uid()); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DLN_op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DLN_op_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -206,17 +217,14 @@ class DLNNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "LAYER_NORM_BPROP"})"_json); } error_t - pass_by_value_tensors_( - cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void*) const override final { + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { if (epsilon) { // can pass in any dummy value - tensor_to_pass_by_value.emplace(epsilon, 0.0f); + tensor_to_pass_by_value.emplace(epsilon->get_uid(), 0.0f); } return {error_code_t::OK, ""}; } diff --git a/include/cudnn_frontend/node/genstats.h b/include/cudnn_frontend/node/genstats.h index 2cd5f21a..2703dec6 100644 --- a/include/cudnn_frontend/node/genstats.h +++ b/include/cudnn_frontend/node/genstats.h @@ -21,6 +21,11 @@ class GenstatsNode : public INode { return Type::GENSTATS; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -80,21 +85,22 @@ class GenstatsNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building GenstatsNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -108,31 +114,36 @@ class GenstatsNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building GenstatsNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& genstats_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_GEN_STATS_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Genstats_attributes::input_names::X); + genstats_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + genstats_operation_builder.setGenStatsMode(CUDNN_GENSTATS_SUM_SQSUM); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SUM, Genstats_attributes::output_names::SUM); + genstats_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SQ_SUM, Genstats_attributes::output_names::SQ_SUM); + genstats_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = genstats_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto&& genstats_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_GEN_STATS_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Genstats_attributes::input_names::X); - genstats_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - genstats_operation_builder.setGenStatsMode(CUDNN_GENSTATS_SUM_SQSUM); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SUM, Genstats_attributes::output_names::SUM); - genstats_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SQ_SUM, Genstats_attributes::output_names::SQ_SUM); - genstats_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); - auto operation = genstats_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -144,6 +155,7 @@ class GenstatsNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "GENSTATS"})"_json); } }; diff --git a/include/cudnn_frontend/node/instancenorm.h b/include/cudnn_frontend/node/instancenorm.h index 0a3d4c33..c8f1b075 100644 --- a/include/cudnn_frontend/node/instancenorm.h +++ b/include/cudnn_frontend/node/instancenorm.h @@ -103,21 +103,22 @@ class InstanceNormNode : public INode { } error_t - 
create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building InstanceNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -130,44 +131,50 @@ class InstanceNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building InstanceNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto&& op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - auto&& op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + op_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); - op_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); + op_builder.setNormFwdPhase(attributes.forward_phase); - op_builder.setNormFwdPhase(attributes.forward_phase); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_attributes::input_names::X); + op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_attributes::input_names::X); - op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Instancenorm_attributes::input_names::BIAS); + op_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), *(tensors.at(BIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Instancenorm_attributes::input_names::BIAS); - op_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), *(tensors.at(BIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Instancenorm_attributes::input_names::EPSILON); + op_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Instancenorm_attributes::input_names::EPSILON); - op_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Instancenorm_attributes::output_names::Y); + op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Instancenorm_attributes::output_names::Y); - op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Instancenorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Instancenorm_attributes::output_names::INV_VARIANCE); - op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - } + if 
(attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Instancenorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, + Instancenorm_attributes::output_names::INV_VARIANCE); + op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = op_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -179,6 +186,12 @@ class InstanceNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM"})"_json); + } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); } }; @@ -284,21 +297,27 @@ class DINNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DINode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -312,46 +331,52 @@ class DINNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DINode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the DIN operation. 
+ auto&& DIN_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + + DIN_operation_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_backward_attributes::input_names::X); + DIN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Instancenorm_backward_attributes::input_names::DY); + DIN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_backward_attributes::input_names::SCALE); + DIN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Instancenorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Instancenorm_backward_attributes::input_names::INV_VARIANCE); + DIN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Instancenorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Instancenorm_backward_attributes::output_names::DBIAS); + DIN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Instancenorm_backward_attributes::output_names::DX); + DIN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DIN_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the DIN operation. 
- auto&& DIN_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - - DIN_operation_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_backward_attributes::input_names::X); - DIN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Instancenorm_backward_attributes::input_names::DY); - DIN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_backward_attributes::input_names::SCALE); - DIN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Instancenorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Instancenorm_backward_attributes::input_names::INV_VARIANCE); - DIN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Instancenorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Instancenorm_backward_attributes::output_names::DBIAS); - DIN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Instancenorm_backward_attributes::output_names::DX); - DIN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - auto operation = DIN_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -363,6 +388,7 @@ class DINNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM_BPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/layernorm.h b/include/cudnn_frontend/node/layernorm.h index 845ecdd1..27e1ac7b 100644 --- a/include/cudnn_frontend/node/layernorm.h +++ b/include/cudnn_frontend/node/layernorm.h @@ -21,6 +21,11 @@ class LayerNormNode : public INode { return Type::LAYERNORM; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for layernorm node " << attributes.name << "..." @@ -147,21 +152,22 @@ class LayerNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building LayerNormNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -174,43 +180,48 @@ class LayerNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building LayerNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - auto&& layernorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - layernorm_operation_builder.setNormalizationMode(NormMode_t::LAYER_NORM) - .setNormFwdPhase(attributes.forward_phase); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_attributes::input_names::X); - layernorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Layernorm_attributes::input_names::BIAS); - layernorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_attributes::input_names::EPSILON); - layernorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Layernorm_attributes::output_names::Y); - layernorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Layernorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Layernorm_attributes::output_names::INV_VARIANCE); - layernorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - } + auto&& layernorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + layernorm_operation_builder.setNormalizationMode(NormMode_t::LAYER_NORM) + .setNormFwdPhase(attributes.forward_phase); - auto operation = layernorm_operation_builder.build(); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_attributes::input_names::X); + layernorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - operations.push_back(std::make_shared(std::move(operation))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Layernorm_attributes::input_names::BIAS); + layernorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); -#ifndef NV_CUDNN_DISABLE_EXCEPTION + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_attributes::input_names::EPSILON); + layernorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Layernorm_attributes::output_names::Y); + layernorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + + if (attributes.forward_phase == 
NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Layernorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Layernorm_attributes::output_names::INV_VARIANCE); + layernorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = layernorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = layernorm_operation_builder.build(); + operations.push_back(std::make_shared(std::move(operation))); } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -222,6 +233,7 @@ class LayerNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "LAYER_NORM"})"_json); } }; diff --git a/include/cudnn_frontend/node/matmul.h b/include/cudnn_frontend/node/matmul.h index 7c4b57a0..aa9fa959 100644 --- a/include/cudnn_frontend/node/matmul.h +++ b/include/cudnn_frontend/node/matmul.h @@ -21,6 +21,11 @@ class MatmulNode : public INode { return Type::MATMUL; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -86,21 +91,22 @@ class MatmulNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building MatmulNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -115,51 +121,57 @@ class MatmulNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building MatmulNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - // matmul descriptor - auto matmul_descriptor = cudnn_frontend::MatMulDescBuilder() - .setComputeType(attributes.compute_data_type) - .setPaddingValue(attributes.padding_value) - .build(); + // matmul descriptor + auto matmul_descriptor = cudnn_frontend::MatMulDescBuilder() + .setComputeType(attributes.compute_data_type) + .setPaddingValue(attributes.padding_value) + .build(); - auto&& matmul_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_MATMUL_DESCRIPTOR); + auto&& matmul_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_MATMUL_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(A, Matmul_attributes::input_names::A); - matmul_operation_builder.setaMatDesc(*tensors.at(A->second->get_uid())); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(A, Matmul_attributes::input_names::A); + matmul_operation_builder.setaMatDesc(*tensors.at(A->second->get_uid())); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(B, Matmul_attributes::input_names::B); - matmul_operation_builder.setbMatDesc(*tensors.at(B->second->get_uid())); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(B, Matmul_attributes::input_names::B); + matmul_operation_builder.setbMatDesc(*tensors.at(B->second->get_uid())); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(C, Matmul_attributes::output_names::C); - matmul_operation_builder.setcMatDesc(*tensors.at(C->second->get_uid())); - matmul_operation_builder.setmatmulDesc(matmul_descriptor); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(C, Matmul_attributes::output_names::C); + matmul_operation_builder.setcMatDesc(*tensors.at(C->second->get_uid())); + matmul_operation_builder.setmatmulDesc(matmul_descriptor); - auto M_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); - if ((M_override != attributes.inputs.end()) && (M_override->second != nullptr)) { - matmul_operation_builder.setmOverrideDesc(*tensors.at(M_override->second->get_uid())); - } + auto M_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); + if ((M_override != attributes.inputs.end()) && (M_override->second != nullptr)) { + matmul_operation_builder.setmOverrideDesc(*tensors.at(M_override->second->get_uid())); + } - auto N_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); - if ((N_override != attributes.inputs.end()) && (N_override->second != nullptr)) { - matmul_operation_builder.setnOverrideDesc(*tensors.at(N_override->second->get_uid())); - } + auto N_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); + if ((N_override != attributes.inputs.end()) && (N_override->second != nullptr)) { + matmul_operation_builder.setnOverrideDesc(*tensors.at(N_override->second->get_uid())); + } - auto K_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); - if ((K_override != attributes.inputs.end()) && (K_override->second != nullptr)) { - matmul_operation_builder.setkOverrideDesc(*tensors.at(K_override->second->get_uid())); - } + auto K_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); + if ((K_override != attributes.inputs.end()) && (K_override->second != nullptr)) { + matmul_operation_builder.setkOverrideDesc(*tensors.at(K_override->second->get_uid())); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
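The two comments above and the status check just below are one instance of the error-handling pattern this patch applies to every node's create_cudnn_operations(): configure the builder unconditionally, then either check the built descriptor's status (when NV_CUDNN_DISABLE_EXCEPTION is defined) or catch cudnnException and turn it into a returned error. A condensed, standalone sketch of that pattern is shown here for reference; it is illustrative only -- `finalize_operation`, `BuilderT`, and `OperationT` are placeholder names introduced for this sketch, not identifiers from the patch, which inlines this logic with each node's own builder.

```
// Illustrative sketch of the dual build path (not part of the patch).
// Assumes the cudnn_frontend::graph namespace, as in the node headers above.
// BuilderT is any cudnn_frontend operation builder; OperationT is the element
// type stored in the node's `operations` vector.
template <typename BuilderT, typename OperationT>
error_t
finalize_operation(BuilderT& builder, std::vector<std::shared_ptr<OperationT>>& operations) {
#ifdef NV_CUDNN_DISABLE_EXCEPTION
    // Exceptions are compiled out: build() cannot throw, so inspect the
    // descriptor status and convert a failure into a returned error.
    auto operation = builder.build();
    RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS,
                                   error_code_t::CUDNN_BACKEND_API_FAILED,
                                   operation.get_error());
    operations.push_back(std::make_shared<OperationT>(std::move(operation)));
#else
    // Exceptions are enabled: build() may throw cudnnException; catch it and
    // report the failure through the returned error instead of re-throwing.
    try {
        auto operation = builder.build();
        operations.push_back(std::make_shared<OperationT>(std::move(operation)));
    } catch (cudnn_frontend::cudnnException& e) {
        RETURN_CUDNN_FRONTEND_ERROR_IF(
            e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what());
    }
#endif
    return {error_code_t::OK, ""};
}
```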
+ auto operation = matmul_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = matmul_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -171,6 +183,7 @@ class MatmulNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "MATMUL"})"_json); } }; diff --git a/include/cudnn_frontend/node/pointwise.h b/include/cudnn_frontend/node/pointwise.h index b16beb40..861d98f7 100644 --- a/include/cudnn_frontend/node/pointwise.h +++ b/include/cudnn_frontend/node/pointwise.h @@ -21,6 +21,11 @@ class PointwiseNode : public INode { return Type::POINTWISE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -81,21 +86,22 @@ class PointwiseNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building PointwiseNode " << attributes.name << " tensors X:" << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -110,57 +116,63 @@ class PointwiseNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building PointwiseNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto pointwise_descriptor = cudnn_frontend::PointwiseDescBuilder() + .setAxis(attributes.get_axis().value_or(-1)) + .setReluLowerClipSlope(attributes.relu_lower_clip_slope.value_or(0.0)) + .setComputeType(attributes.compute_data_type) + .setMode(attributes.mode) + .build(); + + auto const port_count = get_pointwise_mode_port_count(attributes.mode); - auto pointwise_descriptor = cudnn_frontend::PointwiseDescBuilder() - .setAxis(attributes.get_axis().value_or(-1)) - .setReluLowerClipSlope(attributes.relu_lower_clip_slope.value_or(0.0)) - .setComputeType(attributes.compute_data_type) - .setMode(attributes.mode) - .build(); + auto&& pointwise_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_POINTWISE_DESCRIPTOR); + pointwise_operation_builder.setpwDesc(pointwise_descriptor); - auto const port_count = get_pointwise_mode_port_count(attributes.mode); + if (detail::is_activation_backward_mode(attributes.mode)) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + pointwise_operation_builder.setdyDesc(*(tensors.at(IN_0->second->get_uid()))); - auto&& pointwise_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_POINTWISE_DESCRIPTOR); - pointwise_operation_builder.setpwDesc(pointwise_descriptor); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); + pointwise_operation_builder.setxDesc(*(tensors.at(IN_1->second->get_uid()))); - if (detail::is_activation_backward_mode(attributes.mode)) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); - pointwise_operation_builder.setdyDesc(*(tensors.at(IN_0->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + pointwise_operation_builder.setdxDesc(*(tensors.at(OUT_0->second->get_uid()))); + } else { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + pointwise_operation_builder.setxDesc(*(tensors.at(IN_0->second->get_uid()))); + if (port_count >= 3) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); - pointwise_operation_builder.setxDesc(*(tensors.at(IN_1->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); - pointwise_operation_builder.setdxDesc(*(tensors.at(OUT_0->second->get_uid()))); - } else { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); - pointwise_operation_builder.setxDesc(*(tensors.at(IN_0->second->get_uid()))); - - if (port_count >= 3) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); - pointwise_operation_builder.setbDesc(*(tensors.at(IN_1->second->get_uid()))); - } - - if (port_count >= 4) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_2, Pointwise_attributes::input_names::IN_2); - pointwise_operation_builder.settDesc(*(tensors.at(IN_2->second->get_uid()))); - } - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); - pointwise_operation_builder.setyDesc(*(tensors.at(OUT_0->second->get_uid()))); + pointwise_operation_builder.setbDesc(*(tensors.at(IN_1->second->get_uid()))); } - auto operation = pointwise_operation_builder.build(); + if (port_count >= 4) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_2, Pointwise_attributes::input_names::IN_2); + 
pointwise_operation_builder.settDesc(*(tensors.at(IN_2->second->get_uid()))); + } - operations.push_back(std::make_shared(std::move(operation))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + pointwise_operation_builder.setyDesc(*(tensors.at(OUT_0->second->get_uid()))); + } -#ifndef NV_CUDNN_DISABLE_EXCEPTION +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = pointwise_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = pointwise_operation_builder.build(); + operations.push_back(std::make_shared(std::move(operation))); } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -172,6 +184,7 @@ class PointwiseNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "POINTWISE"})"_json); } }; diff --git a/include/cudnn_frontend/node/reduction.h b/include/cudnn_frontend/node/reduction.h index c11bc863..45bd3b1f 100644 --- a/include/cudnn_frontend/node/reduction.h +++ b/include/cudnn_frontend/node/reduction.h @@ -70,21 +70,27 @@ class ReductionNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building ReductionNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -98,33 +104,39 @@ class ReductionNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ReductionNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto reduction_descriptor = cudnn_frontend::ReductionDescBuilder() + .setComputeType(attributes.compute_data_type) + .setReductionOp(attributes.get_mode().value()) + .build(); + + auto&& reduction_operation_builder = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reduction_attributes::input_names::X); + reduction_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reduction_attributes::output_names::Y); + reduction_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + + reduction_operation_builder.setreductionDesc(reduction_descriptor); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = reduction_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto reduction_descriptor = cudnn_frontend::ReductionDescBuilder() - .setComputeType(attributes.compute_data_type) - .setReductionOp(attributes.get_mode().value()) - .build(); - - auto&& reduction_operation_builder = - cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reduction_attributes::input_names::X); - reduction_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reduction_attributes::output_names::Y); - reduction_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - reduction_operation_builder.setreductionDesc(reduction_descriptor); - auto operation = reduction_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -136,6 +148,7 @@ class ReductionNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "REDUCTION"})"_json); } }; diff --git a/include/cudnn_frontend/node/reshape.h b/include/cudnn_frontend/node/reshape.h index ca6a9fce..f9d9686f 100644 --- a/include/cudnn_frontend/node/reshape.h +++ b/include/cudnn_frontend/node/reshape.h @@ -19,6 +19,11 @@ class ReshapeNode : public INode { return Type::RESHAPE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -82,21 +87,22 @@ class ReshapeNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building Reshape tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -110,28 +116,35 @@ class ReshapeNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ReshapeNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - auto&& reshape_op_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR); + auto&& reshape_op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reshape_attributes::input_names::X); - reshape_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reshape_attributes::input_names::X); + reshape_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reshape_attributes::output_names::Y); - reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reshape_attributes::output_names::Y); + reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - auto operation = reshape_op_builder.build(); + reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = reshape_op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = reshape_op_builder.build(); operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -140,6 +153,7 @@ class ReshapeNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RESHAPE"})"_json); } }; diff --git a/include/cudnn_frontend/node/rmsnorm.h b/include/cudnn_frontend/node/rmsnorm.h index 13380c0c..23cc23ad 100644 --- a/include/cudnn_frontend/node/rmsnorm.h +++ b/include/cudnn_frontend/node/rmsnorm.h @@ -88,25 +88,27 @@ class RMSNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building RMSNormNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; } + error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, @@ -115,46 +117,49 @@ class RMSNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building RMSNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - auto&& rmsnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + auto&& rmsnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - rmsnorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM) - .setNormFwdPhase(attributes.forward_phase); + rmsnorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM).setNormFwdPhase(attributes.forward_phase); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_attributes::input_names::X); - rmsnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_attributes::input_names::X); + rmsnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_attributes::input_names::SCALE); - rmsnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_attributes::input_names::SCALE); + rmsnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Rmsnorm_attributes::input_names::EPSILON); - rmsnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Rmsnorm_attributes::input_names::EPSILON); + rmsnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rmsnorm_attributes::output_names::Y); - rmsnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rmsnorm_attributes::output_names::Y); + rmsnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Rmsnorm_attributes::output_names::INV_VARIANCE); - rmsnorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); - } - - auto BIAS = attributes.inputs.find(Rmsnorm_attributes::input_names::BIAS); - if ((BIAS != attributes.inputs.end()) && (BIAS->second != nullptr)) { - rmsnorm_operation_builder.setBias(*(tensors.at(BIAS->second->get_uid()))); - } + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Rmsnorm_attributes::output_names::INV_VARIANCE); + rmsnorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); + } + auto BIAS = attributes.inputs.find(Rmsnorm_attributes::input_names::BIAS); + if ((BIAS != 
attributes.inputs.end()) && (BIAS->second != nullptr)) { + rmsnorm_operation_builder.setBias(*(tensors.at(BIAS->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = rmsnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = rmsnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -166,6 +171,12 @@ class RMSNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RMS_NORM"})"_json); + } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); } }; @@ -195,6 +206,11 @@ class DRMSNormNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DRMSNorm node " << attributes.name << "..." @@ -271,21 +287,22 @@ class DRMSNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DRMSNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -300,49 +317,52 @@ class DRMSNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DRMSNormNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto&& DRMSNorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - auto&& DRMSNorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + DRMSNorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM); - DRMSNorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_backward_attributes::input_names::X); + DRMSNorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_backward_attributes::input_names::X); - DRMSNorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Rmsnorm_backward_attributes::input_names::DY); + DRMSNorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Rmsnorm_backward_attributes::input_names::DY); - DRMSNorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_backward_attributes::input_names::SCALE); + DRMSNorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_backward_attributes::input_names::SCALE); - DRMSNorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, Rmsnorm_backward_attributes::input_names::INV_VARIANCE); + DRMSNorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Rmsnorm_backward_attributes::input_names::INV_VARIANCE); - DRMSNorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Rmsnorm_backward_attributes::output_names::DSCALE); + DRMSNorm_operation_builder.setDScale(*(tensors.at(DSCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Rmsnorm_backward_attributes::output_names::DSCALE); - DRMSNorm_operation_builder.setDScale(*(tensors.at(DSCALE->second->get_uid()))); - - if (attributes.use_dbias.value()) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Rmsnorm_backward_attributes::output_names::DBIAS); - DRMSNorm_operation_builder.setDBias(*(tensors.at(DBIAS->second->get_uid()))); - } - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Rmsnorm_backward_attributes::output_names::DX); - DRMSNorm_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + if (attributes.use_dbias.value()) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Rmsnorm_backward_attributes::output_names::DBIAS); + DRMSNorm_operation_builder.setDBias(*(tensors.at(DBIAS->second->get_uid()))); + } + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Rmsnorm_backward_attributes::output_names::DX); + DRMSNorm_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = DRMSNorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DRMSNorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -351,6 +371,7 @@ class DRMSNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RMS_NORM_BPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/rng.h b/include/cudnn_frontend/node/rng.h index 4e4993a6..92939251 100644 --- a/include/cudnn_frontend/node/rng.h +++ b/include/cudnn_frontend/node/rng.h @@ -36,21 +36,27 @@ class RngNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building RngNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -108,46 +114,51 @@ class RngNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building RngNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.get_distribution() != RngDistribution_t::BERNOULLI, - error_code_t::ATTRIBUTE_NOT_SET, - "no other distribution except bernoulli supported."); + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.get_distribution() != RngDistribution_t::BERNOULLI, + error_code_t::ATTRIBUTE_NOT_SET, + "no other distribution except bernoulli supported."); - auto rng_descriptor = cudnn_frontend::RngDescBuilder() - .setRngDistribution(attributes.get_distribution()) - .setBernoulliDistProbability(attributes.get_bernoulli_probability().value()) - .build(); + auto rng_descriptor = cudnn_frontend::RngDescBuilder() + .setRngDistribution(attributes.get_distribution()) + .setBernoulliDistProbability(attributes.get_bernoulli_probability().value()) + .build(); - auto&& Rng_operation_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RNG_DESCRIPTOR); + auto&& Rng_operation_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RNG_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rng_attributes::output_names::Y); - Rng_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rng_attributes::output_names::Y); + Rng_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - Rng_operation_builder.setRngDesc(rng_descriptor); + Rng_operation_builder.setRngDesc(rng_descriptor); - if (attributes.seed.has_value()) { - Rng_operation_builder.setSeed(attributes.get_seed().value()); - } else { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Seed, Rng_attributes::input_names::Seed); - Rng_operation_builder.setSeedDesc(*(tensors.at(Seed->second->get_uid()))); + if (attributes.seed.has_value()) { + Rng_operation_builder.setSeed(attributes.get_seed().value()); + } else { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Seed, Rng_attributes::input_names::Seed); + Rng_operation_builder.setSeedDesc(*(tensors.at(Seed->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Offset, Rng_attributes::input_names::Offset); - Rng_operation_builder.setOffsetDesc(*(tensors.at(Offset->second->get_uid()))); - } + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Offset, Rng_attributes::input_names::Offset); + Rng_operation_builder.setOffsetDesc(*(tensors.at(Offset->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = Rng_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = Rng_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -156,6 +167,7 @@ class RngNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RNG"})"_json); } }; diff --git a/include/cudnn_frontend/node/scaled_dot_product_attention.h b/include/cudnn_frontend/node/scaled_dot_product_attention.h index 296472b5..46f73417 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_attention.h @@ -326,16 +326,12 @@ class ScaledDotProductAttentionNode : public INode { } virtual error_t - pass_by_value_tensors_( - cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void*) const override final { + pass_by_value_tensors_(std::map& tensor_to_pass_by_value) const override final { half dropout_scale_value = options.dropout_scale; - tensor_to_pass_by_value.emplace(options.inputs.Dropout_scale, dropout_scale_value); + tensor_to_pass_by_value.emplace(options.inputs.Dropout_scale->get_uid(), dropout_scale_value); float negative_inf_value = std::numeric_limits::min(); - tensor_to_pass_by_value.emplace(negative_inf, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf->get_uid(), negative_inf_value); return {error_code_t::OK, ""}; } diff --git a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h index b0629514..240eda36 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h @@ -37,6 +37,11 @@ class SDPANode : public INode { return Type::COMPOSITE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -81,6 +86,7 @@ class SDPANode : public INode { #undef CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE // validate backend limitations for the operation + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; int64_t h_v = attributes.inputs.at(input_names::V)->get_dim()[1]; @@ -121,9 +127,10 @@ class SDPANode : public INode { bool const has_dropout_mask = (dropout_mask != attributes.inputs.end()) && (dropout_mask->second != nullptr); bool const has_dropout = attributes.dropout_probability.has_value() || has_dropout_mask; - RETURN_CUDNN_FRONTEND_ERROR_IF(has_dropout, - 
error_code_t::GRAPH_NOT_SUPPORTED, - "s_kv not a multiple of 64 is not supported with cudnn version below 9.0.0"); + RETURN_CUDNN_FRONTEND_ERROR_IF( + has_dropout, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_kv not a multiple of 64 with dropout enabled is not supported with cudnn version below 9.0.0"); } if (((s_kv % 64 != 0) || (d_qk % 64 != 0)) && (cudnnGetVersion() <= 8905)) { @@ -163,6 +170,14 @@ class SDPANode : public INode { error_code_t::ATTRIBUTE_NOT_SET, "Intermediate tensor data type needs to be set as internal tensors require it."); + if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && + (cudnnGetVersion() < 90000)) { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " + "with cudnn version below 9.0.0"); + } + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -530,56 +545,58 @@ class SDPANode : public INode { } virtual error_t - pass_by_value_tensors_( - cudnnHandle_t handle, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* node_workspace) const override final { + workspace_modifications_tensors_( + std::unordered_map>>& workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_abili_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { if (attributes.dropout_probability.has_value() && attributes.dropout_probability.value() != 0.0) { #if CUDNN_VERSION < 8903 half dropout_scale_value = __float2half(1.0f / (1.0f - attributes.dropout_probability.value())); #else float dropout_scale_value = (1.0f / (1.0f - attributes.dropout_probability.value())); #endif - tensor_to_pass_by_value.emplace(dropout_scale, dropout_scale_value); + tensor_to_pass_by_value.emplace(dropout_scale->get_uid(), dropout_scale_value); } if (negative_inf_padding) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); } if (WAR_scalar_max_seq_kv) { auto const& v_dim = attributes.inputs.at(input_names::V)->get_dim(); int32_t s_kv = static_cast(v_dim[2]); - tensor_to_pass_by_value.emplace(WAR_scalar_max_seq_kv, s_kv); + tensor_to_pass_by_value.emplace(WAR_scalar_max_seq_kv->get_uid(), s_kv); } if (negative_inf_causal) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal, negative_inf_value); - } - - if (attributes.alibi_mask) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); - int64_t const h = Q->second->get_dim()[1]; - auto h_alibi_slopes_vector = detail::get_abili_slope(h); - int64_t alibi_slopes_size = h * sizeof(float); - - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - node_workspace, h_alibi_slopes_vector.data(), alibi_slopes_size, cudaMemcpyHostToDevice, stream)); - tensor_to_pass_by_value.emplace(alibi_slopes, node_workspace); + 
tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); } if (attributes.attn_scale_value.has_value()) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second, attributes.attn_scale_value.value()); + tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); } return {error_code_t::OK, ""}; } + + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_FWD"})"_json); + } }; class SDPABackwardNode : public INode { @@ -657,10 +674,18 @@ class SDPABackwardNode : public INode { // validate backend limitations for the operation int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; int64_t h_v = attributes.inputs.at(input_names::V)->get_dim()[1]; int64_t d_qk = attributes.inputs.at(input_names::Q)->get_dim()[3]; + int64_t s_kv = attributes.inputs.at(input_names::V)->get_dim()[2]; int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + (s_q < 64) && cudnnGetVersion() < 90000, + error_code_t::GRAPH_NOT_SUPPORTED, + "Sequence length must be greater than or equal to 64 for cudnn version prior to v9.0.0"); + RETURN_CUDNN_FRONTEND_ERROR_IF((h_q % h_k != 0) || (h_q % h_v != 0), error_code_t::GRAPH_NOT_SUPPORTED, "For group-query attention, number of heads for key and query must be a factor " @@ -678,8 +703,9 @@ class SDPABackwardNode : public INode { "attn_scale with tensor and value cannot be set at the same time."); // validate options for bias mask - auto bias_mask = attributes.inputs.find(input_names::Bias); - if (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr) { + auto bias_mask = attributes.inputs.find(input_names::Bias); + bool const has_bias = (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr); + if (has_bias) { auto bias_mask_dtype = bias_mask->second->get_data_type(); RETURN_CUDNN_FRONTEND_ERROR_IF((bias_mask_dtype == DataType_t::BOOLEAN), error_code_t::GRAPH_NOT_SUPPORTED, @@ -716,6 +742,14 @@ class SDPABackwardNode : public INode { error_code_t::ATTRIBUTE_NOT_SET, "Intermediate tensor data type needs to be set as internal tensors require it."); + if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && + (cudnnGetVersion() < 90000)) { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " + "with cudnn version below 9.0.0"); + } + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -729,6 +763,11 @@ class SDPABackwardNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for SDPABackwardNode " << attributes.name @@ -848,7 +887,7 @@ class SDPABackwardNode : public INode { struct cudaDeviceProp prop; CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0)); - if (cudnnGetVersion() >= 8905 && prop.major >= 9) { + if ((cudnnGetVersion() >= 8905 && prop.major >= 9) || (cudnnGetVersion() >= 9000)) { // default upper limit for 
workspace 256MB int64_t max_dp_workspace_bytes = 256 * 1024 * 1024; @@ -1089,7 +1128,7 @@ class SDPABackwardNode : public INode { attributes.inputs[input_names::Stats], Pointwise_attributes().set_name("sub_s_m").set_mode(PointwiseMode_t::SUB)); - // WAR: Explicitly putting the padding value again after the stats have been loaded + // WAR for bug 4475073 by explicitly putting the padding value again after the stats have been loaded if (attributes.padding_mask && cudnnGetVersion() >= 90000) { auto row_idx_output = pointwise(last_output, Pointwise_attributes() @@ -1163,7 +1202,7 @@ class SDPABackwardNode : public INode { // as reshape + matmul last_output = reshape(last_output, Reshape_attributes().set_name("reshape_p")); last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); if (h_q == h_v) { // for MHA @@ -1183,7 +1222,7 @@ class SDPABackwardNode : public INode { .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); last_output->set_dim({b, h_q, s_kv, d_v}).set_stride({h_q * s_kv * d_v, s_kv * d_v, d_v, 1}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); reduction(last_output, Reduction_attributes().set_name("red_dV_head").set_mode(ReductionMode_t::ADD), attributes.outputs[output_names::dV]); @@ -1197,7 +1236,7 @@ class SDPABackwardNode : public INode { Matmul_attributes() .set_name("matmul_dO_VT") .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) - .set_k_override(attributes.inputs[input_names::SEQ_LEN_KV])); + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV])); last_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); // last_output = last_output(dP) * mask @@ -1243,7 +1282,7 @@ class SDPABackwardNode : public INode { // as reshape + matmul last_output = reshape(last_output, Reshape_attributes().set_name("reshape_dS")); last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); if (h_q == h_k) { // for MHA @@ -1263,7 +1302,7 @@ class SDPABackwardNode : public INode { .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); last_output->set_dim({b, h_q, s_kv, d_qk}).set_stride({h_q * s_kv * d_qk, s_kv * d_qk, d_qk, 1}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); reduction(last_output, Reduction_attributes().set_name("red_dK_head").set_mode(ReductionMode_t::ADD), attributes.outputs[output_names::dK]); @@ -1280,6 +1319,10 @@ class SDPABackwardNode : public INode { last_output->set_dim({kt_dim[0], kt_dim[1], kt_dim[3], kt_dim[2]}) .set_stride({kt_stride[0], kt_stride[1], kt_stride[3], kt_stride[2]}); + if (attributes.inputs[input_names::K]->get_ragged_offset() != nullptr) { + last_output->set_ragged_offset(attributes.inputs[input_names::K]->get_ragged_offset()); + } + matmul(dS_output, last_output, Matmul_attributes() @@ -1316,45 +1359,55 @@ class SDPABackwardNode : public INode { return alibi_slopes_size_padded + dQ_accum_size + softmax_sum_size; } + virtual error_t + 
workspace_modifications_tensors_( + std::unordered_map>>& workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_abili_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + int64_t alibi_slopes_size_padded = (alibi_slopes_size + 15) & ~15; + offset = offset + alibi_slopes_size_padded; + } + + if (dQ_accum && !dQ_accum->get_is_virtual()) { + std::vector f_vec = {(float)dQ_accum_size}; + workspace_modifications.emplace(dQ_accum->get_uid(), std::make_tuple(1, offset, f_vec)); + offset = offset + dQ_accum_size; + } + + if (softmax_sum && !softmax_sum->get_is_virtual()) { + // There is no requirement for softmax_sum to be memset to 0 + std::vector f_vec = {}; + workspace_modifications.emplace(softmax_sum->get_uid(), std::make_tuple(2, offset, f_vec)); + } + + return {error_code_t::OK, ""}; + } + error_t - pass_by_value_tensors_( - cudnnHandle_t handle, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* node_workspace) const override final { + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { using input_names = SDPA_backward_attributes::input_names; if (one_tensor) { - tensor_to_pass_by_value.emplace(one_tensor, 1.0f); + tensor_to_pass_by_value.emplace(one_tensor->get_uid(), 1.0f); } if (attributes.attn_scale_value.has_value()) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second, attributes.attn_scale_value.value()); - } - - if (attributes.alibi_mask) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); - int64_t const h_q = Q->second->get_dim()[1]; - auto alibi_slopes_vec = detail::get_abili_slope(h_q); - int64_t alibi_slopes_size_padded = (alibi_slopes_size + 15) & ~15; - - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - node_workspace, alibi_slopes_vec.data(), alibi_slopes_size, cudaMemcpyHostToDevice, stream)); - tensor_to_pass_by_value.emplace(alibi_slopes, node_workspace); - node_workspace = static_cast(node_workspace) + alibi_slopes_size_padded; + tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); } if (attributes.padding_mask) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); } if (attributes.causal_mask) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); } if (attributes.dropout_probability.has_value()) { @@ -1362,27 +1415,20 @@ class SDPABackwardNode : public INode { float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale, input_names::Dropout_scale); - tensor_to_pass_by_value.emplace(Dropout_scale->second, dropout_scale_value); + tensor_to_pass_by_value.emplace(Dropout_scale->second->get_uid(), dropout_scale_value); CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale_inv, 
input_names::Dropout_scale_inv); - tensor_to_pass_by_value.emplace(Dropout_scale_inv->second, dropout_scale_inv_value); - } - - if (dQ_accum && !dQ_accum->get_is_virtual()) { - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemsetAsync(node_workspace, 0, dQ_accum_size, stream)); - tensor_to_pass_by_value.emplace(dQ_accum, node_workspace); - node_workspace = static_cast(node_workspace) + dQ_accum_size; - } - - if (softmax_sum && !softmax_sum->get_is_virtual()) { - // There is no requirement for softmax_sum to be memset to 0 - tensor_to_pass_by_value.emplace(softmax_sum, node_workspace); + tensor_to_pass_by_value.emplace(Dropout_scale_inv->second->get_uid(), dropout_scale_inv_value); } return {error_code_t::OK, ""}; } + + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_BWD"})"_json); + } }; -} // namespace cudnn_frontend::graph +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/node/softmax.h b/include/cudnn_frontend/node/softmax.h index f821a7bd..dbd4963f 100644 --- a/include/cudnn_frontend/node/softmax.h +++ b/include/cudnn_frontend/node/softmax.h @@ -122,5 +122,15 @@ class SoftmaxNode : public INode { return {error_code_t::OK, ""}; } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + virtual void + serialize(json& j) const override final { + j = attributes; + } }; } // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/node_interface.h b/include/cudnn_frontend/node_interface.h index bd63a020..5ec2dfb4 100644 --- a/include/cudnn_frontend/node_interface.h +++ b/include/cudnn_frontend/node_interface.h @@ -38,6 +38,10 @@ class INode : public ICudnn { detail::Context context; private: + std::unordered_map deserialized_pass_by_value; + std::unordered_map>> deserialized_workspace_modifications; + int64_t fe_workspace_size = 0; + std::shared_ptr output_tensor(std::string const& name) { auto tensor = std::make_shared(); @@ -56,15 +60,21 @@ class INode : public ICudnn { virtual int64_t get_fe_workspace_size_node() const { - // Mostly no FE nodes have require workspace - return 0; + // Mostly no FE nodes have require workspace initiailized to 0 + return fe_workspace_size; } int64_t - get_cudnn_workspace_size() const { - int64_t cudnn_workspace_size = get_cudnn_workspace_size_node(); + get_cudnn_workspace_size(int64_t plan_index = -1) const { + int64_t cudnn_workspace_size = 0; + + auto status = get_cudnn_workspace_size_node(plan_index, cudnn_workspace_size); + if (status.is_bad()) { + getLogger() << "[cudnn_frontend] ERROR: Querying workspace failed." 
<< std::endl; + } + for (auto const& sub_node : sub_nodes) { - cudnn_workspace_size = std::max(cudnn_workspace_size, sub_node->get_cudnn_workspace_size()); + cudnn_workspace_size = std::max(cudnn_workspace_size, sub_node->get_cudnn_workspace_size(plan_index)); } return cudnn_workspace_size; } @@ -88,27 +98,79 @@ class INode : public ICudnn { } virtual error_t - pass_by_value_tensors_(cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>&, - void*) const { + pass_by_value_tensors_(std::unordered_map& pass_by_values) const { + for (auto [uid, value] : deserialized_pass_by_value) { + pass_by_values.emplace(uid, value); + } return {error_code_t::OK, ""}; } error_t - gather_pass_by_value_tensors( - cudnnHandle_t const& handle, - std::unordered_map, void*> const& tensor_to_pointer_map, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* fe_workspace) const { - void* node_workspace = fe_workspace; - CHECK_CUDNN_FRONTEND_ERROR( - pass_by_value_tensors_(handle, tensor_to_pointer_map, tensor_to_pass_by_value, node_workspace)); - node_workspace = static_cast(node_workspace) + get_fe_workspace_size_node(); + run_auxiliary_kernels( + cudnnHandle_t handle, + void* fe_workspace, + std::unordered_map>>& workspace_modifications) const { + cudaStream_t stream; + CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); + char* workspace = static_cast(fe_workspace); + + for (auto [uid, data] : workspace_modifications) { + (void)uid; + if (std::get<0>(data) == 0) { + auto& vec_data = std::get<2>(data); + CHECK_CUDA_ERROR(cudaMemcpyAsync(workspace + std::get<1>(data), + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream)); + } else if (std::get<0>(data) == 1) { + int64_t memset_size = (int64_t)std::get<2>(data)[0]; + CHECK_CUDA_ERROR(cudaMemsetAsync(workspace + std::get<1>(data), 0, memset_size, stream)); + } + } + return {error_code_t::OK, ""}; + } + + error_t + gather_pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const { + CHECK_CUDNN_FRONTEND_ERROR(pass_by_value_tensors_(tensor_to_pass_by_value)); for (auto const& sub_node : sub_nodes) { - CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_pass_by_value_tensors( - handle, tensor_to_pointer_map, tensor_to_pass_by_value, node_workspace)); - node_workspace = static_cast(node_workspace) + sub_node->get_fe_workspace_size_node(); + CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + workspace_modifications_tensors_( + std::unordered_map>>& worskspace_modifications, + int64_t&) const { + for (auto [uid, value] : deserialized_workspace_modifications) { + worskspace_modifications.emplace(uid, value); + } + return {error_code_t::OK, ""}; + } + + error_t + gather_workspace_modifications( + std::unordered_map>>& worskspace_modifications, + int64_t& offset) const { + CHECK_CUDNN_FRONTEND_ERROR(workspace_modifications_tensors_(worskspace_modifications, offset)); + offset = get_fe_workspace_size_node(); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_workspace_modifications(worskspace_modifications, offset)); + offset += sub_node->get_fe_workspace_size_node(); + } + return {error_code_t::OK, ""}; + } + + error_t + extend_tensor_map_with_workspace_tensors_( + std::unordered_map& tensor_to_pointer_map, + void* workspace, + std::unordered_map>> const& worskspace_modifications) + const { + for (auto const& [uid, data] : 
worskspace_modifications) { + tensor_to_pointer_map.emplace(uid, static_cast(workspace) + std::get<1>(data)); } return {error_code_t::OK, ""}; } @@ -116,16 +178,16 @@ class INode : public ICudnn { error_t extend_tensor_map_with_pass_by_value_tensors_( std::unordered_map& tensor_to_pointer_map, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value) const { - for (auto& [tensor, value] : tensor_to_pass_by_value) { + std::unordered_map& tensor_to_pass_by_value) const { + for (auto& [uid, value] : tensor_to_pass_by_value) { if (half* half_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), half_value_ptr); + tensor_to_pointer_map.emplace(uid, half_value_ptr); } else if (int32_t* int32_t_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), int32_t_value_ptr); + tensor_to_pointer_map.emplace(uid, int32_t_value_ptr); } else if (float* float_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), float_value_ptr); + tensor_to_pointer_map.emplace(uid, float_value_ptr); } else if (void** void_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), *void_value_ptr); + tensor_to_pointer_map.emplace(uid, *void_value_ptr); } else { RETURN_CUDNN_FRONTEND_ERROR_IF( true, error_code_t::INVALID_VARIANT_PACK, "Unexpected type for pass by value tensor."); @@ -242,11 +304,11 @@ class INode : public ICudnn { // Creates cudnn tensors for each node (and its sub nodes) virtual error_t - create_cudnn_tensors( - int64_t& uid, - std::unordered_map>& uid_to_backend_tensors) const { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& uid_to_backend_tensors, + std::unordered_set const& invalid_uids) const { for (auto const& sub_node : sub_nodes) { - CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid, uid_to_backend_tensors)); + CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid, uid_to_backend_tensors, invalid_uids)); } return {error_code_t::OK, ""}; } @@ -265,6 +327,14 @@ class INode : public ICudnn { return {error_code_t::OK, ""}; } + virtual error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const { + for (auto const& sub_node : sub_nodes) { + auto x = sub_node->collect_pre_assigned_uids(pre_assigned_uids); + } + return {error_code_t::OK, ""}; + } + // An implicitly topological-sorted vector of sub nodes. // The sorted order is a side effect of functional API. std::vector> sub_nodes; @@ -318,8 +388,14 @@ class INode : public ICudnn { // TODO: Maybe just use uid_to_tensors size as uid each time? int64_t uid = 1; + std::unordered_set pre_assigned_uids; + CHECK_CUDNN_FRONTEND_ERROR(collect_pre_assigned_uids(pre_assigned_uids)); + while (pre_assigned_uids.find(uid) != pre_assigned_uids.end()) { + uid++; + } + // Lower each sub node to cudnn backend. - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid, uid_to_tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid, uid_to_tensors, pre_assigned_uids)); // INode needs to keep track of all uids that an operation graph uses. // This is because cudnn backend will not accept extra tensors in variant pack. 
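The hunks above reserve user-pinned tensor UIDs before auto-assignment: the running counter is advanced past anything in `pre_assigned_uids`, and `create_cudnn_tensors` is told which values are off-limits. As a rough standalone illustration of that allocation idea (the helper and its names are invented here, not part of the library):

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

// Hypothetical helper: return the next UID that is not already reserved.
static int64_t next_free_uid(int64_t uid, const std::unordered_set<int64_t>& reserved) {
    while (reserved.count(uid) != 0) {
        ++uid;
    }
    return uid;
}

int main() {
    // Suppose the user pre-assigned UIDs 1, 2 and 5 to some tensors.
    std::unordered_set<int64_t> reserved = {1, 2, 5};

    // Auto-assign UIDs to three more tensors, skipping the reserved values.
    std::vector<int64_t> assigned;
    int64_t uid = 1;
    for (int i = 0; i < 3; ++i) {
        uid = next_free_uid(uid, reserved);
        assigned.push_back(uid);
        ++uid;
    }

    for (int64_t u : assigned) {
        std::cout << u << " ";  // prints: 3 4 6
    }
    std::cout << "\n";
    return 0;
}
```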
@@ -344,6 +420,14 @@ class INode : public ICudnn { return get_fe_workspace_size() + get_cudnn_workspace_size(); } + int64_t + get_workspace_size_plan_at_index(int64_t plan_index) const { + // There are two workspaces: + // - cudnn execution plan workspace + // - FE node workspace (example: alibiSlope for fmha) + return get_fe_workspace_size() + get_cudnn_workspace_size(plan_index); + } + int64_t get_autotune_workspace_size() const { // There are two workspaces: @@ -352,56 +436,249 @@ class INode : public ICudnn { return get_fe_workspace_size() + get_max_cudnn_workspace_size(); } + error_t + autotune(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace, + void* user_impl = nullptr) { + // Add pass_by_value data pointers to tensor_uid_to_pointer map + // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during + // execute. + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + for (auto& plan_list : plans) { + CHECK_CUDNN_FRONTEND_ERROR( + plan_list.autotune(handle, tensor_uid_to_pointer_map, cudnn_workspace, user_impl)); + } + return {error_code_t::OK, ""}; + } + + error_t + autotune(cudnnHandle_t handle, + std::unordered_map, void*>& tensor_to_pointer_map, + void* workspace, + void* user_impl = nullptr) { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const& [tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return autotune(handle, tensor_uid_to_pointer_map, workspace, user_impl); + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map, void*>& tensor_to_pointer_map, + void* workspace, + int64_t plan_index) const { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const& [tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return execute_plan_at_index(handle, tensor_uid_to_pointer_map, workspace, plan_index); + } + error_t execute(cudnnHandle_t handle, - std::unordered_map, void*> const& tensor_to_pointer_map, + std::unordered_map, void*>& tensor_to_pointer_map, void* workspace) const { + // First get all the uids from the map std::unordered_map tensor_uid_to_pointer_map; for (auto const& [tensor, pointer] : tensor_to_pointer_map) { tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); } - std::unordered_map, pass_by_values_t> tensor_to_pass_by_value; - void* fe_workspace = workspace; - void* cudnn_workspace = static_cast(fe_workspace) + get_fe_workspace_size(); + return execute(handle, 
tensor_uid_to_pointer_map, workspace); + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace, + int64_t plan_index) const { + // Add pass_by_value data pointers to uid_to_pointer map + // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during + // execute. + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); CHECK_CUDNN_FRONTEND_ERROR( - gather_pass_by_value_tensors(handle, tensor_to_pointer_map, tensor_to_pass_by_value, fe_workspace)); + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); - // Add pass_by_value data pointers to tensor_uid_to_pointer map + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + CHECK_CUDNN_FRONTEND_ERROR( + execute_cudnn_plans_with_uid(handle, tensor_uid_to_pointer_map, cudnn_workspace, plan_index)); + + return {error_code_t::OK, ""}; + } + + error_t + execute(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace) const { + // Add pass_by_value data pointers to uid_to_pointer map // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during - // execute - for (auto& [tensor, value] : tensor_to_pass_by_value) { - if (half* half_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), half_value_ptr); - } else if (int32_t* int32_t_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), int32_t_value_ptr); - } else if (float* float_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), float_value_ptr); - } else if (void** void_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), *void_value_ptr); - } else { - RETURN_CUDNN_FRONTEND_ERROR_IF( - true, error_code_t::INVALID_VARIANT_PACK, "Execute unexpected type for pass by value tensor."); + // execute. 
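As the comments in these hunks note, the user-provided buffer is treated as two regions: a frontend-managed prefix (alibi slopes, accumulators, memsets) followed by the space handed to the cuDNN execution plan, which is why execution offsets the pointer by `get_fe_workspace_size()` before running the backend plans. A small self-contained sketch of that layout arithmetic, with made-up sizes:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Illustrative sizes, in bytes.
    const int64_t fe_workspace_size    = 4096;       // frontend scratch (host->device copies, memsets)
    const int64_t cudnn_workspace_size = 256 << 10;  // what the selected execution plan reports

    const int64_t total = fe_workspace_size + cudnn_workspace_size;
    std::vector<std::byte> workspace(static_cast<size_t>(total));

    // The frontend uses the prefix of the buffer...
    std::byte* fe_region = workspace.data();
    // ...and the cuDNN backend gets whatever follows it.
    std::byte* cudnn_region = workspace.data() + fe_workspace_size;

    std::cout << "total bytes: " << total << "\n"
              << "fe region at offset 0, cudnn region at offset "
              << (cudnn_region - fe_region) << "\n";
    return 0;
}
```

Querying a plan-specific size only changes how large the second region has to be; the split itself stays the same.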
+ std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plans_with_uid(handle, tensor_uid_to_pointer_map, cudnn_workspace)); + + return {error_code_t::OK, ""}; + } + + error_t + deserialize(cudnnHandle_t handle, std::vector const& data) { + json j = json::from_ubjson(data); + auto serialized_plans = j["cudnn_backend_data"]; + if (serialized_plans.size() == 0) { + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "No plans in the serialized json"}; + } + + auto index = 0; + for (auto const& serialized_plan : serialized_plans) { + Execution_plan_list plan_list; + CHECK_CUDNN_FRONTEND_ERROR(plan_list.build_plans(handle, serialized_plan)); + plans.emplace_back(std::move(plan_list)); + std::unordered_set&& opgraph_variant_packs = j["variant_pack_uids"][index]; + variant_pack_uids.emplace_back(opgraph_variant_packs); + index++; + } + + std::unordered_map integer_pass_by_values; + std::unordered_map half_pass_by_values; + std::unordered_map float_pass_by_values; + + auto pass_by_value_tensors = j["pass_by_values"]; + for (auto i = 0u; i < pass_by_value_tensors.size(); i++) { + if (i == 0) { + integer_pass_by_values = pass_by_value_tensors[i].get>(); + } else if (i == 1) { + half_pass_by_values = pass_by_value_tensors[i].get>(); + } else if (i == 2) { + float_pass_by_values = pass_by_value_tensors[i].get>(); + } + } + + for (auto const& [uid, value] : integer_pass_by_values) { + deserialized_pass_by_value.emplace(uid, value); + } + for (auto const& [uid, value] : half_pass_by_values) { + deserialized_pass_by_value.emplace(uid, __float2half(value)); + } + for (auto const& [uid, value] : float_pass_by_values) { + deserialized_pass_by_value.emplace(uid, value); + } + + deserialized_workspace_modifications = j["workspace_modifications"]; + + fe_workspace_size = j["fe_workspace_size"]; + + return {error_code_t::OK, ""}; + } + + error_t + serialize(std::vector& data) const { + json j; + serialize(j); + j["cudnn_backend_data"]; + int index = 0; + for (auto& plan_list : plans) { + auto const candidate = plan_list.candidate; + auto execution_plan = plan_list.execution_plans[candidate]; + if (execution_plan != nullptr) { + auto serialized_plan = execution_plan->getJsonRepresentation(); + j["cudnn_backend_data"].push_back(serialized_plan); + j["variant_pack_uids"].push_back(variant_pack_uids[index]); + index++; } } - CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plans(handle, tensor_uid_to_pointer_map, cudnn_workspace)); + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + j["pass_by_values"]; + std::unordered_map integer_pass_by_values; 
+ std::unordered_map half_pass_by_values; + std::unordered_map float_pass_by_values; + // std::unordered_map void_ptr_pass_by_values; + for (auto const& [uid, pass_by_value] : tensor_to_pass_by_value) { + if (pass_by_value.index() == 0) { + integer_pass_by_values.emplace(uid, std::get<0>(pass_by_value)); + } else if (pass_by_value.index() == 1) { + half_pass_by_values.emplace(uid, __half2float(std::get<1>(pass_by_value))); + } else if (pass_by_value.index() == 2) { + float_pass_by_values.emplace(uid, std::get<2>(pass_by_value)); + } + } + // json j = half_pass_by_values; + j["pass_by_values"].push_back(integer_pass_by_values); + j["pass_by_values"].push_back(half_pass_by_values); + j["pass_by_values"].push_back(float_pass_by_values); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + j["workspace_modifications"] = workspace_modifications; + j["fe_workspace_size"] = get_fe_workspace_size(); + + data = json::to_ubjson(j); return {error_code_t::OK, ""}; } INode(detail::Context const& context) : context(context) {} + // Make sure each node implements a public serialize function virtual void - serialize(json& j) const { - j["nodes"]; - for (auto const& sub_node : sub_nodes) { - json j_sub_node; - sub_node->serialize(j_sub_node); - j["nodes"].push_back(j_sub_node); - } - }; + serialize(json& j) const = 0; size_t key() { diff --git a/include/cudnn_frontend/plans.h b/include/cudnn_frontend/plans.h index 7f1e13ad..e9d66b9f 100644 --- a/include/cudnn_frontend/plans.h +++ b/include/cudnn_frontend/plans.h @@ -1,14 +1,68 @@ #pragma once +#include #include #include #include "../cudnn_frontend_EngineConfig.h" #include "../cudnn_frontend_Logging.h" +#include "graph_helpers.h" namespace cudnn_frontend { namespace detail { + +inline error_t +execute(cudnnHandle_t handle, + ExecutionPlan* plan, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr) { + // TODO: below line fails with MSVC. warning C4127: conditional expression is constant + // RETURN_CUDNN_FRONTEND_ERROR_IF(!plan, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); + getLogger() << "[cudnn_frontend] INFO: Executing " << plan->getTag() << "..." << std::endl; + + auto&& variant_pack_builder = VariantPackBuilder(); + variant_pack_builder.setDataPointers(device_ptrs.size(), device_ptrs.data()) + .setUids(uids.size(), uids.data()) + .setWorkspacePointer(workspace_ptr); + + cudnnBackendDescriptor_t raw_variant_pack = nullptr; +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto variant_pack = variant_pack_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::INVALID_VARIANT_PACK, + variant_pack.get_error()); + raw_variant_pack = variant_pack.get_raw_desc(); +#else + // build() can throw + // wrap in try catch + try { + auto variant_pack = variant_pack_builder.build(); + raw_variant_pack = variant_pack.get_raw_desc(); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::INVALID_VARIANT_PACK, e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". 
"; + getLogger() << error_code_t::INVALID_VARIANT_PACK << " because variant packing building failed at " << __FILE__ + << ":" << __LINE__ << "\n"; + return {error_code_t::INVALID_VARIANT_PACK, e.what()}; + } +#endif + + auto status = cudnnBackendExecute(handle, plan->get_raw_desc(), raw_variant_pack); + if (status != CUDNN_STATUS_SUCCESS) { + std::string message = "[cudnn_frontend] ERROR: Graph execution failed."; + return {error_code_t::GRAPH_EXECUTION_FAILED, message}; + } + getLogger() << "[cudnn_frontend] INFO: Executed " << plan->getTag() << "." << std::endl; + + return {error_code_t::OK, ""}; +} + inline error_t query_cudnn_heuristics_impl(std::shared_ptr const& operation_graph, cudnn_frontend::EngineConfigList& configs, @@ -17,7 +71,26 @@ query_cudnn_heuristics_impl(std::shared_ptr const& operation_ getLogger() << "[cudnn_frontend] INFO: " << " Getting plan from heuristics for " << operation_graph_tag << " ..." << std::endl; - auto statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); + std::vector statuses; +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); +#else + // build() can throw + // wrap in try catch + try { + statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::HEURISTIC_QUERY_FAILED, e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". "; + getLogger() << error_code_t::HEURISTIC_QUERY_FAILED << " because querying heuristics failed at " << __FILE__ + << ":" << __LINE__ << "\n"; + return {error_code_t::HEURISTIC_QUERY_FAILED, e.what()}; + } +#endif getLogger() << "[cudnn_frontend] INFO: get_heuristics_list statuses: "; for (size_t i = 0; i < statuses.size(); i++) { @@ -68,31 +141,73 @@ query_heuristics(std::vector> const& operatio inline error_t create_cudnn_execution_plan(std::shared_ptr& plan, - ManagedOpaqueDescriptor const& config, - std::string const& operation_graph_tag, + std::string const& serialized_data, cudnnHandle_t handle) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setHandle(handle); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto built_plan = plan_builder.loadFromJson(serialized_data); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch try { + auto built_plan = plan_builder.loadFromJson(serialized_data); + plan = std::make_shared(std::move(built_plan)); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". 
"; + getLogger() << error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; + } #endif - auto built_plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(config, operation_graph_tag) - .build(); - if (built_plan.get_status() != CUDNN_STATUS_SUCCESS) { - getLogger() << "[cudnn_frontend] ERROR: " - << "Config failed with " << built_plan.get_error() << std::endl; - return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Couldn't build plan from Config."}; - } - getLogger() << "[cudnn_frontend] INFO: Config succeeded! Plan has built!\n"; - getLogger() << "[cudnn_frontend] INFO: " << built_plan.describe() << std::endl; - plan = std::make_shared(std::move(built_plan)); + return {error_code_t::OK, ""}; +} -#ifndef NV_CUDNN_DISABLE_EXCEPTION +inline error_t +create_cudnn_execution_plan(std::shared_ptr& plan, + ManagedOpaqueDescriptor const& config, + std::string const& operation_graph_tag, + cudnnHandle_t handle) { + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setHandle(handle).setEngineConfig(config, operation_graph_tag); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto built_plan = plan_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch + try { + auto built_plan = plan_builder.build(); + plan = std::make_shared(std::move(built_plan)); } catch (cudnn_frontend::cudnnException& e) { - getLogger() << "[cudnn_frontend] ERROR: " - << "Config failed with " << e.getCudnnStatus() << " " << e.what() << std::endl; - return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Couldn't build plan from Config."}; + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". "; + getLogger() << error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; } #endif @@ -104,18 +219,21 @@ create_cudnn_execution_plan(std::shared_ptr& plan, namespace graph { class Execution_plan_list { std::string operation_tag; - EngineConfigList engine_configs; std::vector> numeric_notes; std::vector> behavior_notes; - std::vector filtered_indices; + int64_t max_workspace_allowed = std::numeric_limits::max(); - std::shared_ptr candidate; + EngineConfigList engine_configs; public: std::vector> - execution_plans; // Filtered engine configs that have been made as plans + execution_plans; // a built plan corresponding to each engine config, irrespective of whether config is + // selected or deselected. 
+ + // Stores position of best plan in above vector of execution plan + int64_t candidate = -1; void set_tag(std::string const& tag) { @@ -135,7 +253,10 @@ class Execution_plan_list { query_properties() { numeric_notes.reserve(engine_configs.size()); behavior_notes.reserve(engine_configs.size()); - filtered_indices.resize(engine_configs.size()); + + filtered_indices.resize(engine_configs.size(), 0); + execution_plans.resize(engine_configs.size()); + for (auto& engine_config : engine_configs) { int64_t elem_count = 0; std::vector numerics; @@ -200,10 +321,17 @@ class Execution_plan_list { } error_t - filter_out_numeric_notes(std::vector const& notes) { - for (auto note : notes) { + deselect_numeric_notes(std::vector const& notes) { + for (auto& note : notes) { + cudnnBackendNumericalNote_t backend_note; + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::convert_to_cudnn_type(note, backend_note) != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Unexpected behaviour note provided."); + for (auto i = 0u; i < engine_configs.size(); i++) { - if (std::find(numeric_notes[i].begin(), numeric_notes[i].end(), note) != numeric_notes[i].end()) { + if (std::find(numeric_notes[i].begin(), numeric_notes[i].end(), backend_note) != + numeric_notes[i].end()) { filtered_indices[i] = true; } } @@ -212,10 +340,17 @@ class Execution_plan_list { } error_t - filter_out_behavior_notes(std::vector const& notes) { - for (auto note : notes) { + deselect_behavior_notes(std::vector const& notes) { + for (auto& note : notes) { + cudnnBackendBehaviorNote_t backend_note; + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::convert_to_cudnn_type(note, backend_note) != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Unexpected behaviour note provided."); + for (auto i = 0u; i < engine_configs.size(); i++) { - if (std::find(behavior_notes[i].begin(), behavior_notes[i].end(), note) != behavior_notes[i].end()) { + if (std::find(behavior_notes[i].begin(), behavior_notes[i].end(), backend_note) != + behavior_notes[i].end()) { filtered_indices[i] = true; } } @@ -245,76 +380,117 @@ class Execution_plan_list { error_t check_support(cudnnHandle_t handle) { - auto const& configs = get_filtered_engine_configs(); - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); - - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - RETURN_CUDNN_FRONTEND_ERROR_IF(execution_plans.size(), - error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, - "[cudnn_frontend] Check support or build called already."); - - // No plans should be pushed here. - // But check_support in v8 incurs compilation cost. - // If not pushed, build_plans will incur compilation cost again. 
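`deselect_numeric_notes` and `deselect_behavior_notes` above convert the frontend note enums to backend notes and then flag matching positions in `filtered_indices`; later build and check loops simply skip the flagged configs. A minimal standalone model of that filtering, using plain strings in place of the real note enums:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Per-config notes reported by a hypothetical heuristics query.
    std::vector<std::vector<std::string>> numeric_notes = {
        {"TENSOR_CORE"},                                  // config 0
        {"DOWN_CONVERT_INPUTS"},                          // config 1
        {"TENSOR_CORE", "REDUCED_PRECISION_REDUCTION"},   // config 2
    };

    // One flag per config; true means "deselected".
    std::vector<bool> filtered(numeric_notes.size(), false);

    // Deselect every config that carries a note the user rejects.
    const std::vector<std::string> rejected = {"DOWN_CONVERT_INPUTS"};
    for (const auto& note : rejected) {
        for (std::size_t i = 0; i < numeric_notes.size(); ++i) {
            const auto& notes = numeric_notes[i];
            if (std::find(notes.begin(), notes.end(), note) != notes.end()) {
                filtered[i] = true;
            }
        }
    }

    // Later stages iterate the configs and skip the deselected ones.
    for (std::size_t i = 0; i < filtered.size(); ++i) {
        std::cout << "config " << i << (filtered[i] ? ": skipped\n" : ": eligible\n");
    }
    return 0;
}
```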
- // TODO: Uncomment after https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=4299195&cmtNo= - // if(cudnnGetVersion() < 9100) - { execution_plans.push_back(std::move(plan)); } + for (auto i = 0u; i < engine_configs.size(); i++) { + if (filtered_indices[i]) { + getLogger() << "[cudnn_frontend] INFO: Deselecting execution plan at position " << i << std::endl; + continue; + } + + auto const& config = engine_configs[i]; + auto fe_status = detail::create_cudnn_execution_plan(execution_plans[i], config, operation_tag, handle); + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << i << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + // If a plan is built successfully, set it as a candidate + if (fe_status.is_good()) { + // Filter out execution plans with workspace greater than whats available from user + if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) { + filtered_indices[i] = true; + getLogger() << "[cudnn_frontend] INFO: Deselecting execution plan at position " << i << std::endl; + continue; + } + + candidate = static_cast(i); return {error_code_t::OK, ""}; } } + // No plans were able to be built. Return error. return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "[cudnn_frontend] Error: No execution plans built successfully."}; } + error_t + build_plans(cudnnHandle_t handle, std::string const& json) { + execution_plans.resize(1); + auto const& fe_status = detail::create_cudnn_execution_plan(execution_plans[0], json, handle); + + if (fe_status.is_good()) { + candidate = 0; + } + + return fe_status; + } + + error_t + build_plan_at_index(cudnnHandle_t handle, int64_t index) { + RETURN_CUDNN_FRONTEND_ERROR_IF(filtered_indices[index] == true, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "Chosen plan index has been deselected."); + + if (execution_plans[index] != nullptr && execution_plans[index]->getWorkspaceSize() <= max_workspace_allowed) { + return {error_code_t::OK, ""}; + }; + + auto fe_status = + detail::create_cudnn_execution_plan(execution_plans[index], engine_configs[index], operation_tag, handle); + + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << index << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + // Sets candidate in case user does not call execute with plan_index later. + if (fe_status.is_good()) { + if (execution_plans[index]->getWorkspaceSize() <= max_workspace_allowed) { + candidate = index; + } else { + filtered_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Workspace size is too large."}; + } + } + + return fe_status; + } + error_t build_plans(cudnnHandle_t handle, BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { RETURN_CUDNN_FRONTEND_ERROR_IF(do_multithreaded_builds, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Doing multithreaded builds is not yet supported."); - auto const& configs = get_filtered_engine_configs(); - - switch (policy) { - case BuildPlanPolicy_t::HEURISTICS_CHOICE: - // short circuit in case a plan was already created. - // This happens as check_support for v8 builds a plan. - // Should not happen in v9. - // TODO: Uncomment after https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=4299195&cmtNo= - // if(cudnnGetVersion() < 9100) - { - if (execution_plans.size() > 0) { - return {error_code_t::OK, ""}; - } - } + // short circuit in case a plan was already created. 
+ // This happens as check_support for v8 builds a plan. + if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE && candidate != -1) { + return {error_code_t::OK, ""}; + } - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); + for (auto i = 0u; i < engine_configs.size(); i++) { + if (filtered_indices[i]) { + getLogger() << "[cudnn_frontend] INFO: Skipping deselected engine plan at index " << i << std::endl; + continue; + } - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - execution_plans.push_back(std::move(plan)); - break; - } + auto fe_status = + detail::create_cudnn_execution_plan(execution_plans[i], engine_configs[i], operation_tag, handle); + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << i << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + if (fe_status.is_good()) { + if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) { + filtered_indices[i] = true; + continue; } - break; - case BuildPlanPolicy_t::ALL: - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); - - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - execution_plans.push_back(std::move(plan)); - } + // Only set the candidate the first time, as the order of iteration is from highest to lowest priority + if (candidate == -1) { + candidate = static_cast(i); } - break; - } - RETURN_CUDNN_FRONTEND_ERROR_IF(execution_plans.empty(), - error_code_t::GRAPH_NOT_SUPPORTED, - "No execution plans finalized successfully. Hence, not supported."); + // Return from this function as first successfully built plan is found. + if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE) { + return {error_code_t::OK, ""}; + } + } + } return {error_code_t::OK, ""}; } @@ -328,34 +504,20 @@ class Execution_plan_list { return max_size; } - std::shared_ptr - get_best_candidate() const { - if (execution_plans.empty()) return nullptr; - return execution_plans.front(); - } - static error_t autotune_default_impl(std::vector>& execution_plans, cudnnHandle_t handle, - std::unordered_map, void*> variants, - void* workspace, + std::unordered_map const& tensor_to_pointer_map, + void* workspace_ptr, void*) { // Create the variant pack for all the plans to use. 
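The rewritten `build_plans` walks the heuristics-ordered configs, skips deselected entries, records the first successful build as `candidate`, and returns early under HEURISTICS_CHOICE while continuing under ALL. A stripped-down sketch of that control flow (the policy enum and the per-config build outcomes are mocked):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

enum class Policy { HEURISTICS_CHOICE, ALL };

// Mocked build outcome: true means the plan built and fits the workspace limit.
static bool mock_build(std::size_t index) { return index != 0; }  // pretend config 0 fails

static int64_t build_plans(Policy policy, std::vector<bool>& deselected, std::vector<bool>& built) {
    int64_t candidate = -1;
    for (std::size_t i = 0; i < deselected.size(); ++i) {
        if (deselected[i]) continue;   // user filtered this config out
        if (!mock_build(i)) continue;  // build failed or workspace too large
        built[i] = true;
        if (candidate == -1) candidate = static_cast<int64_t>(i);  // heuristics order: first success wins
        if (policy == Policy::HEURISTICS_CHOICE) break;            // stop at the first usable plan
    }
    return candidate;  // under ALL, every remaining config is built before returning
}

int main() {
    std::vector<bool> deselected = {false, false, false, true};  // config 3 was deselected earlier
    std::vector<bool> built(deselected.size(), false);

    int64_t candidate = build_plans(Policy::HEURISTICS_CHOICE, deselected, built);
    std::cout << "candidate index: " << candidate << "\n";  // prints 1
    return 0;
}
```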
std::vector uids; std::vector ptrs; - for (auto it : variants) { - if (it.first != nullptr) { - uids.push_back(it.first->get_uid()); - ptrs.push_back(it.second); - } + for (auto it : tensor_to_pointer_map) { + uids.push_back(it.first); + ptrs.push_back(it.second); } - auto variantPack = VariantPackBuilder() - .setDataPointers(ptrs.size(), ptrs.data()) - .setUids(uids.size(), uids.data()) - .setWorkspacePointer(workspace) - .build(); - std::vector> time_sorted_plans; auto plan_cmp = [](std::shared_ptr a, std::shared_ptr b) { @@ -381,19 +543,14 @@ class Execution_plan_list { float min_time_ms = std::numeric_limits::max(); // Warm-up run - auto warmup_status = cudnnBackendExecute(handle, plan->get_raw_desc(), variantPack.get_raw_desc()); - if (warmup_status != CUDNN_STATUS_SUCCESS) { - getLogger() << "[cudnn_frontend] Plan " << plan->getTag() << " failed with " << to_string(warmup_status) - << std::endl; - continue; - } + CHECK_CUDNN_FRONTEND_ERROR(detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr)); successful_plan_count++; cudaDeviceSynchronize(); for (int i = 0; i < maxIterCount; i++) { cudaEventRecord(start, stream); - cudnnBackendExecute(handle, plan->get_raw_desc(), variantPack.get_raw_desc()); + auto status = detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr); cudaEventRecord(stop, stream); cudaEventSynchronize(stop); @@ -427,17 +584,17 @@ class Execution_plan_list { std::function>&, cudnnHandle_t, - std::unordered_map, void*>, + std::unordered_map const&, void*, void*)> autotune_impl = &Execution_plan_list::autotune_default_impl; error_t autotune(cudnnHandle_t handle, - std::unordered_map, void*> variants, + std::unordered_map const& tensor_to_pointer_map, void* workspace, void* user_impl = nullptr) { - auto error = autotune_impl(execution_plans, handle, variants, workspace, user_impl); + auto error = autotune_impl(execution_plans, handle, tensor_to_pointer_map, workspace, user_impl); return error; } }; diff --git a/include/cudnn_frontend/utils/serialize.h b/include/cudnn_frontend/utils/serialize.h new file mode 100644 index 00000000..2a59f42f --- /dev/null +++ b/include/cudnn_frontend/utils/serialize.h @@ -0,0 +1,328 @@ +#pragma once + +#include "../graph_properties.h" +#include "../graph_helpers.h" + +namespace cudnn_frontend::graph { + +NLOHMANN_JSON_SERIALIZE_ENUM(BN_finalize_attributes::input_names, + { + {BN_finalize_attributes::input_names::SUM, "SUM"}, + {BN_finalize_attributes::input_names::SQ_SUM, "SQ_SUM"}, + {BN_finalize_attributes::input_names::SCALE, "SCALE"}, + {BN_finalize_attributes::input_names::BIAS, "BIAS"}, + {BN_finalize_attributes::input_names::EPSILON, "EPSILON"}, + {BN_finalize_attributes::input_names::ACCUM_COUNT, "ACCUM_COUNT"}, + {BN_finalize_attributes::input_names::PREV_RUNNING_MEAN, "PREV_RUNNING_MEAN"}, + {BN_finalize_attributes::input_names::PREV_RUNNING_VAR, "PREV_RUNNING_VAR"}, + {BN_finalize_attributes::input_names::MOMENTUM, "MOMENTUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(BN_finalize_attributes::output_names, + { + {BN_finalize_attributes::output_names::EQ_SCALE, "EQ_SCALE"}, + {BN_finalize_attributes::output_names::EQ_BIAS, "EQ_BIAS"}, + {BN_finalize_attributes::output_names::MEAN, "MEAN"}, + {BN_finalize_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + {BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN, "NEXT_RUNNING_MEAN"}, + {BN_finalize_attributes::output_names::NEXT_RUNNING_VAR, "NEXT_RUNNING_VAR"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_attributes::input_names, + { + 
{Batchnorm_attributes::input_names::X, "X"}, + {Batchnorm_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_attributes::input_names::BIAS, "BIAS"}, + {Batchnorm_attributes::input_names::EPSILON, "EPSILON"}, + {Batchnorm_attributes::input_names::PREV_RUNNING_MEAN, "PREV_RUNNING_MEAN"}, + {Batchnorm_attributes::input_names::PREV_RUNNING_VAR, "PREV_RUNNING_VAR"}, + {Batchnorm_attributes::input_names::MOMENTUM, "MOMENTUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_attributes::output_names, + { + {Batchnorm_attributes::output_names::Y, "Y"}, + {Batchnorm_attributes::output_names::MEAN, "MEAN"}, + {Batchnorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + {Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN, "NEXT_RUNNING_MEAN"}, + {Batchnorm_attributes::output_names::NEXT_RUNNING_VAR, "NEXT_RUNNING_VAR"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_backward_attributes::input_names, + { + {Batchnorm_backward_attributes::input_names::DY, "DY"}, + {Batchnorm_backward_attributes::input_names::X, "X"}, + {Batchnorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Batchnorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_backward_attributes::output_names, + { + {Batchnorm_backward_attributes::output_names::DX, "DX"}, + {Batchnorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Batchnorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_inference_attributes::input_names, + { + {Batchnorm_inference_attributes::input_names::X, "X"}, + {Batchnorm_inference_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_inference_attributes::input_names::BIAS, "BIAS"}, + {Batchnorm_inference_attributes::input_names::MEAN, "MEAN"}, + {Batchnorm_inference_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_inference_attributes::output_names, + {{Batchnorm_inference_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_dgrad_attributes::input_names, + { + {Conv_dgrad_attributes::input_names::W, "W"}, + {Conv_dgrad_attributes::input_names::DY, "DY"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_dgrad_attributes::output_names, + { + {Conv_dgrad_attributes::output_names::DX, "DX"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_fprop_attributes::input_names, + { + {Conv_fprop_attributes::input_names::X, "X"}, + {Conv_fprop_attributes::input_names::W, "W"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_fprop_attributes::output_names, + { + {Conv_fprop_attributes::output_names::Y, "Y"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_wgrad_attributes::input_names, + { + {Conv_wgrad_attributes::input_names::DY, "DY"}, + {Conv_wgrad_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_wgrad_attributes::output_names, + { + {Conv_wgrad_attributes::output_names::DW, "DW"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(DBN_weight_attributes::input_names, + { + {DBN_weight_attributes::input_names::DY, "DY"}, + {DBN_weight_attributes::input_names::X, "X"}, + {DBN_weight_attributes::input_names::SCALE, "SCALE"}, + {DBN_weight_attributes::input_names::MEAN, "MEAN"}, + {DBN_weight_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(DBN_weight_attributes::output_names, + { + {DBN_weight_attributes::output_names::DSCALE, "DSCALE"}, + {DBN_weight_attributes::output_names::DBIAS, "DBIAS"}, + 
{DBN_weight_attributes::output_names::EQ_BIAS, "EQ_BIAS"}, + {DBN_weight_attributes::output_names::EQ_SCALE_DY, "EQ_SCALE_DY"}, + {DBN_weight_attributes::output_names::EQ_SCALE_X, "EQ_SCALE_X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Genstats_attributes::input_names, + { + {Genstats_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Genstats_attributes::output_names, + { + {Genstats_attributes::output_names::SUM, "SUM"}, + {Genstats_attributes::output_names::SQ_SUM, "SQ_SUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_attributes::input_names, + { + {Instancenorm_attributes::input_names::X, "X"}, + {Instancenorm_attributes::input_names::SCALE, "SCALE"}, + {Instancenorm_attributes::input_names::BIAS, "BIAS"}, + {Instancenorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_attributes::output_names, + { + {Instancenorm_attributes::output_names::Y, "Y"}, + {Instancenorm_attributes::output_names::MEAN, "MEAN"}, + {Instancenorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_backward_attributes::input_names, + { + {Instancenorm_backward_attributes::input_names::DY, "DY"}, + {Instancenorm_backward_attributes::input_names::X, "X"}, + {Instancenorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Instancenorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Instancenorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_backward_attributes::output_names, + { + {Instancenorm_backward_attributes::output_names::DX, "DX"}, + {Instancenorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Instancenorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_attributes::input_names, + { + {Layernorm_attributes::input_names::X, "X"}, + {Layernorm_attributes::input_names::SCALE, "SCALE"}, + {Layernorm_attributes::input_names::BIAS, "BIAS"}, + {Layernorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_attributes::output_names, + { + {Layernorm_attributes::output_names::Y, "Y"}, + {Layernorm_attributes::output_names::MEAN, "MEAN"}, + {Layernorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_backward_attributes::input_names, + { + {Layernorm_backward_attributes::input_names::DY, "DY"}, + {Layernorm_backward_attributes::input_names::X, "X"}, + {Layernorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Layernorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Layernorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_backward_attributes::output_names, + { + {Layernorm_backward_attributes::output_names::DX, "DX"}, + {Layernorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Layernorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Matmul_attributes::input_names, + { + {Matmul_attributes::input_names::A, "A"}, + {Matmul_attributes::input_names::B, "B"}, + {Matmul_attributes::input_names::M_override, "M_override"}, + {Matmul_attributes::input_names::N_override, "N_override"}, + {Matmul_attributes::input_names::K_override, "K_override"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Matmul_attributes::output_names, + { + {Matmul_attributes::output_names::C, "C"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Pointwise_attributes::input_names, + { + 
{Pointwise_attributes::input_names::IN_0, "IN_0"}, + {Pointwise_attributes::input_names::IN_1, "IN_1"}, + {Pointwise_attributes::input_names::IN_2, "IN_2"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Pointwise_attributes::output_names, + { + {Pointwise_attributes::output_names::OUT_0, "OUT_0"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reduction_attributes::input_names, + { + {Reduction_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reduction_attributes::output_names, {{Reduction_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reshape_attributes::input_names, + { + {Reshape_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reshape_attributes::output_names, {{Reshape_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_attributes::input_names, + { + {Rmsnorm_attributes::input_names::X, "X"}, + {Rmsnorm_attributes::input_names::SCALE, "SCALE"}, + {Rmsnorm_attributes::input_names::BIAS, "BIAS"}, + {Rmsnorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_attributes::output_names, + { + {Rmsnorm_attributes::output_names::Y, "Y"}, + {Rmsnorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_backward_attributes::input_names, + { + {Rmsnorm_backward_attributes::input_names::DY, "DY"}, + {Rmsnorm_backward_attributes::input_names::X, "X"}, + {Rmsnorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Rmsnorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_backward_attributes::output_names, + { + {Rmsnorm_backward_attributes::output_names::DX, "DX"}, + {Rmsnorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Rmsnorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rng_attributes::input_names, + { + {Rng_attributes::input_names::Seed, "Seed"}, + {Rng_attributes::input_names::Offset, "Offset"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rng_attributes::output_names, {{Rng_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_attributes::input_names, + { + {SDPA_attributes::input_names::Q, "Q"}, + {SDPA_attributes::input_names::K, "K"}, + {SDPA_attributes::input_names::V, "V"}, + {SDPA_attributes::input_names::Attn_scale, "Attn_scale"}, + {SDPA_attributes::input_names::Bias, "Bias"}, + {SDPA_attributes::input_names::SEQ_LEN_Q, "SEQ_LEN_Q"}, + {SDPA_attributes::input_names::SEQ_LEN_KV, "SEQ_LEN_KV"}, + {SDPA_attributes::input_names::Seed, "Seed"}, + {SDPA_attributes::input_names::Offset, "Offset"}, + {SDPA_attributes::input_names::Dropout_mask, "Dropout_mask"}, + {SDPA_attributes::input_names::Dropout_scale, "Dropout_scale"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_attributes::output_names, + {{SDPA_attributes::output_names::O, "O"}, + {SDPA_attributes::output_names::Stats, "Stats"}, + {SDPA_attributes::output_names::RNG_DUMP, "RNG_DUMP"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_backward_attributes::input_names, + { + {SDPA_backward_attributes::input_names::Q, "Q"}, + {SDPA_backward_attributes::input_names::K, "K"}, + {SDPA_backward_attributes::input_names::V, "V"}, + {SDPA_backward_attributes::input_names::O, "O"}, + {SDPA_backward_attributes::input_names::dO, "dO"}, + {SDPA_backward_attributes::input_names::Stats, "Stats"}, + {SDPA_backward_attributes::input_names::Attn_scale, "Attn_scale"}, + {SDPA_backward_attributes::input_names::Bias, "Bias"}, + {SDPA_backward_attributes::input_names::SEQ_LEN_Q, 
"SEQ_LEN_Q"}, + {SDPA_backward_attributes::input_names::SEQ_LEN_KV, "SEQ_LEN_KV"}, + {SDPA_backward_attributes::input_names::Seed, "Seed"}, + {SDPA_backward_attributes::input_names::Offset, "Offset"}, + {SDPA_backward_attributes::input_names::Dropout_mask, "Dropout_mask"}, + {SDPA_backward_attributes::input_names::Dropout_scale, "Dropout_scale"}, + {SDPA_backward_attributes::input_names::Dropout_scale_inv, "Dropout_scale_inv"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_backward_attributes::output_names, + { + {SDPA_backward_attributes::output_names::dQ, "dQ"}, + {SDPA_backward_attributes::output_names::dK, "dK"}, + {SDPA_backward_attributes::output_names::dV, "dV"}, + {SDPA_backward_attributes::output_names::dBias, "dBias"}, + {SDPA_backward_attributes::output_names::RNG_DUMP, "RNG_DUMP"}, + }) + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend_Heuristics.h b/include/cudnn_frontend_Heuristics.h index 187003a2..e404951b 100644 --- a/include/cudnn_frontend_Heuristics.h +++ b/include/cudnn_frontend_Heuristics.h @@ -96,10 +96,11 @@ class EngineHeuristics_v8 : public BackendDescriptor { count, &result, heuristic_results_.data()); - if (status != CUDNN_STATUS_SUCCESS) { + if (status != CUDNN_STATUS_SUCCESS || result < 1) { set_error_and_throw_exception( this, status, "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: GetAttribute CUDNN_ATTR_ENGINEHEUR_RESULTS Failed"); }; + m_heuristic_results.resize(result); return m_heuristic_results; } diff --git a/include/cudnn_frontend_utils.h b/include/cudnn_frontend_utils.h index b0bacfc3..5454a71a 100644 --- a/include/cudnn_frontend_utils.h +++ b/include/cudnn_frontend_utils.h @@ -84,7 +84,6 @@ namespace cudnn_frontend { /// Detailed feature_vector. Generally the Tensor and Operation properties using feature_vector_t = std::vector; -#ifndef NV_CUDNN_DISABLE_EXCEPTION class cudnnException : public std::runtime_error { public: cudnnException(const char* message, cudnnStatus_t status) throw() : std::runtime_error(message) { @@ -101,7 +100,6 @@ class cudnnException : public std::runtime_error { cudnnStatus_t error_status; }; -#endif static inline bool AllowAll(cudnnBackendDescriptor_t engine_config) { @@ -109,28 +107,14 @@ AllowAll(cudnnBackendDescriptor_t engine_config) { return false; } -static inline void -throw_if(std::function expr, const char* message, cudnnStatus_t status) { - if (expr()) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION - throw cudnnException(message, status); -#endif - } -} -static inline void -throw_if(bool expr, const char* message, cudnnStatus_t status) { - if (expr) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION - throw cudnnException(message, status); -#endif - } -} - static inline std::string to_string(cudnnStatus_t const status) { return cudnnGetErrorString(status); } +#ifndef NV_CUDNN_DISABLE_EXCEPTION +[[noreturn]] +#endif static inline void set_error_and_throw_exception(BackendDescriptor const* desc, cudnnStatus_t status, const char* message) { if (desc != nullptr) { @@ -620,6 +604,7 @@ get_pointwise_mode_port_count(PointwiseMode_t const& mode) { case PointwiseMode_t::LOGICAL_OR: case PointwiseMode_t::MIN: case PointwiseMode_t::MAX: + case PointwiseMode_t::MOD: case PointwiseMode_t::RELU_BWD: case PointwiseMode_t::TANH_BWD: case PointwiseMode_t::SIGMOID_BWD: @@ -642,7 +627,6 @@ get_pointwise_mode_port_count(PointwiseMode_t const& mode) { case PointwiseMode_t::EXP: case PointwiseMode_t::LOG: case PointwiseMode_t::NEG: - case PointwiseMode_t::MOD: case PointwiseMode_t::ABS: case PointwiseMode_t::CEIL: case 
PointwiseMode_t::FLOOR: @@ -785,7 +769,7 @@ get_abili_slope(int64_t const n_heads) { #pragma warning(push) #pragma warning(disable : 4244) // this could be ommited with c++17 and contexpr #endif - int n = 1 << static_cast(log2f(n_heads)); + int n = 1 << static_cast(log2(static_cast(n_heads))); #ifdef _MSC_VER #pragma warning(pop) #endif @@ -794,12 +778,12 @@ get_abili_slope(int64_t const n_heads) { } for (int i = 0; i < 2 * (n_heads - n); i += 2) { - slope.push_back((float)(i + 1.0f) * 0.5f); + slope.push_back(static_cast(i + 1) * 0.5f); } for (float& elem : slope) { - elem *= -8.0; - elem /= n; + elem *= -8.0f; + elem /= static_cast(n); elem = powf(2.0, elem); } diff --git a/python_bindings/properties.cpp b/python_bindings/properties.cpp index de2e9b4d..5fd700bb 100644 --- a/python_bindings/properties.cpp +++ b/python_bindings/properties.cpp @@ -90,6 +90,7 @@ init_properties(py::module_& m) { .def("set_is_pass_by_value", &cudnn_frontend::graph::Tensor_attributes::set_is_pass_by_value) .def("get_uid", &cudnn_frontend::graph::Tensor_attributes::get_uid) .def("set_uid", &cudnn_frontend::graph::Tensor_attributes::set_uid) + .def("set_ragged_offset", &cudnn_frontend::graph::Tensor_attributes::set_ragged_offset) .def("__repr__", [](cudnn_frontend::graph::Tensor_attributes const& props) { std::ostringstream out; out << json{props}; diff --git a/python_bindings/pygraph/pointwise.cpp b/python_bindings/pygraph/pointwise.cpp index d5e8b486..5dabc23a 100644 --- a/python_bindings/pygraph/pointwise.cpp +++ b/python_bindings/pygraph/pointwise.cpp @@ -301,7 +301,7 @@ init_pygraph_pointwise_submodule(py::class_& m) { )pbdoc"); m.def("tanh", &PyGraph::pointwise_unary, - py::arg("input0"), + py::arg("input"), py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v("name", ""), R"pbdoc( diff --git a/python_bindings/pygraph/pygraph.cpp b/python_bindings/pygraph/pygraph.cpp index 015d15ef..2c516b78 100644 --- a/python_bindings/pygraph/pygraph.cpp +++ b/python_bindings/pygraph/pygraph.cpp @@ -107,6 +107,7 @@ PyGraph::tensor(std::vector const& dim, cudnn_frontend::DataType_t const& data_type, bool const& is_virtual, bool const& is_pass_by_value, + std::shared_ptr const& ragged_offset, std::string const& name) { auto props = cudnn_frontend::graph::Tensor_attributes() .set_data_type(data_type) @@ -114,6 +115,7 @@ PyGraph::tensor(std::vector const& dim, .set_is_pass_by_value(is_pass_by_value) .set_dim(dim) .set_stride(stride) + .set_ragged_offset(ragged_offset) .set_name(name); return graph.tensor(props); @@ -168,13 +170,15 @@ PyGraph::tensor_like(py::object const& pyobj) { std::shared_ptr PyGraph::conv_fprop(std::shared_ptr& image, std::shared_ptr& weight, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& name) { auto attributes = cudnn_frontend::graph::Conv_fprop_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -187,13 +191,15 @@ PyGraph::conv_fprop(std::shared_ptr& i std::shared_ptr PyGraph::conv_dgrad(std::shared_ptr& loss, std::shared_ptr& filter, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& 
name) { auto attributes = cudnn_frontend::graph::Conv_dgrad_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -205,13 +211,15 @@ PyGraph::conv_dgrad(std::shared_ptr& l std::shared_ptr PyGraph::conv_wgrad(std::shared_ptr& image, std::shared_ptr& loss, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& name) { auto attributes = cudnn_frontend::graph::Conv_wgrad_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -331,6 +339,22 @@ PyGraph::execute(std::unordered_map var_pack, py::object workspace) { + std::unordered_map var_pack_; + for (auto const& [uid, pyobject] : var_pack) { + var_pack_.emplace(uid, extract_data_pointer(pyobject)); + } + + void* workspace_ptr = extract_data_pointer(workspace); + + // TODO: Probably concatenate in a macro? + auto status = graph.execute(handle, var_pack_, workspace_ptr); + throw_if(status.is_bad(), status.get_code(), status.get_message()); + + return; +} + std::vector default_vector(void) { return {}; @@ -363,6 +387,7 @@ init_pygraph_submodule(py::module_& m) { py::arg_v("data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v{"is_virtual", false}, py::arg_v{"is_pass_by_value", false}, + py::arg_v{"ragged_offset", nullptr}, py::arg_v("name", ""), R"pbdoc( Create a tensor. @@ -373,6 +398,7 @@ init_pygraph_submodule(py::module_& m) { data_type (cudnn.data_type): The data type of the tensor. Default is cudnn.data_type.NOT_SET. is_virtual (bool): Flag indicating if the tensor is virtual. Default is False. is_pass_by_value (bool): Flag indicating if the tensor is passed by value. Default is False. + ragged_offset (cudnn_tensor): The ragged offset tensor. Default is nullptr. name (Optional[str]): The name of the tensor. Returns: @@ -383,11 +409,31 @@ init_pygraph_submodule(py::module_& m) { py::arg("input"), py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v("name", "")) + .def( + "conv_fprop", + [](PyGraph& self, + std::shared_ptr& image, + std::shared_ptr& weight, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_fprop(image, weight, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("image"), + py::arg("weight"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_fprop", &PyGraph::conv_fprop, py::arg("image"), py::arg("weight"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -398,7 +444,8 @@ init_pygraph_submodule(py::module_& m) { Args: image (cudnn_tensor): The image tensor. weight (cudnn_tensor): The weight tensor. - padding (Optional[List[int]]): The padding values for the operation. 
Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[cudnn.data_type]): The data type for computation. Default is NOT_SET. @@ -407,11 +454,31 @@ init_pygraph_submodule(py::module_& m) { Returns: cudnn_tensor: The created tensor. )pbdoc") + .def( + "conv_wgrad", + [](PyGraph& self, + std::shared_ptr& image, + std::shared_ptr& loss, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_wgrad(image, loss, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("image"), + py::arg("loss"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_wgrad", &PyGraph::conv_wgrad, py::arg("image"), py::arg("loss"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -422,8 +489,8 @@ init_pygraph_submodule(py::module_& m) { Args: image (cudnn_tensor): The image tensor. loss (cudnn_tensor): The loss tensor. - padding (Optional[List[int]]): The padding values for the operation. Default is an empty list. - stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[cudnn.data_type]): The data type for computation. Default is NOT_SET. name (Optional[str]): A name for the operation to be performed. @@ -431,11 +498,31 @@ init_pygraph_submodule(py::module_& m) { Returns: cudnn_tensor: The created tensor. 
)pbdoc") + .def( + "conv_dgrad", + [](PyGraph& self, + std::shared_ptr& loss, + std::shared_ptr& filter, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_dgrad(loss, filter, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("loss"), + py::arg("filter"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_dgrad", &PyGraph::conv_dgrad, py::arg("loss"), py::arg("filter"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -446,7 +533,8 @@ init_pygraph_submodule(py::module_& m) { Args: loss (cudnn_tensor): The loss tensor. filter (cudnn_tensor): The filter tensor. - padding (Optional[List[int]]): The padding values for the operation. Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[pycudnn.data_type]): The data type for computation. Default is NOT_SET. @@ -505,7 +593,13 @@ init_pygraph_submodule(py::module_& m) { py::arg("policy") = cudnn_frontend::BuildPlanPolicy_t::HEURISTICS_CHOICE) .def("build", &PyGraph::build) .def("get_workspace_size", &PyGraph::get_workspace_size) - .def("execute", &PyGraph::execute) + .def( + "execute", + static_cast, py::object>, py::object)>( + &PyGraph::execute)) + .def("execute", + static_cast, py::object)>(&PyGraph::execute)) .def("__repr__", [](PyGraph const& pygraph) { std::stringstream ss; json j = pygraph.graph; diff --git a/python_bindings/pygraph/pygraph.h b/python_bindings/pygraph/pygraph.h index cd5dc6ea..3d2dd86e 100644 --- a/python_bindings/pygraph/pygraph.h +++ b/python_bindings/pygraph/pygraph.h @@ -71,6 +71,7 @@ class PyGraph { cudnn_frontend::DataType_t const& data_type, bool const& is_virtual, bool const& is_pass_by_value, + std::shared_ptr const& ragged_offset, std::string const& name); std::shared_ptr @@ -131,7 +132,8 @@ class PyGraph { std::shared_ptr conv_fprop(std::shared_ptr& image, std::shared_ptr& weight, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -140,7 +142,8 @@ class PyGraph { std::shared_ptr conv_dgrad(std::shared_ptr& loss, std::shared_ptr& filter, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -149,7 +152,8 @@ class PyGraph { std::shared_ptr conv_wgrad(std::shared_ptr& image, std::shared_ptr& loss, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector 
const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -306,6 +310,9 @@ class PyGraph { execute(std::unordered_map, py::object> var_pack, py::object workspace); + void + execute(std::unordered_map var_pack, py::object workspace); + void deselect_numeric_notes(std::vector const& notes) { graph.deselect_numeric_notes(notes); diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 990d8fbb..95fbaa77 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -21,6 +21,8 @@ add_executable( cpp/layernorm.cpp cpp/rmsnorm.cpp cpp/wgrads.cpp + cpp/serialization.cpp + cpp/pointwise.cpp legacy_samples/conv_sample.cpp legacy_samples/resnet_test_list.cpp diff --git a/samples/README.md b/samples/README.md index a5821258..71deae82 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,4 +1,10 @@ -This directory contains several samples for you to see how we envision using the CUDNN Frontend API. +# FE - Programming Samples +## Python Interface Samples +Samples leveraging FE's Python interface are located in [samples/python](/samples/python/). -For questions or to provide feedback, please contact cuDNN@nvidia.com. +## C++ Interface Samples +Samples leveraging FE's C++ interface are located in [samples/cpp](/samples/cpp/). + +## [Deprecated] C++ v0.x Interface Samples +Samples leveraging FE's C++ 0.x interface are located in [samples/legacy_samples](/samples/legacy_samples/). diff --git a/samples/cpp/convolutions.cpp b/samples/cpp/convolutions.cpp index 514da386..51fc5816 100644 --- a/samples/cpp/convolutions.cpp +++ b/samples/cpp/convolutions.cpp @@ -28,7 +28,11 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { namespace fe = cudnn_frontend; - int64_t n = 16, c = 128, h = 56, w = 56, k = 256, r = 3, s = 3; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by currend cudnn version"); + } + + int64_t n = 16, c = 128, h = 64, w = 64, k = 256, r = 1, s = 1; auto build_new_graph = [=](cudnnHandle_t handle) { auto graph = std::make_shared(); @@ -44,8 +48,10 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { .set_dim({k, c, r, s}) .set_stride({c * r * s, 1, c * s, c})); - auto conv_options = - fe::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}); + auto conv_options = fe::graph::Conv_fprop_attributes() + .set_padding({0, 0}) + .set_stride({1, 1}) + .set_dilation({1, 1}); auto Y = graph->conv_fprop(X, W, conv_options); Y->set_output(true); @@ -73,10 +79,13 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { Surface w_tensor(k * c * r * s, false); Surface y_tensor(n * k * h * w, false); // Should be p, q. 
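+        // The variant pack below is keyed by tensor UID (int64_t) rather than by the Tensor_attributes shared_ptr handles used in the other C++ samples; both forms of Graph::execute are exercised in this change.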
- std::unordered_map, void*> variant_pack = { - {X, x_tensor.devPtr}, {W, w_tensor.devPtr}, {Y, y_tensor.devPtr}}; + std::unordered_map variant_pack = { + {X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}}; Surface workspace(graph->get_workspace_size(), false); + + std::cout << *graph << std::endl; + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); } @@ -358,4 +367,4 @@ TEST_CASE("Conv with Int8 datatypes", "[conv][graph][caching]") { Surface workspace(graph->get_workspace_size(), false); REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/dgrads.cpp b/samples/cpp/dgrads.cpp index b3e66514..36a3654c 100644 --- a/samples/cpp/dgrads.cpp +++ b/samples/cpp/dgrads.cpp @@ -27,6 +27,9 @@ TEST_CASE("Convolution Dgrad", "[dgrad][graph]") { namespace fe = cudnn_frontend; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by current cudnn version"); + } fe::graph::Graph graph; graph.set_io_data_type(fe::DataType_t::HALF) .set_intermediate_data_type(fe::DataType_t::FLOAT) @@ -241,4 +244,4 @@ TEST_CASE("Dgrad Drelu DBNweight Graph", "[dgrad][graph]") { {drelu_output, drelu_output_tensor.devPtr}}; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/matmuls.cpp b/samples/cpp/matmuls.cpp index c0380b60..3c2cdf78 100644 --- a/samples/cpp/matmuls.cpp +++ b/samples/cpp/matmuls.cpp @@ -21,11 +21,17 @@ */ #include + +#include + #include "../utils/helpers.h" #include TEST_CASE("Matmul", "[matmul][graph]") { + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by current cudnn version"); + } namespace fe = cudnn_frontend; // matmul problem size @@ -69,6 +75,8 @@ TEST_CASE("Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good()); REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.check_support(handle).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); // Run cudnn graph @@ -80,6 +88,104 @@ TEST_CASE("Matmul", "[matmul][graph]") { checkCudnnErr(cudnnDestroy(handle)); } +TEST_CASE("Matmul fp8 precision", "[matmul][graph]") { + if (cudnnGetCudartVersion() < 12000) { + SKIP("Test requires cuda toolkit 12.0 or above"); + } + + if ((is_hopper_arch() && cudnnGetVersion() >= 90000) == false) { + SKIP("FP8 gemm not supported pre-Hopper or pre-cudnn-9.0.0"); + } + + namespace fe = cudnn_frontend; + // matmul problem size + int64_t const b = 16; + int64_t const m = 32; + int64_t const n = 64; + int64_t const k = 128; + + // Initialize input tensors with int8_t as proxy for fp8 + Surface A_gpu(b * m * k, false); + Surface B_gpu(b * k * n, false); + + Surface A_descale_gpu(1, false); + Surface B_descale_gpu(1, false); + + fe::graph::Graph graph{}; + + // Create the two non-virtual input tensors A and B. + // These are read from global memory.
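+    // FP8 recipe used below: A and B are declared as FP8_E4M3, the matmul accumulates in FLOAT, and the two pointwise MUL nodes apply the A/B descale scalars before the final BF16 output is written.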
+ auto A_attributes = fe::graph::Tensor_attributes() + .set_name("A") + .set_dim({b, m, k}) + .set_stride({m * k, k, 1}) + .set_data_type(fe::DataType_t::FP8_E4M3); + auto A = graph.tensor(A_attributes); + + auto B_attributes = fe::graph::Tensor_attributes() + .set_name("B") + .set_dim({b, k, n}) + .set_stride({k * n, 1, k}) + .set_data_type(fe::DataType_t::FP8_E4M3); + auto B = graph.tensor(B_attributes); + + auto A_descale_attributes = + fe::graph::Tensor_attributes().set_name("A").set_dim({1, 1, 1}).set_stride({1, 1, 1}).set_data_type( + fe::DataType_t::FLOAT); + auto B_descale_attributes = + fe::graph::Tensor_attributes().set_name("B").set_dim({1, 1, 1}).set_stride({1, 1, 1}).set_data_type( + fe::DataType_t::FLOAT); + + auto A_descale = graph.tensor(A_descale_attributes); + auto B_descale = graph.tensor(B_descale_attributes); + + auto matmul_attributes = + fe::graph::Matmul_attributes().set_name("GEMM").set_compute_data_type(fe::DataType_t::FLOAT); + auto C = graph.matmul(A, B, matmul_attributes); + C->set_data_type(fe::DataType_t::FLOAT); + + // Add scale_A operation + auto pw_0_attributes = fe::graph::Pointwise_attributes() + .set_name("pw0_Mul") + .set_mode(fe::PointwiseMode_t::MUL) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto C_after_pw_0 = graph.pointwise(C, A_descale, pw_0_attributes); + C_after_pw_0->set_data_type(fe::DataType_t::FLOAT); + + // Add descale_B operation + auto pw_1_attributes = fe::graph::Pointwise_attributes() + .set_name("pw1_Mul") + .set_mode(fe::PointwiseMode_t::MUL) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto C_after_pw_1 = graph.pointwise(C_after_pw_0, B_descale, pw_1_attributes); + C_after_pw_1->set_output(true).set_data_type(fe::DataType_t::BFLOAT16); + + REQUIRE(graph.validate().is_good()); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + + REQUIRE(graph.check_support(handle).is_good()); + + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + + Surface C_gpu(b * m * n, false); + Surface workspace(graph.get_workspace_size(), false); + std::unordered_map, void*> variant_pack = { + {A, A_gpu.devPtr}, + {B, B_gpu.devPtr}, + {C_after_pw_1, C_gpu.devPtr}, + {A_descale, A_descale_gpu.devPtr}, + {B_descale, B_descale_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + checkCudnnErr(cudnnDestroy(handle)); +} + TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { if (cudnnGetCudartVersion() < 12000) { SKIP("Test requires cuda toolkit 12.0 or above"); @@ -140,7 +246,7 @@ TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { if (is_hopper_arch() && cudnnGetVersion() >= 8906) { REQUIRE(graph.check_support(handle).is_good()); } else { - SKIP("int8_bf16 mixe precision gemm not supported pre-Hopper or pre-cudnn-8.9.6"); + SKIP("int8_bf16 mixed precision gemm not supported pre-Hopper or pre-cudnn-8.9.6"); } REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); @@ -151,6 +257,93 @@ TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { Surface workspace(graph.get_workspace_size(), false); std::unordered_map, void*> variant_pack = { {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C, C_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + 
checkCudnnErr(cudnnDestroy(handle)); +} + +TEST_CASE("Int8 Matmul", "[matmul][graph]") { + if (cudnnGetCudartVersion() < 12000) { + SKIP("Test requires cuda toolkit 12.0 or above"); + } + namespace fe = cudnn_frontend; + + // matmul problem size + int64_t const b = 16; + int64_t const m = 32; + int64_t const n = 64; + int64_t const k = 128; + + // Initialize input tensors + Surface A_gpu(b * m * k, false); + // note: these are int8 tensors; the surface element type is used just for memory allocation + Surface B_gpu(b * k * n, false); + + // Make cudnn graph + fe::graph::Graph graph{}; + + // Create the two non-virtual input tensors A and B. + // These are read from global memory. + auto A_attributes = fe::graph::Tensor_attributes() + .set_name("A") + .set_dim({b, m, k}) + .set_stride({m * k, k, 1}) + .set_data_type(fe::DataType_t::INT8); + auto A = graph.tensor(A_attributes); + auto B_attributes = fe::graph::Tensor_attributes() + .set_name("B") + .set_dim({b, k, n}) + .set_stride({k * n, 1, n}) + .set_data_type(fe::DataType_t::INT8); + auto B = graph.tensor(B_attributes); + + auto Bias_attributes = cudnn_frontend::graph::Tensor_attributes() + .set_name("Bias") + .set_dim({b, m, n}) + .set_data_type(cudnn_frontend::DataType_t::FLOAT) + .set_stride({m * n, n, 1}); + auto Bias = graph.tensor(Bias_attributes); + + // Add MATMUL operation + auto matmul_attributes = cudnn_frontend::graph::Matmul_attributes() + .set_compute_data_type(cudnn_frontend::DataType_t::INT32) + .set_name("GEMM"); + auto C = graph.matmul(A, B, matmul_attributes); + C->set_data_type(cudnn_frontend::DataType_t::FLOAT); + + // Add ADD operation + auto add_attributes = cudnn_frontend::graph::Pointwise_attributes() + .set_name("pw1_add") + .set_mode(cudnn_frontend::PointwiseMode_t::ADD) + .set_compute_data_type(cudnn_frontend::DataType_t::FLOAT); + auto C_after_add = graph.pointwise(C, Bias, add_attributes); + C_after_add->set_output(true).set_data_type(cudnn_frontend::DataType_t::FLOAT); + REQUIRE(graph.validate().is_good()); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + + if (check_device_arch_newer_than("ampere") && cudnnGetVersion() >= 8906) { + REQUIRE(graph.check_support(handle).is_good()); + } else { + SKIP("int8 gemm not supported pre-Ampere or pre-cudnn-8.9.6"); + } + + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + + // Run cudnn graph + // note: the surface element types here are used just for memory allocation + Surface C_gpu(b * m * n, false); + Surface Bias_gpu(b * m * n, false); + Surface workspace(graph.get_workspace_size(), false); + std::unordered_map, void*> variant_pack = { + {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C_after_add, C_gpu.devPtr}, {Bias, Bias_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); checkCudnnErr(cudnnDestroy(handle)); } @@ -207,6 +400,8 @@ TEST_CASE("Abs + Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good()); REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.check_support(handle).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); // Run cudnn graph @@ -286,14 +481,44 @@ TEST_CASE("Bias + Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good());
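+    // The block that follows exercises the per-index plan APIs: get_execution_plan_count() reports how many candidate plans heuristics returned, build_plan_at_index() builds one candidate at a time, and execute_plan_at_index() / get_workspace_size_plan_at_index() run a specific candidate. Candidates that fail to build are expected to fail at execution as well.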
REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); - REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + int64_t plan_count = graph.get_execution_plan_count(); + + std::vector successful_plans; + std::vector unsuccessful_plans; + for (int64_t plan_index = 0; plan_index < plan_count; plan_index++) { + bool did_build_successfully = graph.build_plan_at_index(handle, plan_index).is_good(); + if (did_build_successfully) { + successful_plans.push_back(plan_index); + } else { + unsuccessful_plans.push_back(plan_index); + } + } // Run cudnn graph Surface C_gpu(b * m * n, false); - Surface workspace(graph.get_workspace_size(), false); std::unordered_map, void*> variant_pack = { {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C, C_gpu.devPtr}, {Bias, Bias_gpu.devPtr}}; - REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + + // Run an unsuccessful plan and expect an error + std::vector random_unsuccessful; + std::sample(unsuccessful_plans.begin(), + unsuccessful_plans.end(), + std::back_inserter(random_unsuccessful), + 1, + std::mt19937{std::random_device{}()}); + if (random_unsuccessful.size()) { + REQUIRE(graph.execute_plan_at_index(handle, variant_pack, nullptr, random_unsuccessful.front()).is_bad()); + } + + // Run a successful plan and expect success + std::vector random_successful; + std::sample(successful_plans.begin(), + successful_plans.end(), + std::back_inserter(random_successful), + 1, + std::mt19937{std::random_device{}()}); + Surface workspace(graph.get_workspace_size_plan_at_index(random_successful.front()), false); + REQUIRE(graph.execute_plan_at_index(handle, variant_pack, workspace.devPtr, random_successful.front()).is_good()); checkCudnnErr(cudnnDestroy(handle)); } @@ -398,4 +623,4 @@ TEST_CASE("Matmul SBR Graph", "[matmul][graph]") { {O, y_tensor.devPtr}}; REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/pointwise.cpp b/samples/cpp/pointwise.cpp new file mode 100644 index 00000000..8137bf5a --- /dev/null +++ b/samples/cpp/pointwise.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE.
+ */ + +#include +#include "../utils/helpers.h" + +#include + +TEST_CASE("Reduction", "[reduction]") { + namespace fe = cudnn_frontend; + constexpr int n = 64; + if (cudnnGetVersion() < 8600) { + SKIP("TEST REQUIRES minimum cudnn version 8.6.0"); + } + Surface A_gpu(n * n * n * n, false); + fe::graph::Graph graph{}; + auto A = graph.tensor(fe::graph::Tensor_attributes() + .set_dim({n, n, n, n}) + .set_stride({n * n * n, 1, n * n, n}) + .set_data_type(fe::DataType_t::FLOAT)); + auto C = graph.reduction(A, + fe::graph::Reduction_attributes() + .set_mode(fe::ReductionMode_t::MAX) + .set_compute_data_type(fe::DataType_t::FLOAT)); + C->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({1, 1, 1, 1}); + REQUIRE(graph.validate().is_good()); + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + Surface C_gpu(n * n * n * n, false); + std::unordered_map, void*> variant_pack = {{A, A_gpu.devPtr}, + {C, C_gpu.devPtr}}; + Surface workspace(graph.get_workspace_size(), false); + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + checkCudnnErr(cudnnDestroy(handle)); +} diff --git a/samples/cpp/serialization.cpp b/samples/cpp/serialization.cpp new file mode 100644 index 00000000..32651382 --- /dev/null +++ b/samples/cpp/serialization.cpp @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include "../utils/helpers.h" + +#include + +TEST_CASE("CSBR Graph with serialization", "[conv][graph][serialization]") { + enum UIDs { + x_tensor, + w_tensor, + y_tensor, + scale_tensor, + bias_tensor, + }; + +#if (CUDNN_VERSION < 8905) + SKIP("Serialization tests is not supported in cudnn versions prior to 8.9.5"); +#endif + + int64_t n = 8, c = 32, h = 16, w = 16, k = 64, r = 3, s = 3; + + cudnnHandle_t handle; // Handle to use during deserialize and execute + + checkCudnnErr(cudnnCreate(&handle)); + + auto build_and_validate_graph_helper = + [](int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) + -> std::shared_ptr { + auto graph = std::make_shared(); + graph->set_io_data_type(cudnn_frontend::DataType_t::HALF) + .set_intermediate_data_type(cudnn_frontend::DataType_t::FLOAT) + .set_compute_data_type(cudnn_frontend::DataType_t::FLOAT); + + auto X = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_name("image") + .set_uid(x_tensor) + .set_dim({n, c, h, w}) + .set_stride({c * h * w, 1, c * w, c})); + + auto W = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_uid(w_tensor) + .set_name("filter") + .set_dim({k, c, r, s}) + .set_stride({c * r * s, 1, c * s, c})); + + auto conv_options = + cudnn_frontend::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}); + auto conv_output = graph->conv_fprop(X, W, conv_options); + + auto S = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_uid(scale_tensor) + .set_name("scale") + .set_dim({1, k, 1, 1}) + .set_stride({k, 1, k, k})); + auto scale_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::MUL); + auto scale_output = graph->pointwise(conv_output, S, scale_options); + + auto B = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_name("bias") + .set_uid(bias_tensor) + .set_dim({1, k, 1, 1}) + .set_stride({k, 1, k, k})); + auto bias_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::ADD); + auto bias_output = graph->pointwise(scale_output, B, bias_options); + + auto relu_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::RELU_FWD); + auto Y = graph->pointwise(bias_output, relu_options); + Y->set_output(true).set_uid(y_tensor); + + REQUIRE(graph->validate().is_good()); + + return graph; + }; + + // Check support + + auto check_support = [build_and_validate_graph_helper]( + int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) -> bool { + cudnnHandle_t handle; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper(n, c, h, w, k, r, s); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + cudnnDestroy(handle); + + return true; + }; + + // Serialization Phase + + auto serialize = + [build_and_validate_graph_helper]( + int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) -> std::vector { + cudnnHandle_t handle; + + std::vector serialized_data; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper(n, c, h, w, k, r, s); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + 
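+        // The serialize lambda finishes building the execution plans on a locally created handle and then captures the built graph into a byte vector; a different handle is used later to deserialize and execute it.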
REQUIRE(graph->check_support(handle).is_good()); + + REQUIRE(graph->build_plans(handle).is_good()); + + // Insert auto-tuning logic here + + REQUIRE(graph->serialize(serialized_data).is_good()); + + cudnnDestroy(handle); + + return serialized_data; + }; + + auto deserialize = [](cudnnHandle_t handle, + std::vector const& data) -> std::shared_ptr { + auto graph = std::make_shared(); + + REQUIRE(graph->deserialize(handle, data).is_good()); + + return graph; + }; + + // Check if the graph is supported + REQUIRE(check_support(n, c, h, w, k, r, s)); + + // Serialize the graph. + auto serialize_data = serialize(n, c, h, w, k, r, s); + + // Deserialize the graph and execute + auto graph = deserialize(handle, serialize_data); + + Surface x_device_memory(n * c * h * w, false); + Surface w_device_memory(k * c * r * s, false); + Surface s_device_memory(k, false); + Surface b_device_memory(k, false); + Surface y_device_memory(n * k * h * w, false); // Should be p, q. + + Surface workspace(graph->get_workspace_size(), false); + + std::unordered_map variant_pack = {{x_tensor, x_device_memory.devPtr}, + {w_tensor, w_device_memory.devPtr}, + {scale_tensor, s_device_memory.devPtr}, + {bias_tensor, b_device_memory.devPtr}, + {y_tensor, y_device_memory.devPtr}}; + + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); + + cudnnDestroy(handle); +} + +TEST_CASE("SDPA Graph with serialization", "[sdpa][graph][serialization]") { + int64_t b = 12; // batch size + int64_t h = 6; // number of heads + int64_t s_q = 1024; // q tensor is padded to this seq length + int64_t s_kv = 1024; // k and v tensors are padded to this seq length + int64_t d = 128; // hidden dim + +#if (CUDNN_VERSION < 8905) + SKIP("Serialization tests are not supported in cudnn versions prior to 8.9.5"); +#endif + + // Mode of sdpa operation + bool is_inference = true; + + // attention scale + bool is_attn_scale = true; + float attn_scale_cpu = 0.5f; + + // Dropout configuration + bool use_dropout_with_rng = true; + float dropout_probability = 0.1f; + + enum UIDs { uid_Q, uid_K, uid_V, uid_ATTN_SCALE, uid_SEED, uid_OFFSET, uid_O, uid_STATS }; + + auto build_and_validate_graph_helper = + [](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> std::shared_ptr { + namespace fe = cudnn_frontend; + + auto graph = std::make_shared(); + + graph->set_io_data_type(fe::DataType_t::HALF) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + auto Q = graph->tensor(fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim({b, h, s_q, d}) + .set_uid(uid_Q) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes() + .set_name("K") + .set_uid(uid_K) + .set_dim({b, h, s_kv, d}) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes() + .set_name("V") + .set_uid(uid_V) + .set_dim({b, h, s_kv, d}) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + + auto attn_scale = is_attn_scale ?
graph->tensor(fe::graph::Tensor_attributes() + .set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_uid(uid_ATTN_SCALE) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)) + : nullptr; + + auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention").set_is_inference(is_inference); + + sdpa_options.set_causal_mask(true); + sdpa_options.set_alibi_mask(true); + + if (is_attn_scale) { + sdpa_options.set_attn_scale(attn_scale); + }; + + auto seed = use_dropout_with_rng ? graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_uid(uid_SEED) + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)) + : nullptr; + + auto offset = use_dropout_with_rng ? graph->tensor(fe::graph::Tensor_attributes() + .set_uid(uid_OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)) + : nullptr; + + if (use_dropout_with_rng) { + sdpa_options.set_dropout(dropout_probability, seed, offset); + } + + auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); + + O->set_output(true).set_dim({b, h, s_q, d}).set_uid(uid_O).set_stride({h * d, d, b * h * d, 1}); + + // Check that Stats tensor is real, which is only when its training step + if (is_inference) { + REQUIRE(stats == nullptr); + } else { + stats->set_output(true).set_uid(uid_STATS).set_data_type(fe::DataType_t::FLOAT); + } + + REQUIRE(graph->validate().is_good()); + + return graph; + }; + + auto check_support = [build_and_validate_graph_helper](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> bool { + cudnnHandle_t handle; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper( + b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + cudnnDestroy(handle); + + return true; + }; + + auto serialize = [build_and_validate_graph_helper](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> std::vector { + cudnnHandle_t handle; + + std::vector serialized_data; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper( + b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + REQUIRE(graph->build_plans(handle).is_good()); + + // Insert auto-tuning logic here + + REQUIRE(graph->serialize(serialized_data).is_good()); + + cudnnDestroy(handle); + + return serialized_data; + }; + + auto deserialize = [](cudnnHandle_t handle, + std::vector const& data) -> std::shared_ptr { + auto graph = std::make_shared(); + + REQUIRE(graph->deserialize(handle, data).is_good()); + + return graph; + }; + + // Check support + REQUIRE(check_support(b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability)); + + // Serialize the graph. 
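+    // The deserialized graph is executed further below with a variant pack keyed by the UID enum values assigned to the tensors above.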
+ auto serialize_data = + serialize(b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = deserialize(handle, serialize_data); + + //// Build variant pack + Surface qkvTensor(b * s_q * 3 * h * d, false); + Surface oTensor(b * s_q * h * d, false); + void* devPtrQ = qkvTensor.devPtr; + void* devPtrK = (qkvTensor.devPtr + d); + void* devPtrV = (qkvTensor.devPtr + 2 * d); + void* devPtrO = oTensor.devPtr; + + int32_t scaleSize = 1; + int32_t seed_value = 123456; + Surface dropoutSeed(scaleSize, false, seed_value); + Surface dropoutOffset(scaleSize, false, (int32_t)1); + + Surface workspace(graph->get_workspace_size(), false); + + std::cout << "Graph requires workspace " << graph->get_workspace_size() << std::endl; + + std::unordered_map variant_pack = {{uid_Q, devPtrQ}, + {uid_K, devPtrK}, + {uid_V, devPtrV}, + {uid_ATTN_SCALE, &attn_scale_cpu}, + {uid_SEED, dropoutSeed.devPtr}, + {uid_OFFSET, dropoutOffset.devPtr}, + {uid_O, devPtrO}}; + + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); + + checkCudnnErr(cudnnDestroy(handle)); +} \ No newline at end of file diff --git a/samples/cpp/wgrads.cpp b/samples/cpp/wgrads.cpp index e2bb4e64..dfcec459 100644 --- a/samples/cpp/wgrads.cpp +++ b/samples/cpp/wgrads.cpp @@ -27,6 +27,9 @@ TEST_CASE("Convolution Wgrad", "[wgrad][graph][wgrad][Conv_wgrad]") { namespace fe = cudnn_frontend; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by currend cudnn version"); + } fe::graph::Graph graph; graph.set_io_data_type(fe::DataType_t::HALF) .set_intermediate_data_type(fe::DataType_t::HALF) @@ -135,4 +138,4 @@ TEST_CASE("Wgrad Graph", "[wgrad][graph][scale-bias-relu-wgrad][ConvBNwgrad]") { {DW, dw_tensor.devPtr}}; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/legacy_samples/helpers.cpp b/samples/legacy_samples/helpers.cpp index 2cfcd162..ac0abc4f 100644 --- a/samples/legacy_samples/helpers.cpp +++ b/samples/legacy_samples/helpers.cpp @@ -47,6 +47,14 @@ is_hopper_arch() { return (90 <= cc); } +bool +is_arch_supported_by_cudnn() { + if (cudnnGetVersion() < 8600 && (is_hopper_arch() || is_ada_arch())) { + return false; + } + return true; +} + bool check_device_arch_newer_than(std::string const& arch) { size_t arch_major = 6; diff --git a/samples/legacy_samples/norm_samples.h b/samples/legacy_samples/norm_samples.h index ee7c9ba8..480c3aa7 100644 --- a/samples/legacy_samples/norm_samples.h +++ b/samples/legacy_samples/norm_samples.h @@ -33,7 +33,6 @@ #include #include -#include #include /** diff --git a/samples/utils/error_util.h b/samples/utils/error_util.h index 3980fea7..c8abd199 100644 --- a/samples/utils/error_util.h +++ b/samples/utils/error_util.h @@ -23,11 +23,14 @@ #if !defined(_ERROR_UTIL_H_) #define _ERROR_UTIL_H_ +#include #include #include #include #include +#include + #define TOSTR_(s) #s #define TOSTR(s) TOSTR_(s) #if defined(__GNUC__) @@ -100,6 +103,24 @@ } \ } +namespace cudnn_frontend { +static inline void +throw_if(std::function expr, [[maybe_unused]] const char *message, [[maybe_unused]] cudnnStatus_t status) { + if (expr()) { +#ifndef NV_CUDNN_DISABLE_EXCEPTION + throw cudnn_frontend::cudnnException(message, status); +#endif + } +} +static inline void +throw_if(bool expr, [[maybe_unused]] const char *message, [[maybe_unused]] cudnnStatus_t status) { + if (expr) { 
+#ifndef NV_CUDNN_DISABLE_EXCEPTION + throw cudnn_frontend::cudnnException(message, status); +#endif + } +} +} // namespace cudnn_frontend // CUDA Utility Helper Functions static void diff --git a/samples/utils/helpers.h b/samples/utils/helpers.h index 3583f3f0..90badc14 100644 --- a/samples/utils/helpers.h +++ b/samples/utils/helpers.h @@ -63,6 +63,8 @@ bool is_hopper_arch(); bool check_device_arch_newer_than(std::string const& arch); +bool +is_arch_supported_by_cudnn(); int64_t getFwdConvDilatedFilterDim(int64_t filterDim, int64_t dilation); diff --git a/setup.py b/setup.py index 4a845eae..335e35c8 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ def build_extension(self, ext: CMakeExtension) -> None: # logic and declaration, and simpler if you include description/version in a file. setup( name="cudnn", - version="1.0.3", + version="1.1.0", author="", author_email="", description="cudnn_frontend python package", diff --git a/samples/python/test_apply_rope.py b/test/python_fe/test_apply_rope.py similarity index 100% rename from samples/python/test_apply_rope.py rename to test/python_fe/test_apply_rope.py diff --git a/samples/python/test_batchnorm.py b/test/python_fe/test_batchnorm.py similarity index 100% rename from samples/python/test_batchnorm.py rename to test/python_fe/test_batchnorm.py diff --git a/samples/python/test_conv_bias.py b/test/python_fe/test_conv_bias.py similarity index 95% rename from samples/python/test_conv_bias.py rename to test/python_fe/test_conv_bias.py index 454e5fc9..98a1c920 100644 --- a/samples/python/test_conv_bias.py +++ b/test/python_fe/test_conv_bias.py @@ -1,4 +1,5 @@ import cudnn +import pytest import torch def convert_to_cudnn_type(torch_type): @@ -17,12 +18,14 @@ def forward(self, x, w, b = None, padding = [1,1], stride = [1,1], dilation = [1 return torch.nn.functional.relu(conv_output) def test_conv_bias_relu(): + torch.manual_seed(0) + # Reference code X_gpu = torch.randn(4, 16, 56, 56, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) W_gpu = torch.randn(16, 16, 3, 3, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) B_gpu = torch.randn(1, 16, 1, 1, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) - padding = [0,1] - stride = [2,3] + padding = [1,1] + stride = [3,3] dilation = [1,1] model = CSBR().eval().to("cuda").to(torch.float16) Y_expected = model(X_gpu, W_gpu, b = B_gpu, padding = padding, stride = stride, dilation = dilation) @@ -37,7 +40,7 @@ def test_conv_bias_relu(): W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) B = graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = convert_to_cudnn_type(B_gpu.dtype)) - conv_output = graph.conv_fprop(image = X, weight = W, padding = padding, stride = stride, dilation = dilation) + conv_output = graph.conv_fprop(image = X, weight = W, pre_padding = padding, post_padding = padding, stride = stride, dilation = dilation) bias_output = graph.bias(name = "bias", input = conv_output, bias = B) @@ -55,7 +58,7 @@ def test_conv_bias_relu(): Y_actual = torch.zeros_like(Y_expected) graph.execute({X: X_gpu, W: W_gpu, B: B_gpu, Y: Y_actual}, workspace) - torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(Y_expected, Y_actual, atol=0.05, rtol=1e-2) cudnn.destroy_handle(handle) @@ -171,6 +174,7 @@ def dleaky_relu(grad: 
torch.Tensor, mask: torch.Tensor, negative_slope: float): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-4, rtol=1e-4) +@pytest.mark.skipif(cudnn.backend_version() < 8600, reason="requires cudnn 8.6.0 or higher") def test_conv_int8(): N, C, H, W = 1, 64, 32, 32 K, R, S = 4, 3, 3 @@ -215,8 +219,8 @@ def test_conv_int8(): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) if __name__ == "__main__": - # test_conv_int8() - # test_conv_relu() + test_conv_int8() + test_conv_relu() test_conv_bias_relu() - # test_conv3d_bias_leaky_relu() - # test_leaky_relu_backward() \ No newline at end of file + test_conv3d_bias_leaky_relu() + test_leaky_relu_backward() diff --git a/samples/python/test_conv_genstats.py b/test/python_fe/test_conv_genstats.py similarity index 100% rename from samples/python/test_conv_genstats.py rename to test/python_fe/test_conv_genstats.py diff --git a/samples/python/test_conv_reduction.py b/test/python_fe/test_conv_reduction.py similarity index 100% rename from samples/python/test_conv_reduction.py rename to test/python_fe/test_conv_reduction.py diff --git a/samples/python/test_instancenorm.py b/test/python_fe/test_instancenorm.py similarity index 100% rename from samples/python/test_instancenorm.py rename to test/python_fe/test_instancenorm.py diff --git a/samples/python/test_layernorm.py b/test/python_fe/test_layernorm.py similarity index 95% rename from samples/python/test_layernorm.py rename to test/python_fe/test_layernorm.py index ffe96c7f..e6d4887f 100644 --- a/samples/python/test_layernorm.py +++ b/test/python_fe/test_layernorm.py @@ -28,7 +28,7 @@ def param_extract(request): return request.param @pytest.mark.skipif(cudnn.backend_version() < 8905, reason="LN not supported below cudnn 8.9.5") -def test_in(param_extract): +def test_layernorm(param_extract): torch.manual_seed(0) embedding_dim, input_type = param_extract @@ -48,12 +48,10 @@ def test_in(param_extract): bias_gpu = 7*torch.randn(1, C, H, W, requires_grad=True, device="cuda", dtype=input_type).to(memory_format=torch.channels_last) -2 epsilon_cpu = torch.full((1, 1, 1, 1), epsilon_value, requires_grad=False, device="cpu", dtype=torch.float32) - print("Running reference") Y_expected = torch.nn.functional.layer_norm(x_gpu, [C, H, W], weight=scale_gpu.squeeze(0), bias=bias_gpu.squeeze(0), eps=epsilon_value) mean_expected = x_gpu.to(torch.float32).mean(dim=(1, 2, 3), keepdim=True) inv_var_expected = torch.rsqrt(torch.var(x_gpu.to(torch.float32), dim=(1, 2, 3), keepdim=True) + epsilon_value) - print("Building cudnn graph") graph = cudnn.pygraph(intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) @@ -84,7 +82,6 @@ def test_in(param_extract): inv_var_actual = torch.empty_like(inv_var_expected) workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) - print("Executing cudnn graph") graph.execute({ X : x_gpu.detach() @@ -96,11 +93,9 @@ def test_in(param_extract): , inv_var: inv_var_actual }, workspace) - print("Comparing with reference") torch.testing.assert_close(Y_expected, Y_actual, atol=atol, rtol=rtol) torch.testing.assert_close(mean_expected, mean_actual, atol=atol, rtol=rtol) torch.testing.assert_close(inv_var_expected, inv_var_actual, atol=atol, rtol=rtol) - print("Success!!") target = torch.randn_like(Y_expected) criterion = torch.nn.MSELoss() @@ -143,7 +138,6 @@ def test_in(param_extract): Dbias_actual = torch.empty_like(bias_gpu) workspace = torch.empty(bwd_graph.get_workspace_size(), device="cuda", 
dtype=torch.uint8) - print("Executing cudnn bwd_graph") bwd_graph.execute({ X_bwd : x_gpu.detach() @@ -156,11 +150,9 @@ def test_in(param_extract): , Dbias: Dbias_actual }, workspace) - print("Comparing with reference") torch.testing.assert_close(x_gpu.grad, DX_actual, atol=2e-4, rtol=2e-4) torch.testing.assert_close(scale_gpu.grad, DScale_actual, atol=2e-4, rtol=2e-4) torch.testing.assert_close(bias_gpu.grad, Dbias_actual, atol=2e-4, rtol=2e-4) - print("Success!!") if __name__ == "__main__": - test_in((1600, torch.bfloat16)) \ No newline at end of file + test_layernorm((1600, torch.bfloat16)) \ No newline at end of file diff --git a/samples/python/test_matmul_bias_relu.py b/test/python_fe/test_matmul_bias_relu.py similarity index 74% rename from samples/python/test_matmul_bias_relu.py rename to test/python_fe/test_matmul_bias_relu.py index 1f56cc4e..745d014a 100644 --- a/samples/python/test_matmul_bias_relu.py +++ b/test/python_fe/test_matmul_bias_relu.py @@ -21,7 +21,52 @@ def convert_to_cudnn_type(torch_type): def get_cc(): (major, minor) = torch.cuda.get_device_capability() - return major*10 + minor + return major*10 + minor + +def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): + assert expected.shape == actual.shape + + expected = expected.float().cuda().flatten() + actual = actual.float().cuda().flatten() + + n_elem = torch.numel(expected) + + mae = (expected - actual).abs().mean().item() + perr = ((expected - actual).abs().sum() / expected.abs().sum()).item() + snr = (expected**2).mean().sqrt() / ((expected - actual) ** 2).mean().sqrt() + snr_db = (10 * torch.log10(snr)).item() + + absolute_error = (expected - actual).abs() + relative_error = absolute_error / torch.where(expected.abs() < fudge, fudge, expected.abs()) + + abs_error_indices = absolute_error > atol + rel_error_indices = relative_error > rtol + n_abs_errors = torch.sum(abs_error_indices) + n_rel_errors = torch.sum(rel_error_indices) + error_indices = torch.logical_and(abs_error_indices, rel_error_indices) + n_errors = torch.sum(error_indices) + + n_nans = torch.isnan(actual).sum() + n_zeros = n_elem - torch.count_nonzero(actual) + + if n_errors != 0: + print(f"========== Comparison for {name} ==========") + print(f"Absolute Tolerance = {atol}") + print(f"Relative Tolerance = {rtol}") + print(f"Number of elements = {n_elem}") + print(f"Number of absolute errors = {n_abs_errors} ({n_abs_errors * 100 / n_elem:.2f}%)") + print(f"Number of relative errors = {n_rel_errors} ({n_rel_errors * 100 / n_elem:.2f}%)") + print(f"Number of errors (absolute and relative) = {n_errors} ({(n_errors * 100)/n_elem:.2f}%)") + print(f"Maximum absolute error = {absolute_error.max():.4f}") + print(f"Maximum relative error = {relative_error.max():.4f}") + print(f"Mean average error = {mae:.4f}") + print(f"Perr error = {perr:.4f} = 1/{(1/perr) if perr != 0 else float('inf'):.2f}") + print(f"Signal to noise ratio = {snr.item():.2f} = {snr_db:.2f}dB") + print(f"Number of Nans = {n_nans} ({n_nans * 100 / n_elem:.2f}%)") + print(f"Number of Zeros = {n_zeros} ({n_zeros * 100 / n_elem:.2f}%)") + print("===================================\n") + + return n_errors @pytest.mark.skipif(cudnn.backend_version() < 8906, reason="requires cudnn 8.9.6 or higher") @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 9, reason="requires Hopper or newer arch") @@ -82,10 +127,12 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): A_gpu = torch.randint(4, (B, M, K), requires_grad=False, device="cuda", 
dtype=A_data_type) - 1 if B_data_type != torch.int8: - B_gpu = 3 * torch.randn(B, K, N, requires_grad=False, device="cuda", dtype=B_data_type) - 1.25 + B_gpu_strided = 3 * torch.randn(B, K, N, requires_grad=False, device="cuda", dtype=B_data_type) - 1.25 else: - B_gpu = torch.randint(3, (B, K, N), requires_grad=False, device="cuda", dtype=B_data_type) - 2 - + B_gpu_strided = torch.randint(3, (B, K, N), requires_grad=False, device="cuda", dtype=B_data_type).contiguous() - 2 + + B_gpu = torch.as_strided(B_gpu_strided, (B, K, N), (N*K, 1, N)) + # Make cudnn graph graph = cudnn.pygraph() @@ -123,7 +170,7 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): graph.execute({A: A_gpu, B: B_gpu, C: C_actual}, workspace) # compare'em - torch.testing.assert_close(C_expected, C_actual) + compare_tensors(C_expected, C_actual, "output", atol=1e-4, rtol=1e-4) problem_size_options = [(1, 128, 768) , (16, 512, 1600) diff --git a/samples/python/test_mhas.py b/test/python_fe/test_mhas.py similarity index 69% rename from samples/python/test_mhas.py rename to test/python_fe/test_mhas.py index 76b4598a..5e6f7a6e 100644 --- a/samples/python/test_mhas.py +++ b/test/python_fe/test_mhas.py @@ -69,28 +69,6 @@ def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): return n_errors + n_nans -def get_alibi_slopes(n_heads, device="cuda"): - # Get the closest power of 2 to `n_heads`. - # If `n_heads` is not a power of 2, then we first calculate slopes to the closest (smaller) power of 2, - # and then add the remaining slopes. - n = 2 ** math.floor(math.log2(n_heads)) - m_0 = 2.0 ** (-8.0 / n) - m = torch.pow(m_0, torch.arange(1, 1 + n)) - - # If `n_heads` is not a power of 2, then we add the remaining slopes. - # We calculate the remaining slopes for $n * 2$ (avoiding slopes added previously). - # And pick the slopes upto `n_heads`. - if n < n_heads: - m_hat_0 = 2.0 ** (-4.0 / n) - m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (n_heads - n), 2)) - # Concatenate the slopes with the remaining slopes. - m = torch.cat([m, m_hat]) - - # Reshape the tensor to [1, num_heads, 1, 1] - m = m.view(1, -1, 1, 1).to(device=device) - return m - - def compute_ref( q, k, @@ -163,7 +141,27 @@ def compute_ref( index_row = torch.arange(s_q, dtype=torch.float32, device=device).view(-1, 1) index_col = torch.arange(s_kv, dtype=torch.float32, device=device) distance = index_col - index_row - alibi_mask = distance.to(dtype=torch.float32) * get_alibi_slopes(h_q, device=device) + + # Get the closest power of 2 to `n_heads`. + # If `n_heads` is not a power of 2, then we first calculate slopes to the closest (smaller) power of 2, + # and then add the remaining slopes. + n = 2 ** math.floor(math.log2(h_q)) + m_0 = 2.0 ** (-8.0 / n) + m = torch.pow(m_0, torch.arange(1, 1 + n)) + + # If `n_heads` is not a power of 2, then we add the remaining slopes. + # We calculate the remaining slopes for $n * 2$ (avoiding slopes added previously). + # And pick the slopes upto `n_heads`. + if n < h_q: + m_hat_0 = 2.0 ** (-4.0 / n) + m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (h_q - n), 2)) + # Concatenate the slopes with the remaining slopes. 
+ m = torch.cat([m, m_hat]) + + # Reshape the tensor to [1, num_heads, 1, 1] + m = m.view(1, -1, 1, 1).to(device=device) + + alibi_mask = distance.to(dtype=torch.float32) * m s = s + alibi_mask if padding is not None: s = s.masked_fill(s_mask, float("-inf")) @@ -202,6 +200,7 @@ def compute_ref( padding_mask_options = [False, True] causal_mask_options = [False, True] dropout_options = [False, True] +ragged_options = [False, True] is_infer_options = [False, True] all_options_forward = [ @@ -216,6 +215,7 @@ def compute_ref( padding_mask_options, causal_mask_options, dropout_options, + ragged_options, is_infer_options, ] ) @@ -233,6 +233,7 @@ def compute_ref( padding_mask_options, causal_mask_options, dropout_options, + ragged_options, ] ) ] @@ -308,15 +309,63 @@ def generate_layout(layout, head_group, shape_q, shape_k, shape_v, shape_o): return stride_q, stride_k, stride_v, stride_o, offset_q, offset_k, offset_v +def compute_exclusive_prefix_sum(tensor): + # tensor has shape (B, 1, 1, 1) + # output has shape (B+1, 1, 1, 1) + # ex) tensor = [[[[2, 4, 1, 6]]]] + # output = [[[[0, 2, 6, 7, 13]]]] + assert tensor.size(1) == tensor.size(2) == tensor.size(3) == 1 + return torch.cat((torch.zeros(1, 1, 1, 1, dtype=tensor.dtype, device=tensor.device), torch.cumsum(tensor, dim=0))) + + +def convert_ragged_to_uniform(ragged_tensor, ragged_offset): + # limitations: + # 1. tensor is non-interleaved with bhsd dim order and bshd stride order + # 2. ragged tensor is packed and in-order, therefore + # ragged offset is monotonically increasing + assert ragged_tensor.dim() == 4 + b, h, s, d = ragged_tensor.size() + b_stride, h_stride, s_stride, d_stride = ragged_tensor.stride() + assert b_stride >= s_stride >= h_stride >= d_stride + assert ragged_offset.dim() == 4 and (b + 1, 1, 1, 1) == ragged_offset.size() + + # ragged offset is given in 4D, convert to 1D locally + ragged_offset = ragged_offset.flatten() + + # convert bhsd to bshd and flatten + ragged_tensor_flat = torch.einsum("bhsd->bshd", ragged_tensor).flatten() + uniform_tensor_flat = torch.zeros_like(ragged_tensor_flat) + + # copy + for i, num_elements in enumerate(ragged_offset[1:] - ragged_offset[:-1]): + unif_a = i * s * h * d + unif_b = unif_a + num_elements + ragg_a = ragged_offset[i] + ragg_b = ragg_a + num_elements + uniform_tensor_flat[unif_a:unif_b] = ragged_tensor_flat[ragg_a:ragg_b] + + # unflatten and convert bshd to bhsd + uniform_tensor = uniform_tensor_flat.view(b, s, h, d) + uniform_tensor = torch.einsum("bshd->bhsd", uniform_tensor) + return uniform_tensor + + @pytest.fixture(params=all_options_forward) def param_extract_forward(request): return request.param -@pytest.mark.skipif(cudnn.backend_version() < 8903, reason="requires cudnn 8.9.3 or higher") -def test_sdpa(param_extract_forward): - ( - input_type, +@pytest.mark.parametrize("input_type", input_type_options) +@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("head_group", head_group_options) +@pytest.mark.parametrize("is_bias", bias_options) +@pytest.mark.parametrize("is_alibi", alibi_mask_options) +@pytest.mark.parametrize("is_padding", padding_mask_options) +@pytest.mark.parametrize("is_causal", causal_mask_options) +@pytest.mark.parametrize("is_dropout", dropout_options) +@pytest.mark.parametrize("is_ragged", ragged_options) +@pytest.mark.parametrize("is_infer", is_infer_options) +def test_sdpa(input_type, layout, head_group, is_bias, @@ -324,8 +373,10 @@ def test_sdpa(param_extract_forward): is_padding, is_causal, is_dropout, - is_infer, - ) = 
param_extract_forward + is_ragged, + is_infer): + if cudnn.backend_version() < 8903: + pytest.skip("SDPA fprop requires cudnn 8.9.3 or higher") if head_group != "multi_head" and cudnn.backend_version() < 8907: pytest.skip("GQA and MQA is only supported 8.9.7 onwards.") @@ -339,16 +390,28 @@ def test_sdpa(param_extract_forward): if is_dropout and cudnn.backend_version() < 8906: pytest.skip("Dropout reference is only supported on 8.9.6 onwards.") + if is_ragged and cudnn.backend_version() < 90000: + pytest.skip("Ragged tensor is only supported 9.0.0 onwards") + + if is_ragged and torch.cuda.get_device_capability()[0] < 9: + pytest.skip("Ragged tensor is only supported hopper") + + if is_ragged and layout != "non_interleaved": + pytest.skip("Ragged tensor is only tested with non-interleaved bshd layout") + + if is_ragged and not is_padding: + pytest.skip("Ragged tensor is only tested with packed variable length tensors") + # batch size b = 2 # query sequence length - s_q = random.choice([256, 512, 1024, 2048]) + s_q = random.choice([8, 16, 24, 32, 256, 512, 1024, 2048]) # key+value sequence length s_kv = random.choice([8, 16, 24, 32, 256, 512, 1024, 2048]) if layout == "non_interleaved" else s_q # query+key embedding dimension per head d_qk = random.choice([32, 56, 64, 128]) # value embedding dimension per head - d_v = random.choice([64, 96, 128]) if layout == "non_interleaved" else d_qk + d_v = random.choice([64, 96, 128]) if (layout == "non_interleaved" and not is_ragged) else d_qk # number of heads h_q = 6 if head_group == "multi_head": @@ -366,12 +429,23 @@ def test_sdpa(param_extract_forward): if d_qk != d_v and cudnn.backend_version() < 8906: pytest.skip("d_qk != d_v is only supported on 8.9.6 onwards.") - if is_dropout and (s_kv % 64 != 0) and cudnn.backend_version() < 90000: - pytest.skip("Dropout mask dump with not-multiple-of-64 seq_kv is not supported.") + if cudnn.backend_version() < 90000: + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and (is_padding or is_dropout): + pytest.skip("s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0") - if ((d_qk % 64 != 0) or (s_kv % 64 != 0)) and cudnn.backend_version() < 8906: + if cudnn.backend_version() < 8906: pytest.skip("d not a multiple of 64, not-multiple-of-64 seq_kv is not supported below 8.9.6") - + + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + # TODO file bug + if d_qk != d_v and is_ragged: + pytest.skip("d_qk != d_v is not supported with ragged offset") + print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") attn_scale = 0.125 @@ -409,6 +483,11 @@ def test_sdpa(param_extract_forward): rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None + q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None + k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None + v_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_v * d_v).int() if is_ragged else None + o_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_v).int() if is_ragged else None + o_gpu = torch.empty(b * h_q * s_q * d_v, dtype=input_type, device="cuda").as_strided(shape_o, stride_o) stats_gpu = torch.empty(b, h_q, s_q, 1, 
dtype=torch.float32, device="cuda") if not is_infer else None @@ -418,6 +497,7 @@ def test_sdpa(param_extract_forward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -434,6 +514,16 @@ def test_sdpa(param_extract_forward): rng_dump = graph.tensor_like(rng_dump_gpu) if is_dropout else None + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o, stats = graph.sdpa( name="sdpa", q=q, @@ -452,6 +542,9 @@ def test_sdpa(param_extract_forward): ) o.set_output(True).set_dim(shape_o).set_stride(stride_o) + if is_ragged: + o.set_ragged_offset(o_ragged_offset) + if is_infer == False: stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) @@ -468,6 +561,10 @@ def test_sdpa(param_extract_forward): bias: bias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, o: o_gpu, stats: stats_gpu, rng_dump: rng_dump_gpu, @@ -481,10 +578,16 @@ def test_sdpa(param_extract_forward): graph.execute(variant_pack, workspace) torch.cuda.synchronize() + # compare with torch autograd reference q_ref = q_gpu.detach().float() k_ref = k_gpu.detach().float() v_ref = v_gpu.detach().float() + if is_ragged: + q_ref = convert_ragged_to_uniform(q_ref, q_ragged_offset_gpu.detach()) + k_ref = convert_ragged_to_uniform(k_ref, k_ragged_offset_gpu.detach()) + v_ref = convert_ragged_to_uniform(v_ref, v_ragged_offset_gpu.detach()) + if is_bias: bias_ref = bias_gpu.detach().float() @@ -513,6 +616,9 @@ def test_sdpa(param_extract_forward): else: o_ref = ret + if is_ragged: + o_gpu = convert_ragged_to_uniform(o_gpu, o_ragged_offset_gpu.detach()) + if is_padding: # zero out padded region of the output for comparison for i, m in enumerate(seq_len_q_ref): @@ -527,15 +633,16 @@ def test_sdpa(param_extract_forward): assert compare_tensors(stats_ref, stats_gpu, "stats") == 0 -@pytest.fixture(params=all_options_backward) -def param_extract_backward(request): - return request.param - - -@pytest.mark.skipif(cudnn.backend_version() < 8903, reason="requires cudnn 8.9.3 or higher") -def test_sdpa_backward(param_extract_backward): - ( - input_type, +@pytest.mark.parametrize("input_type", input_type_options) +@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("head_group", head_group_options) +@pytest.mark.parametrize("is_bias", bias_options) +@pytest.mark.parametrize("is_alibi", alibi_mask_options) +@pytest.mark.parametrize("is_padding", padding_mask_options) +@pytest.mark.parametrize("is_causal", causal_mask_options) +@pytest.mark.parametrize("is_dropout", dropout_options) +@pytest.mark.parametrize("is_ragged", ragged_options) +def test_sdpa_backward(input_type, layout, head_group, is_bias, @@ -543,7 +650,9 @@ def test_sdpa_backward(param_extract_backward): is_padding, is_causal, is_dropout, - ) = param_extract_backward + is_ragged): + if cudnn.backend_version() < 8903: + pytest.skip("SDPA bprop requires cudnn 8.9.3 or 
higher") if head_group != "multi_head" and cudnn.backend_version() < 8907: pytest.skip("GQA and MQA is only supported 8.9.7 onwards.") @@ -557,6 +666,9 @@ def test_sdpa_backward(param_extract_backward): if is_bias and is_padding: pytest.skip("dBias is not supported with padding mask") + if is_alibi and not is_causal: + pytest.skip("ALiBi mask is only supported with causal mask") + if is_alibi and cudnn.backend_version() < 8904: pytest.skip("ALiBi mask is only supported 8.9.4 onwards.") @@ -566,19 +678,34 @@ def test_sdpa_backward(param_extract_backward): if is_dropout and cudnn.backend_version() < 8906: pytest.skip("RNG dump is only supported on 8.9.6 onwards.") + if is_ragged and cudnn.backend_version() < 90000: + pytest.skip("Ragged tensor is only supported 9.0.0 onwards") + + if is_ragged and torch.cuda.get_device_capability()[0] < 9: + pytest.skip("Ragged tensor is only supported hopper") + + if is_ragged and layout != "non_interleaved": + pytest.skip("Ragged tensor is only tested with non-interleaved bshd layout") + + if is_ragged and head_group != "multi_head": + pytest.skip("Ragged offset is only supported with multi_head") + + if is_ragged and not is_padding: + pytest.skip("Ragged tensor is only tested with packed variable length tensors") + # test both dP workspace optimization by lowering dP workspace limit to 8MB os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = str(8 * 1024 * 1024) # batch size b = 2 # query sequence length - s_q = random.choice([256, 512, 1024]) + s_q = random.choice([8, 16, 24, 32, 256, 512, 1024]) # key+value sequence length - s_kv = random.choice([32, 256, 512, 1024]) if layout == "non_interleaved" else s_q + s_kv = random.choice([8, 16, 24, 32, 256, 512, 1024]) if layout == "non_interleaved" else s_q # query+key embedding dimension per head d_qk = random.choice([32, 56, 64, 128]) # value embedding dimension per head - d_v = random.choice([64, 96, 128]) if layout == "non_interleaved" else d_qk + d_v = random.choice([64, 96, 128]) if (layout == "non_interleaved" and not is_ragged) else d_qk # number of heads h_q = 6 if head_group == "multi_head": @@ -596,13 +723,27 @@ def test_sdpa_backward(param_extract_backward): if d_qk != d_v and cudnn.backend_version() < 8906: pytest.skip("d_qk != d_v is only supported on 8.9.6 onwards.") - if (s_kv % 64 != 0) and layout == "non_interleaved": - pytest.skip("cudnn backend does not support non-interlaved layout with non-64-aligned seq_kv.") - - if ((d_qk % 64 != 0) or (s_kv % 64 != 0)) and cudnn.backend_version() < 8906: - pytest.skip("d not a multiple of 64, not-multiple-of-64 seq_kv is not supported below 8.9.6") + if (cudnn.backend_version() < 90000): + if (s_q < 64): + pytest.skip("s_q less than 64 is not supported before cudnn 9.0.0") + + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and (is_padding or is_dropout): + pytest.skip("s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0") + + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and is_bias: + pytest.skip("cudnn backend does not support bias with non-64-aligned seq_q or seq_kv.") + + if (s_kv % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("not-multiple-of-64 seq_kv is not supported below 8.9.6") - print(f"{str(param_extract_backward)} {s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + # TODO file bug + if d_qk != d_v and is_ragged: + pytest.skip("d_qk != d_v is not supported with ragged 
offset") + + print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") attn_scale = 0.125 dropout_prob = 0.1 if is_dropout else 0.0 @@ -647,6 +788,11 @@ def test_sdpa_backward(param_extract_backward): rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None + q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None + k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None + v_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_v * d_v).int() if is_ragged else None + o_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_v).int() if is_ragged else None + o_gpu = torch.empty(b * h_q * s_q * d_v, dtype=input_type, device="cuda").as_strided(shape_o, stride_o) stats_gpu = torch.empty(b, h_q, s_q, 1, dtype=torch.float32, device="cuda") @@ -656,6 +802,7 @@ def test_sdpa_backward(param_extract_backward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -672,6 +819,16 @@ def test_sdpa_backward(param_extract_backward): rng_dump = graph.tensor_like(rng_dump_gpu) if is_dropout else None + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o, stats = graph.sdpa( name="sdpa", q=q, @@ -690,6 +847,9 @@ def test_sdpa_backward(param_extract_backward): ) o.set_output(True).set_dim(shape_o).set_stride(stride_o) + if is_ragged: + o.set_ragged_offset(o_ragged_offset) + stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) graph.validate() @@ -705,6 +865,10 @@ def test_sdpa_backward(param_extract_backward): bias: bias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, o: o_gpu, stats: stats_gpu, rng_dump: rng_dump_gpu, @@ -730,6 +894,7 @@ def test_sdpa_backward(param_extract_backward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -748,6 +913,18 @@ def test_sdpa_backward(param_extract_backward): offset = graph.tensor_like(offset_gpu) dropout_tuple = (dropout_prob, seed, offset) + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o.set_ragged_offset(o_ragged_offset) + dO.set_ragged_offset(o_ragged_offset) + dQ, dK, dV = graph.sdpa_backward( name="sdpa_backward", q=q, @@ -770,6 +947,10 @@ def test_sdpa_backward(param_extract_backward): 
dQ.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride()) dK.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride()) dV.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride()) + if is_ragged: + dQ.set_ragged_offset(q_ragged_offset) + dK.set_ragged_offset(k_ragged_offset) + dV.set_ragged_offset(v_ragged_offset) graph.validate() graph.build_operation_graph() @@ -791,6 +972,10 @@ def test_sdpa_backward(param_extract_backward): dBias: dBias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, } if is_dropout: @@ -810,6 +995,12 @@ def test_sdpa_backward(param_extract_backward): v_ref.requires_grad = True dO_ref = dO_gpu.detach().float() + if is_ragged: + q_ref = convert_ragged_to_uniform(q_ref, q_ragged_offset_gpu.detach()) + k_ref = convert_ragged_to_uniform(k_ref, k_ragged_offset_gpu.detach()) + v_ref = convert_ragged_to_uniform(v_ref, v_ragged_offset_gpu.detach()) + dO_ref = convert_ragged_to_uniform(dO_ref, o_ragged_offset_gpu.detach()) + if is_bias: bias_ref = bias_gpu.detach().float() bias_ref.requires_grad = True @@ -848,6 +1039,11 @@ def test_sdpa_backward(param_extract_backward): if is_bias: dBias_ref = opt_refs.pop(0) + if is_ragged: + dQ_gpu = convert_ragged_to_uniform(dQ_gpu, q_ragged_offset_gpu.detach()) + dK_gpu = convert_ragged_to_uniform(dK_gpu, k_ragged_offset_gpu.detach()) + dV_gpu = convert_ragged_to_uniform(dV_gpu, v_ragged_offset_gpu.detach()) + if is_padding: # zero out padded region of the output for comparison for i, (m, n) in enumerate(zip(seq_len_q_ref, seq_len_kv_ref)): @@ -870,22 +1066,24 @@ def test_sdpa_backward(param_extract_backward): if __name__ == "__main__": """ - option_forward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_infer) - option_backward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout) - test_sdpa((torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False)) - test_sdpa_backward((torch.float16, "bs3hd", "multi_head", False, False, False, False, False)) + option_forward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged, is_infer) + option_backward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged) + test_sdpa(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False, False) + test_sdpa_backward(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False) """ print("==========running forward tests==========") for option in all_options_forward: try: - test_sdpa(option) + print(f"Running {option}") + test_sdpa(*option) except pytest.skip.Exception as e: - print(f"Skipped {option}: {e}") + print(f"Skipped {option}\n{e}") print("==========running backward tests==========") for option in all_options_backward: try: - test_sdpa_backward(option) + print(f"Running {option}") + test_sdpa_backward(*option) except pytest.skip.Exception as e: - print(f"Skipped {option}: {e}") + print(f"Skipped {option}\n{e}") diff --git a/samples/python/test_rmsnorm.py b/test/python_fe/test_rmsnorm.py similarity index 100% rename from samples/python/test_rmsnorm.py rename to test/python_fe/test_rmsnorm.py diff --git a/samples/python/test_wgrads.py b/test/python_fe/test_wgrads.py similarity index 100% rename from samples/python/test_wgrads.py 
rename to test/python_fe/test_wgrads.py
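
The new ragged-tensor SDPA tests above derive cuDNN ragged offsets from the per-batch sequence lengths: an exclusive prefix sum over the batch dimension, scaled by the number of heads times the head dimension of the tensor in question. A minimal sketch of that derivation, using small illustrative sizes rather than the values the tests draw at random:

```
# Minimal sketch (illustrative sizes, not the test's values): build cuDNN ragged
# offsets from per-batch sequence lengths, mirroring compute_exclusive_prefix_sum.
import torch

b, h, d = 2, 3, 4                                # batch, heads, head dim
seq_len = torch.tensor([5, 3]).view(b, 1, 1, 1)  # valid tokens per batch, shape (B, 1, 1, 1)

# exclusive prefix sum over the batch dimension -> shape (B + 1, 1, 1, 1)
prefix = torch.cat(
    (torch.zeros(1, 1, 1, 1, dtype=seq_len.dtype), torch.cumsum(seq_len, dim=0))
)

# ragged offsets count elements, so scale by heads * head_dim,
# as the tests do per tensor (e.g. * h_q * d_qk for q, * h_q * d_v for o)
q_ragged_offset = (prefix * h * d).int()
print(q_ragged_offset.flatten())  # tensor([ 0, 60, 96], dtype=torch.int32)
```

The resulting offsets mark where each batch's packed tokens begin in the q/k/v/o buffers; these are the tensors the tests attach with `set_ragged_offset(...)` and pass in the variant pack.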
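
`compute_ref` now computes the ALiBi head slopes inline instead of calling the removed `get_alibi_slopes` helper. For the six query heads these tests use, the computation reduces to the following standalone check (not part of the test suite); the printed values are the four power-of-two slopes followed by the two extra slopes added for the non-power-of-two head count:

```
# Standalone check of the inlined ALiBi slope computation for h_q = 6 heads
# (6 is the head count used in these tests).
import math
import torch

n_heads = 6
n = 2 ** math.floor(math.log2(n_heads))  # closest power of two <= n_heads -> 4
m = torch.pow(2.0 ** (-8.0 / n), torch.arange(1, 1 + n))
if n < n_heads:
    # remaining slopes for the non-power-of-two head count, as in compute_ref
    m_hat = torch.pow(2.0 ** (-4.0 / n), torch.arange(1, 1 + 2 * (n_heads - n), 2))
    m = torch.cat([m, m_hat])
print(m)  # tensor([0.2500, 0.0625, 0.0156, 0.0039, 0.5000, 0.1250])
```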
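
The SDPA tests also drop the tuple-valued `param_extract_*` fixtures in favor of stacked `pytest.mark.parametrize` decorators, with version- and architecture-dependent constraints expressed as `pytest.skip` calls inside the test body, so each option combination gets its own test id and its own skip reason. Reduced to a toy example with hypothetical option names:

```
# Toy illustration (hypothetical option names) of the parametrization style used
# above: stacked pytest.mark.parametrize decorators instead of a tuple-valued
# fixture, with feature constraints handled by pytest.skip in the body.
import pytest

size_options = [64, 128]
layout_options = ["bshd", "bs3hd"]

@pytest.mark.parametrize("size", size_options)
@pytest.mark.parametrize("layout", layout_options)
def test_example(size, layout):
    if layout == "bs3hd" and size < 128:
        pytest.skip("bs3hd is only exercised with size >= 128 in this toy example")
    assert size in size_options
```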