diff --git a/CMakeLists.txt b/CMakeLists.txt index 24821d52..a5a8feb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.17) -project(cudnn_frontend VERSION 1.0.3) +project(cudnn_frontend VERSION 1.1.0) option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON) option(CUDNN_FRONTEND_BUILD_UNIT_TESTS "Defines if unittests are built or not." OFF) diff --git a/README.FE.1.0.md b/README.FE.1.0.md index 337851bd..196c5b12 100644 --- a/README.FE.1.0.md +++ b/README.FE.1.0.md @@ -9,8 +9,8 @@ 6. [Miscellaneous](#Miscellaneous) ## Introduction -FE v1.0 API is aimed to extend functionality and usage exposed by the [cuDNN C backend API](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnn-backend-api). Both C++ and python APIs are provided with both having functional parity. -For a general introduction to FE, please first refer README.md +FE v1.0 API is aimed to extend functionality and usage exposed by the [cuDNN C backend API](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnn-backend-api). Both C++ and python APIs are provided, and both have functional parity. +For a general introduction to FE, please start with README.md. ## Workflow The steps involved in building and running a cudnn graph are as follows: @@ -97,6 +97,14 @@ This method internally queries the heuristics for engine configs for the given h cudnn_frontend::error_t cudnn_frontend::graph::Graph::get_execution_plans(std::vector) ``` +### Get execution plan count +This method returns the number of execution plans returned by cudnn heuristics. Each plan gets an index from 0 to #plans-1, with 0 having top priority. + +``` +int64_t +cudnn_frontend::graph::Graph::get_execution_plan_count() const; +``` + ### Check graph support This method guarantees that executing the graph using plans queried will succeed. ``` cudnn_frontend::error_t cudnn_frontend::graph::Graph::check_support(cudnnHandle_t handle); ``` ### Build plans -This method builds one or all the engine configs that was queries during the create_execution_plan phase. +This function builds the execution plans queried with the `create_execution_plans(...)` API. + +There are two flavours of this API: + +Use this method to build execution plans according to a policy. It is suitable when trusting cudnn heuristics to return the most suitable execution plan with top priority. +``` +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::build_plans( + cudnnHandle_t const &handle, + cudnn_frontend::BuildPlanPolicy_t const policy, + bool const do_multithreaded_builds +); +``` + +Use this method to build an individual plan by index. The main use case is to build execution plans in parallel when autotuning. +The valid range of plan indices can be queried with the `get_execution_plan_count()` API. ``` -cudnn_frontend::error_t cudnn_frontend::graph::Graph::build_plans(cudnnHandle_t const &handle, - cudnn_frontend::BuildPlanPolicy_t const policy, - bool const do_multithreaded_builds); +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::build_plan_at_index( + cudnnHandle_t const &handle, + int64_t plan_index +); ``` + + ### Filter plans (optional) Users can filter out plans against numerical, behavioral notes, or plans that do not provide desired functional correctness.
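As an editorial aside, not part of the patch above: a minimal sketch of how the plan-query and plan-build calls documented in this README.FE.1.0.md hunk might be combined. The helper name `build_all_candidate_plans` and the assumption that the graph has already gone through `validate()`, `build_operation_graph(...)` and `create_execution_plans(...)` are illustrative only.

```cpp
// Illustrative sketch only -- not part of this patch. Assumes `graph` has already been
// validated and lowered (validate(), build_operation_graph(handle),
// create_execution_plans(...)) and that `handle` is a valid cudnnHandle_t.
#include <cstdint>
#include <cudnn_frontend.h>

namespace fe = cudnn_frontend;

// Hypothetical helper: build every candidate plan individually, e.g. ahead of autotuning.
inline bool build_all_candidate_plans(fe::graph::Graph &graph, cudnnHandle_t handle) {
    // Bail out early if none of the queried engine configs can run this graph.
    if (graph.check_support(handle).is_bad()) {
        return false;
    }

    // Flavour 1: trust heuristics and build only the top-priority plan.
    //   graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE);

    // Flavour 2: build each plan by index; indices run from 0 to get_execution_plan_count() - 1.
    for (int64_t i = 0; i < graph.get_execution_plan_count(); ++i) {
        if (graph.build_plan_at_index(handle, i).is_bad()) {
            return false;
        }
    }
    return true;
}
```

Either flavour leaves the graph ready for the execute APIs described in the next hunk.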
@@ -139,18 +166,40 @@ cudnn_frontend::graph::Graph::autotune(cudnnHandle_t handle, ### Execute Executing graph requires device pointers to all input output tensors and a user allocated device workspace pointer. +Two flavours of execute exist, corresponding to the two flavours of the `build_plans(...)` API. + +This API uses the candidate execution plan that has already been set. The candidate execution plan gets set internally either: +- when `BuildPlanPolicy_t::HEURISTICS_CHOICE` is used, or +- as the last plan that got built. + ``` cudnn_frontend::error_t -cudnn_frontend::graph::Graph::execute(cudnnHandle_t handle, - std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, - void* workspace); +cudnn_frontend::graph::Graph::execute( + cudnnHandle_t handle, + std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, + void* workspace +); +``` + +The execute API also takes a plan index to target a specific plan. This may be used when autotuning, in conjunction with the `build_plan_at_index(...)` API. +``` +cudnn_frontend::error_t +cudnn_frontend::graph::Graph::execute( + cudnnHandle_t handle, + std::unordered_map<std::shared_ptr<Tensor_attributes>, void *> var_pack, + void* workspace, + int64_t plan_index +); ``` ### Miscellaneous APIs Get workspace to execute the current selected execution plan. +It can also take in a plan index to query the workspace for. This may be used when autotuning, in conjunction with the `build_plan_at_index(...)` API. + `int64_t get_workspace_size() const` +`int64_t get_workspace_size_plan_index(int64_t plan_index) const` Get workspace to run autotune on all plans.
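Another editorial aside, not part of the patch: an autotuning-style loop that combines the plan-index overload of `execute(...)` with the per-index workspace query listed above. The helper name `pick_fastest_plan`, the CUDA-event timing, and the assumption that the handle runs on the default stream are illustrative; `get_workspace_size_plan_index` is taken from the Miscellaneous APIs text above and is an assumption about the exact method name.

```cpp
// Illustrative autotuning-style loop -- not part of this patch. Assumes every candidate
// plan was already built with build_plan_at_index(...), `handle` is a cudnnHandle_t bound
// to the default stream, and `variant_pack` maps the graph's Tensor_attributes to device
// pointers. The per-index workspace query name follows the Miscellaneous APIs text above.
#include <cstdint>
#include <limits>
#include <memory>
#include <unordered_map>

#include <cuda_runtime.h>
#include <cudnn_frontend.h>

namespace fe = cudnn_frontend;

inline int64_t pick_fastest_plan(
    fe::graph::Graph &graph,
    cudnnHandle_t handle,
    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void *> const &variant_pack) {
    int64_t best_index = -1;
    float best_ms = std::numeric_limits<float>::max();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int64_t i = 0; i < graph.get_execution_plan_count(); ++i) {
        // Workspace requirements can differ per plan, so query and allocate per index.
        void *workspace = nullptr;
        cudaMalloc(&workspace, static_cast<size_t>(graph.get_workspace_size_plan_index(i)));

        cudaEventRecord(start);
        auto status = graph.execute(handle, variant_pack, workspace, i);  // plan-index overload
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        if (!status.is_bad() && ms < best_ms) {
            best_ms = ms;
            best_index = i;
        }
        cudaFree(workspace);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return best_index;  // -1 if no plan executed successfully
}
```

The winning index can then be reused with the plan-index overload of `execute(...)` for subsequent launches.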
@@ -167,8 +216,7 @@ Samples are meant to illustrate FE v1.0 API usage to users. - `samples/cpp` contains samples that use C++ API. - `samples/python` contains samples that use python API. -C++ samples are written using [Catch2](https://github.com/catchorg/Catch2) test framework. -Python samples are written using [pytest](https://github.com/pytest-dev/pytest) and [pytorch](https://pytorch.org), with both requiring external installation. +Python samples are jupyter notebooks with a step-by-step guide on using the FE v1 API. ## Operations diff --git a/README.md b/README.md index af1082b3..573e1d32 100644 --- a/README.md +++ b/README.md @@ -31,56 +31,63 @@ cudnn can be installed from Minimum python version needed 3.6 The python binding compilation requires development package which can be installed by running `apt-get install python-dev`. -To run the python samples, additionally, you will need the following python packages +To run the python samples, additionally, you will need the following python packages: - pytest -- pytorch-cuda=12.1 (or pytorch-cuda=11.8) -- torchvision -- torchaudio -- pytorch +- torch +- jupyter + + +### Python API +Install FE python API by running: +``` +pip install git+https://github.com/NVIDIA/cudnn-frontend.git +``` + +The above command picks cuda and cudnn from the default system paths. + +To provide a custom CUDA installation path, use the environment variable: `CUDAToolkit_ROOT`. +To provide a custom CUDNN installation path, use the environment variable: `CUDNN_PATH`. + + +To test whether the installation is successful, run: +``` +pytest tests/python_fe +``` + +NOTE: Only v1.0 API is exposed via python bindings. ### C++ API -C++ API is header only library. The following compilation steps are only required for building the samples and python bindings. +The C++ API is a header-only library. + +The root CMakeLists.txt can be used as a reference to include the cudnn_frontend in your project's build system. -The CMakeLists.txt can be used reference to include the cudnn_frontend in your project. +#### Building samples +The following compilation steps are only required for building the samples and/or python bindings. -Provide CUDA according to: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html +Provide the CUDA installation path according to: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html + +Provide the CUDNN installation path using the CUDNN_PATH env variable or cmake parameter. CUDNN_PATH has the cudnn installation: - Headers are in CUDNN_PATH/include. - Libraries are in CUDNN_PATH/lib or CUDNN_PATH/lib64 or CUDNN_PATH/lib/x64. -From project Root, - +For an out-of-source build, ``` -mkdir build; cd build +mkdir build +cd build cmake -DCUDNN_PATH=/path/to/cudnn -DCUDAToolkit_ROOT=/path/to/cuda ../ cmake --build . -j16 bin/samples ``` -Skip building samples by providing `CUDNN_FRONTEND_BUILD_SAMPLES=OFF` as cmake parameter. -Skip building python bindings by providing `CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS=OFF` as cmake parameter. - -In case, you have a stale cmake cache and want to update the cudnn/cuda paths, please delete the cmake cache (or build directory and redo the above steps). - -### Python API -Install FE python API by running: -pip install git+https://github.com/NVIDIA/cudnn-frontend.git - -Incase of custom installation of CUDA and CUDNN, the default path can be overriden by: +To skip building samples, use `-DCUDNN_FRONTEND_BUILD_SAMPLES=OFF`. -`CUDAToolkit_ROOT=/path/to/cuda CUDNN_PATH=/path/to/cudnn pip install /path/to/cudnn_frontend`. +To skip building python bindings, use `-DCUDNN_FRONTEND_BUILD_PYTHON_BINDINGS=OFF`. -To provide a custom CUDA, export environment variable: `CUDAToolkit_ROOT`. -To provide a custom CUDNN, export environment variable: `CUDNN_PATH`. - -``` - pytest samples/python -``` - -NOTE: Only v1.0 API is exposed via python bindings. +In case you have a stale cmake cache and want to update the cudnn/cuda paths, please delete the cmake cache (or the build directory) and redo the above steps. ## Debugging For initial debugging, we recommend turning on the cudnn FE logging and checking for warnings and errors. @@ -108,4 +115,5 @@ No external contribution to this repository is accepted. Please create an issue ## Feedback Support, resources, and information about cuDNN can be found online at https://developer.nvidia.com/cudnn. + Also, bugs and RFEs can be reported in the issues section. diff --git a/docs/operations/Attention.md b/docs/operations/Attention.md index 092ef3a1..78f9f617 100644 --- a/docs/operations/Attention.md +++ b/docs/operations/Attention.md @@ -27,6 +27,7 @@ using the FlashAttention-2 algorithm as described in the paper [FlashAttention-2 - To use an user-provided dropout mask, users must provide: - `dropout mask` that matches the attention weights' dimensions, indicating which weights to drop. - `dropout scale` used to adjust the scale of the remaining weights accordingly, such as $1 / (1 - \text{dropout probability})$. +- Ragged tensor: allows the query, key, value, and output tensors to be [ragged tensors](https://www.tensorflow.org/guide/ragged_tensor), which are tensors with nested variable length lists as inner dimensions. Users must pass another tensor called the ragged offset tensor using the `Tensor_attributes.set_ragged_offset()` method as specified in the tensors section below. When multiple masking options are enabled, they are applied in the listed order above. @@ -43,6 +44,7 @@ The dimensions that are passed as 1 will apply a broadcasted mask over attention - (Optional) When philox RNG dropout mask is enabled, the RNG seed and offset tensors should have size $(1, 1, 1, 1)$ with int32 or int64 datatype as either a CPU or GPU tensor.
- (Optional) When a user provided dropout mask is enabled, a dropout mask tensor should have shape $(1, 1, S_{q}, S_{kv})$, $(1, H_{q}, S_{q}, S_{kv})$, $(B, 1, S_{q}, S_{kv})$, or $(B, H_{q}, S_{q}, S_{kv})$ with input/output datatype. The dimensions that are passed as 1 will apply a broadcasted mask over attention weights. +- (Optional) When query, key, value, and output tensors are ragged tensors, the ragged offset tensor must be a tensor of size $(B + 1, 1, 1, 1)$ that contains the nested tensor's offset in terms of number of elements (not bytes). The last value of the offset tensor specifies the offset of the past-the-end element of the ragged tensor. Where, @@ -96,7 +98,7 @@ SDPA_attributes & set_bias(std::shared_ptr value); SDPA_attributes& -set_alibi_mask(bool const value) +set_alibi_mask(bool const value); SDPA_attributes& set_padding_mask(bool const value); @@ -120,7 +122,7 @@ set_dropout(std::shared_ptr mask, std::shared_ptr scale); SDPA_attributes & -set_compute_data_type(DataType_t value) +set_compute_data_type(DataType_t value); ``` **Python API:** @@ -153,7 +155,7 @@ This operation computes gradient tensors for scaled dot product attention using #### Configurable Options: -All the options mentioned in the forward operation, including GQA and MQA, are applicable in the backward operation as well. +All the options mentioned in the forward operation, including ragged tensors and GQA/MQA, are applicable in the backward operation as well. #### Tensors: @@ -181,19 +183,19 @@ The `options` parameter of type `SDPA_backward_attributes` is used to control th ```cpp SDPA_backward_attributes& -set_attn_scale(std::shared_ptr value) +set_attn_scale(std::shared_ptr value); SDPA_backward_attributes& set_attn_scale(float const value); SDPA_backward_attributes& -set_bias(std::shared_ptr value) +set_bias(std::shared_ptr value); SDPA_backward_attributes& -set_dbias(std::shared_ptr value) +set_dbias(std::shared_ptr value); SDPA_backward_attributes& -set_alibi_mask(bool const value) +set_alibi_mask(bool const value); SDPA_backward_attributes& set_padding_mask(bool const value); @@ -205,20 +207,20 @@ SDPA_backward_attributes& set_seq_len_kv(std::shared_ptr value); SDPA_backward_attributes& -set_causal_mask(bool const value) +set_causal_mask(bool const value); SDPA_backward_attributes& set_dropout(float const probability, std::shared_ptr seed, - std::shared_ptr offset) + std::shared_ptr offset); SDPA_backward_attributes& set_dropout(std::shared_ptr mask, std::shared_ptr scale, - std::shared_ptr scale_inv) + std::shared_ptr scale_inv); SDPA_backward_attributes& -set_compute_data_type(DataType_t const value) +set_compute_data_type(DataType_t const value); ``` Python API: diff --git a/include/cudnn_frontend.h b/include/cudnn_frontend.h index d2946bb5..0f0d5a66 100644 --- a/include/cudnn_frontend.h +++ b/include/cudnn_frontend.h @@ -121,10 +121,11 @@ #include "cudnn_frontend_Resample.h" #include "cudnn_frontend/graph_interface.h" +#include "cudnn_frontend/utils/serialize.h" #define CUDNN_FRONTEND_MAJOR_VERSION 1 -#define CUDNN_FRONTEND_MINOR_VERSION 0 -#define CUDNN_FRONTEND_PATCH_VERSION 3 +#define CUDNN_FRONTEND_MINOR_VERSION 1 +#define CUDNN_FRONTEND_PATCH_VERSION 0 #define CUDNN_FRONTEND_VERSION \ ((CUDNN_FRONTEND_MAJOR_VERSION * 10000) + (CUDNN_FRONTEND_MINOR_VERSION * 100) + CUDNN_FRONTEND_PATCH_VERSION) diff --git a/include/cudnn_frontend/cudnn_interface.h b/include/cudnn_frontend/cudnn_interface.h index 8c12cdc4..6fe8bcf2 100644 --- a/include/cudnn_frontend/cudnn_interface.h +++ 
b/include/cudnn_frontend/cudnn_interface.h @@ -42,48 +42,59 @@ class ICudnn { // TODO: Always returns OK. Can the status and error message be accessed from tensor descriptor? error_t create_cudnn_tensor(std::shared_ptr const& props, - int64_t& uid, - std::unordered_map>& tensors) const { + uid_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const { // Check whether tensor already created - // TODO: Do not reply on uid being 0? - if (props->get_uid() == 0) { - // Make sure no other tensor somehow already has claimed uid. - RETURN_CUDNN_FRONTEND_ERROR_IF(tensors.find(uid) != tensors.end(), - error_code_t::ATTRIBUTE_NOT_SET, - "Trying to assign same uid to possibily two different tensors."); + // Make sure no other tensor somehow already has claimed uid. + + auto tensor_uid = props->has_uid() ? props->get_uid() : uid; + if (tensors.find(tensor_uid) != tensors.end()) { + getLogger() << "[cudnn_frontend] INFO: Shared Tensor" << uid << " already created." << std::endl; + return {error_code_t::OK, ""}; + } + + if (props->has_uid() == false) { props->set_uid(uid); - uid++; - - auto&& tensor_builder = cudnn_frontend::TensorBuilder(); - - tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) - .setStrides(props->get_stride().size(), props->get_stride().data()) - .setId(props->get_uid()) - .setAlignment(16) - .setDataType(props->get_data_type()) - .setVirtual(props->get_is_virtual()) - .setByValue(props->get_is_pass_by_value()) - .setReorderType(props->get_reordering_type()); - - if (auto ragged_offset_props = props->get_ragged_offset()) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, uid, tensors)); - tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); - } + do { + uid++; + } while (invalid_uids.find(uid) != invalid_uids.end()); + } + + auto&& tensor_builder = cudnn_frontend::TensorBuilder(); + tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) + .setStrides(props->get_stride().size(), props->get_stride().data()) + .setId(props->get_uid()) + .setAlignment(16) + .setDataType(props->get_data_type()) + .setVirtual(props->get_is_virtual()) + .setByValue(props->get_is_pass_by_value()) + .setReorderType(props->get_reordering_type()); + + if (auto ragged_offset_props = props->get_ragged_offset()) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, uid, tensors, invalid_uids)); + tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); + } + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto tensor = tensor_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF( + tensor.get_status() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, tensor.get_error()); + tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); +#else + // build() can throw + // wrap in try catch + try { auto tensor = tensor_builder.build(); tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); - - } else { - // Make sure tensor's uid is present in backend tensor registry. 
+ } catch (cudnn_frontend::cudnnException& e) { RETURN_CUDNN_FRONTEND_ERROR_IF( - tensors.find(props->get_uid()) == tensors.end(), - error_code_t::ATTRIBUTE_NOT_SET, - "Backend tensor already not found for non-zero Id: " + std::to_string(props->get_uid())); - - getLogger() << "[cudnn_frontend] INFO: Backend tensor already created for Id: " + - std::to_string(props->get_uid()) - << std::endl; + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } +#endif return {error_code_t::OK, ""}; } @@ -94,26 +105,50 @@ class ICudnn { for (std::shared_ptr operation : operations) { cudnn_operations.push_back(operation.get()); } - auto cudnn_operation_graph = cudnn_frontend::OperationGraphBuilder() - .setHandle(handle) - .setOperationGraph(cudnn_operations.size(), cudnn_operations.data()) - .build(); + auto&& cudnn_operation_graph_builder = cudnn_frontend::OperationGraphBuilder(); + cudnn_operation_graph_builder.setHandle(handle).setOperationGraph(cudnn_operations.size(), + cudnn_operations.data()); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(cudnn_operation_graph.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + cudnn_operation_graph.get_error()); operation_graphs.push_back(std::make_shared(std::move(cudnn_operation_graph))); - getLogger() << "[cudnn_frontend] INFO: Successfully built Operation Graphs." << std::endl; - - return {error_code_t::OK, ""}; +#else + // build() can throw + // wrap in try catch + try { + auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + operation_graphs.push_back(std::make_shared(std::move(cudnn_operation_graph))); + } catch (cudnn_frontend::cudnnException& e) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); + } +#endif + return {error_code_t::OK, "Successfully built Operation Graphs."}; } public: - int64_t - get_cudnn_workspace_size_node() const { - int64_t current_workspace_size = 0; + error_t + get_cudnn_workspace_size_node(int64_t const plan_index, int64_t& cudnn_workspace_size) const { for (auto const& execution_plan_list : plans) { - current_workspace_size = - std::max(current_workspace_size, execution_plan_list.get_best_candidate()->getWorkspaceSize()); + int64_t candidate = plan_index != -1 ? plan_index : execution_plan_list.candidate; + RETURN_CUDNN_FRONTEND_ERROR_IF( + (candidate < 0) && (static_cast(execution_plan_list.execution_plans.size()) <= candidate), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index is invalid."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(!(execution_plan_list.execution_plans[candidate]), + error_code_t::GRAPH_EXECUTION_FAILED, + "No candidate plan found for graph to query worksapce for."); + cudnn_workspace_size = + std::max(cudnn_workspace_size, execution_plan_list.execution_plans[candidate]->getWorkspaceSize()); } - return current_workspace_size; + return {error_code_t::OK, ""}; } int64_t @@ -126,22 +161,18 @@ class ICudnn { } error_t - execute_cudnn_plans(cudnnHandle_t handle, - std::unordered_map const& tensor_uid_to_pointer_map, - void* workspace_ptr) const { - getLogger() << "[cudnn_frontend] INFO: Executing " << plans.size() << " Plans." 
<< std::endl; + execute_cudnn_plans_with_uid(cudnnHandle_t handle, + std::unordered_map const& tensor_uid_to_pointer_map, + void* workspace_ptr, + int64_t plan_index = -1) const { + getLogger() << "[cudnn_frontend] INFO: Executing " << plans.size() << " plans." << std::endl; + // Go over each plan list for (size_t i = 0; i < plans.size(); ++i) { - auto const& execution_plan = plans[i].get_best_candidate(); - RETURN_CUDNN_FRONTEND_ERROR_IF( - execution_plan == nullptr, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); - auto const& variant_pack_uid = variant_pack_uids[i]; - - getLogger() << "[cudnn_frontend] INFO: Executing " << execution_plan->getTag() << "..." << std::endl; - + // Make sure device pointer is provided for all uids expected for this plan std::vector device_ptrs; std::vector uids; - for (auto const& uid : variant_pack_uid) { + for (auto const& uid : variant_pack_uids[i]) { auto search = tensor_uid_to_pointer_map.find(uid); RETURN_CUDNN_FRONTEND_ERROR_IF(search == tensor_uid_to_pointer_map.end(), error_code_t::INVALID_VARIANT_PACK, @@ -149,25 +180,19 @@ class ICudnn { device_ptrs.push_back(tensor_uid_to_pointer_map.at(uid)); uids.push_back(uid); } - auto variant_pack = VariantPackBuilder() - .setDataPointers(device_ptrs.size(), device_ptrs.data()) - .setUids(uids.size(), uids.data()) - .setWorkspacePointer(workspace_ptr) - .build(); - if (variant_pack.get_status() != CUDNN_STATUS_SUCCESS) { - std::string message = "[cudnn_frontend] ERROR: Variant pack creation failed with " + - std::string(variant_pack.get_error()); - return {error_code_t::INVALID_VARIANT_PACK, message}; - } - getLogger() << "[cudnn_frontend] INFO: Built variant pack for " << execution_plan->getTag() << "..." - << std::endl; - auto status = cudnnBackendExecute(handle, execution_plan->get_raw_desc(), variant_pack.get_raw_desc()); - if (status != CUDNN_STATUS_SUCCESS) { - std::string message = "[cudnn_frontend] ERROR: Graph execution failed."; - return {error_code_t::GRAPH_EXECUTION_FAILED, message}; - } - getLogger() << "[cudnn_frontend] INFO: Executed " << execution_plan->getTag() << "." << std::endl; + int64_t candidate = plan_index != -1 ? 
plan_index : plans[i].candidate; + RETURN_CUDNN_FRONTEND_ERROR_IF( + (candidate < 0) && (static_cast(plans[i].execution_plans.size()) <= candidate), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index is invalid."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(!(plans[i].execution_plans[candidate]), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index does not correspond to a valid plan."); + + CHECK_CUDNN_FRONTEND_ERROR( + detail::execute(handle, plans[i].execution_plans[candidate].get(), device_ptrs, uids, workspace_ptr)); } return {error_code_t::OK, ""}; diff --git a/include/cudnn_frontend/graph_interface.h b/include/cudnn_frontend/graph_interface.h index 10b444de..13f40d18 100644 --- a/include/cudnn_frontend/graph_interface.h +++ b/include/cudnn_frontend/graph_interface.h @@ -28,11 +28,16 @@ class Graph : public INode { private: std::unordered_set> tensors; + void + add_to_tensor_map(std::shared_ptr tensor) { + tensors.emplace(tensor); + } + std::shared_ptr output_tensor(std::string const &name) { auto tensor = std::make_shared(); tensor->set_name(name).set_is_virtual(true); - tensors.emplace(tensor); + add_to_tensor_map(tensor); return tensor; } @@ -187,6 +192,9 @@ class Graph : public INode { error_t create_execution_plans(std::vector const &mode); + int64_t + get_execution_plan_count() const; + error_t check_support(cudnnHandle_t h) { for (auto &plan_list : plans) { @@ -200,6 +208,9 @@ class Graph : public INode { BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, bool const do_multithreaded_builds = false); + error_t + build_plan_at_index(cudnnHandle_t const &handle, int64_t index); + Graph & deselect_workspace_greater_than(int64_t const workspace) { for (auto &plan_list : plans) { @@ -210,16 +221,10 @@ class Graph : public INode { Graph & deselect_behavior_notes(std::vector const ¬es) { - std::vector backend_notes; - for (auto ¬e : notes) { - cudnnBackendBehaviorNote_t backend_note; - detail::convert_to_cudnn_type(note, backend_note); - backend_notes.push_back(backend_note); - } for (auto &plan_list : plans) { - auto status = plan_list.filter_out_behavior_notes(backend_notes); + auto status = plan_list.deselect_behavior_notes(notes); if (status.is_bad()) { - getLogger() << "[cudnn_frontend] ERROR: Filtering by behavioural notes failed." << std::endl; + getLogger() << status.get_message() << std::endl; } } return *this; @@ -227,33 +232,83 @@ class Graph : public INode { Graph & deselect_numeric_notes(std::vector const ¬es) { - std::vector backend_notes; - for (auto ¬e : notes) { - cudnnBackendNumericalNote_t backend_note; - detail::convert_to_cudnn_type(note, backend_note); - backend_notes.push_back(backend_note); - } for (auto &plan_list : plans) { - auto status = plan_list.filter_out_numeric_notes(backend_notes); + auto status = plan_list.deselect_numeric_notes(notes); if (status.is_bad()) { - getLogger() << "[cudnn_frontend] ERROR: Filtering by numerical notes failed." << std::endl; + getLogger() << status.get_message() << std::endl; } } return *this; } + using INode::deserialize; + using INode::serialize; + + virtual void + serialize(json &j) const override final { + // Different from serialization of other INodes. + // Go over each subnode and serialize them. + j["nodes"]; + for (auto const &sub_node : sub_nodes) { + json j_sub_node; + sub_node->serialize(j_sub_node); + j["nodes"].push_back(j_sub_node); + } + }; + + // TODO: temparorily placed in graphs class. This function needs to be a free standing function. 
error_t - autotune(cudnnHandle_t handle, - std::unordered_map, void *> variants, - void *workspace, - void *user_impl = nullptr) { - for (auto &plan_list : plans) { - CHECK_CUDNN_FRONTEND_ERROR(plan_list.autotune(handle, variants, workspace, user_impl)); + deserialize(const json &j) { + if (j.contains("nodes") && j["nodes"].is_array()) { + for (const auto &j_sub_node : j["nodes"]) { + if (j_sub_node.contains("tag") && j_sub_node["tag"].is_string()) { + auto tag = j_sub_node["tag"].get(); + if (tag == "CONV_FPROP") { + auto conv_fprop_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(conv_fprop_attributes), detail::Context())); + } else if (tag == "POINTWISE") { + auto pointwise_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(pointwise_attributes), detail::Context())); + } else if (tag == "REDUCTION") { + auto reduction_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(reduction_attributes), detail::Context())); + } else if (tag == "SDPA_FWD") { + auto sdpa_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_attributes), detail::Context())); + } else if (tag == "SDPA_BWD") { + auto sdpa_bwd_attributes = j_sub_node.get(); + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_bwd_attributes), detail::Context())); + } + } + } } + return {error_code_t::OK, ""}; } + + std::string + print(void) const { + std::stringstream ss; + json j = *this; + ss << j.dump(4); + return ss.str(); + } }; +inline int64_t +Graph::get_execution_plan_count() const { + int64_t plan_count = 0; + for (auto &plan_list : plans) { + plan_count += plan_list.execution_plans.size(); + } + return plan_count; +} + inline error_t Graph::create_execution_plans(std::vector const &mode) { std::unordered_map op_graph_to_configs; @@ -276,6 +331,14 @@ Graph::create_execution_plans(std::vector const &mode) { return {error_code_t::OK, ""}; } +inline error_t +Graph::build_plan_at_index(cudnnHandle_t const &handle, int64_t plan_index) { + for (auto i = 0u; i < plans.size(); i++) { + CHECK_CUDNN_FRONTEND_ERROR(plans[i].build_plan_at_index(handle, plan_index)); + } + return {error_code_t::OK, ""}; +} + inline error_t Graph::build_plans(cudnnHandle_t const &handle, BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { for (auto &plan_list : plans) { @@ -305,7 +368,7 @@ Graph::set_compute_data_type(DataType_t const type) { inline std::shared_ptr Graph::tensor(Tensor_attributes const &tensor) { auto tensor_ptr = std::make_shared(tensor); - tensors.emplace(tensor_ptr); + add_to_tensor_map(tensor_ptr); return tensor_ptr; } @@ -320,12 +383,11 @@ Graph::tensor_like(std::shared_ptr const &tensor, std::string // reset the uid of the cloned tensor // uids are not meant to be copied by tensor_like // When lowering to cudnn backend, both tensors involved here will get unique uids. - tensor_ptr->set_uid(0); + tensor_ptr->clear_uid(); // reset the name too. Defaults to empty string. 
tensor_ptr->set_name(name); - tensors.emplace(tensor_ptr); return tensor_ptr; } @@ -755,4 +817,10 @@ Graph::sdpa_backward(std::shared_ptr q, return {dQ, dK, dV}; } +static inline std::ostream & +operator<<(std::ostream &os, Graph const &graph) { + os << graph.print(); + return os; +} + } // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/graph_properties.h b/include/cudnn_frontend/graph_properties.h index 9eba9373..cf323431 100644 --- a/include/cudnn_frontend/graph_properties.h +++ b/include/cudnn_frontend/graph_properties.h @@ -28,6 +28,7 @@ class Tensor_attributes { bool is_pass_by_value = false; TensorReordering_t reordering_type = TensorReordering_t::NONE; int64_t uid = 0; + bool uid_assigned = false; std::shared_ptr ragged_offset; @@ -68,10 +69,9 @@ class Tensor_attributes { stride, is_virtual, is_pass_by_value, - reordering_type - /* uid */ // Not serializing uid is intentional. FE graphs do no need a uid. uid is - // only meant to act as a bridge between backend and frontend tensors. - ) + reordering_type, + uid, + uid_assigned) Tensor_attributes() = default; @@ -167,14 +167,27 @@ class Tensor_attributes { return uid; } + int64_t + has_uid() const { + return uid_assigned; + } + + auto + clear_uid(void) -> Tensor_attributes& { + uid = 0; + uid_assigned = false; + return *this; + } + auto set_uid(int64_t value) -> Tensor_attributes& { - uid = value; + uid = value; + uid_assigned = true; return *this; } auto - set_ragged_offset(std::shared_ptr value) -> Tensor_attributes& { + set_ragged_offset(std::shared_ptr const& value) -> Tensor_attributes& { ragged_offset = value; return *this; } @@ -314,6 +327,45 @@ class Attributes { } return {error_code_t::OK, ""}; } + + error_t + get_prefilled_uids(std::unordered_set& pre_assigned_uids) const { + auto derived = static_cast(this); + + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + for (auto& [name, tensor] : derived->outputs) { + (void)name; + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || + std::is_same_v) { + for (auto& tensor : derived->peer_stats) { + if (tensor && tensor->has_uid()) { + pre_assigned_uids.insert(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + pre_assigned_uids.insert(ragged_offset->get_uid()); + } + } + } + } + + return {error_code_t::OK, ""}; + } }; class BN_finalize_attributes : public Attributes { @@ -321,6 +373,7 @@ class BN_finalize_attributes : public Attributes { friend class BatchNormFinalizeNode; friend class Graph; + public: enum class input_names { SUM, SQ_SUM, @@ -332,13 +385,11 @@ class BN_finalize_attributes : public Attributes { PREV_RUNNING_VAR, MOMENTUM }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { EQ_SCALE, EQ_BIAS, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; - std::unordered_map> outputs; - public: NLOHMANN_DEFINE_TYPE_INTRUSIVE(BN_finalize_attributes, name, inputs, outputs) + std::map> outputs; BN_finalize_attributes& set_previous_running_stats(std::shared_ptr& mean, @@ -356,13 +407,12 @@ class 
Genstats_attributes : public Attributes { friend class GenstatsNode; friend class Graph; + public: enum class input_names { X }; - std::unordered_map> inputs; + std::map> inputs; enum class output_names { SUM, SQ_SUM }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Genstats_attributes, name, inputs, outputs) }; @@ -371,27 +421,51 @@ class Conv_fprop_attributes : public Attributes { friend class ConvolutionNode; friend class Graph; - enum class input_names { X, W }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_fprop_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { X, W }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_fprop_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) + + std::vector + get_pre_padding() const { + return pre_padding; + } std::vector - get_padding() const { - return padding; + get_post_padding() const { + return post_padding; } Conv_fprop_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } @@ -423,16 +497,14 @@ class Batchnorm_backward_attributes : public Attributes> inputs; + std::map> inputs; // Only special case where one of the inputs is a vector. 
std::vector> peer_stats; - enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - - public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_backward_attributes, name, inputs, outputs) + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_backward_attributes, name, inputs, peer_stats, outputs) + std::map> outputs; Batchnorm_backward_attributes& set_saved_mean_and_inv_variance(std::shared_ptr mean, @@ -454,14 +526,12 @@ class DBN_weight_attributes : public Attributes { friend class DBNWeightNode; friend class Graph; + public: enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { DSCALE, DBIAS, EQ_BIAS, EQ_SCALE_DY, EQ_SCALE_X }; - std::unordered_map> outputs; - - public: NLOHMANN_DEFINE_TYPE_INTRUSIVE(DBN_weight_attributes, name, inputs, outputs) + std::map> outputs; }; class Conv_dgrad_attributes : public Attributes { @@ -469,27 +539,51 @@ class Conv_dgrad_attributes : public Attributes { friend class DgradNode; friend class Graph; - enum class input_names { DY, W }; - std::unordered_map> inputs; - - enum class output_names { DX }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_dgrad_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { DY, W }; + std::map> inputs; + enum class output_names { DX }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_dgrad_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) + + std::vector + get_pre_padding() const { + return pre_padding; + } std::vector - get_padding() const { - return padding; + get_post_padding() const { + return post_padding; } Conv_dgrad_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } @@ -521,15 +615,13 @@ class Matmul_attributes : public Attributes { friend class MatmulNode; friend class INode; - enum class input_names { A, B, M_override, N_override, K_override }; - std::unordered_map> inputs; - - enum class output_names { C }; - std::unordered_map> outputs; - double padding_value = 0.0; public: + enum class input_names { A, B, M_override, N_override, K_override }; + std::map> inputs; + enum class output_names { C }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Matmul_attributes, name, inputs, outputs) Matmul_attributes& @@ -563,18 +655,16 @@ class Pointwise_attributes : public Attributes { friend class SoftmaxNode; friend class INode; - enum class input_names { IN_0, IN_1, IN_2 }; - std::unordered_map> inputs; - - enum class output_names { OUT_0 }; - std::unordered_map> outputs; - PointwiseMode_t mode = PointwiseMode_t::NOT_SET; std::optional axis; std::optional relu_lower_clip_slope; public: + enum class input_names { IN_0, IN_1, IN_2 }; + std::map> inputs; + enum class output_names { OUT_0 }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Pointwise_attributes, name, inputs, outputs, mode, axis) Pointwise_attributes& @@ -606,13 +696,11 @@ class Instancenorm_backward_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - 
- public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_backward_attributes, name, inputs, outputs) Instancenorm_backward_attributes& @@ -629,13 +717,11 @@ class Layernorm_backward_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_backward_attributes, name, inputs, outputs) Layernorm_backward_attributes& @@ -652,15 +738,13 @@ class Layernorm_attributes : public Attributes { friend class LayerNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, MEAN, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_attributes, name, inputs, outputs, forward_phase) Layernorm_attributes& @@ -681,15 +765,13 @@ class Instancenorm_attributes : public Attributes { friend class InstanceNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, MEAN, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_attributes, name, inputs, outputs, forward_phase) Instancenorm_attributes& @@ -710,15 +792,13 @@ class Batchnorm_attributes : public Attributes { friend class BatchNormNode; friend class Graph; + public: enum class input_names { X, SCALE, BIAS, PREV_RUNNING_MEAN, PREV_RUNNING_VAR, EPSILON, MOMENTUM }; - std::unordered_map> inputs; + std::map> inputs; // Only special case where one of the inputs is a vector. 
std::vector> peer_stats; - enum class output_names { Y, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_attributes, name, inputs, peer_stats, outputs) Batchnorm_attributes& @@ -749,13 +829,11 @@ class Batchnorm_inference_attributes : public Attributes> inputs; - + std::map> inputs; enum class output_names { Y }; - std::unordered_map> outputs; - - public: + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_inference_attributes, name, inputs, outputs) }; @@ -764,15 +842,13 @@ class Reduction_attributes : public Attributes { friend class ReductionNode; friend class INode; - enum class input_names { X }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - std::optional mode; public: + enum class input_names { X }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reduction_attributes, name, inputs, outputs, mode) std::optional @@ -792,12 +868,6 @@ class Rng_attributes : public Attributes { friend class RngNode; friend class INode; - enum class input_names { Seed, Offset }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - RngDistribution_t distribution = RngDistribution_t::NOT_SET; std::vector dim = {}; std::vector stride = {}; @@ -805,6 +875,10 @@ class Rng_attributes : public Attributes { std::optional bernoulli_probability; public: + enum class input_names { Seed, Offset }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rng_attributes, name, inputs, @@ -876,16 +950,14 @@ class Reshape_attributes : public Attributes { friend class ReshapeNode; friend class INode; - enum class input_names { X }; - std::unordered_map> inputs; - - enum class output_names { Y }; - std::unordered_map> outputs; - std::vector dim = {}; std::vector stride = {}; public: + enum class input_names { X }; + std::map> inputs; + enum class output_names { Y }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reshape_attributes, name, inputs, outputs, dim, stride) std::vector @@ -916,15 +988,13 @@ class Rmsnorm_attributes : public Attributes { friend class RMSNormNode; friend class Graph; - enum class input_names { X, SCALE, BIAS, EPSILON }; - std::unordered_map> inputs; - - enum class output_names { Y, INV_VARIANCE }; - std::unordered_map> outputs; - NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::map> inputs; + enum class output_names { Y, INV_VARIANCE }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_attributes, name, inputs, outputs, forward_phase) Rmsnorm_attributes& @@ -951,14 +1021,13 @@ class Rmsnorm_backward_attributes : public Attributes> inputs; - - enum class output_names { DX, DSCALE, DBIAS }; - std::unordered_map> outputs; std::optional use_dbias; public: + enum class input_names { DY, X, SCALE, INV_VARIANCE }; + std::map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::map> outputs; NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_backward_attributes, name, inputs, outputs) Rmsnorm_backward_attributes& @@ -1090,6 +1159,14 @@ class SDPA_attributes : public Attributes { friend class SDPANode; friend class Graph; + std::optional is_inference; + bool alibi_mask = false; + bool padding_mask = false; + bool causal_mask = false; + std::optional dropout_probability; + std::optional 
attn_scale_value; + + public: enum class input_names { Q, K, @@ -1103,19 +1180,20 @@ class SDPA_attributes : public Attributes { Dropout_mask, Dropout_scale }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { O, Stats, RNG_DUMP }; - std::unordered_map> outputs; - - std::optional is_inference; - bool alibi_mask = false; - bool padding_mask = false; - bool causal_mask = false; - std::optional dropout_probability; - std::optional attn_scale_value; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_attributes, + name, + inputs, + outputs, + is_inference, + alibi_mask, + padding_mask, + causal_mask, + dropout_probability, + attn_scale_value) - public: SDPA_attributes& set_is_inference(bool const value) { is_inference = value; @@ -1200,6 +1278,14 @@ class SDPA_backward_attributes : public Attributes { friend class SDPABackwardNode; friend class Graph; + bool alibi_mask = false; + bool padding_mask = false; + bool causal_mask = false; + + std::optional dropout_probability; + std::optional attn_scale_value; + + public: enum class input_names { Q, K, @@ -1217,19 +1303,19 @@ class SDPA_backward_attributes : public Attributes { Dropout_scale, Dropout_scale_inv }; - std::unordered_map> inputs; - + std::map> inputs; enum class output_names { dQ, dK, dV, dBias, RNG_DUMP }; - std::unordered_map> outputs; - - bool alibi_mask = false; - bool padding_mask = false; - bool causal_mask = false; - - std::optional dropout_probability; - std::optional attn_scale_value; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_backward_attributes, + name, + inputs, + outputs, + alibi_mask, + padding_mask, + causal_mask, + dropout_probability, + attn_scale_value) - public: SDPA_backward_attributes& set_attn_scale(std::shared_ptr value) { inputs[SDPA_backward_attributes::input_names::Attn_scale] = value; @@ -1320,16 +1406,16 @@ class Softmax_attributes : public Attributes { friend class SoftmaxNode; friend class INode; - enum class input_names { P }; - std::unordered_map> inputs; - - enum class output_names { S, Stats, M, Zinv }; - std::unordered_map> outputs; - std::optional use_stats; std::optional use_M_Zinv; public: + enum class input_names { P }; + std::map> inputs; + enum class output_names { S, Stats, M, Zinv }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Softmax_attributes, name, inputs, outputs, use_stats, use_M_Zinv) + Softmax_attributes& has_stats(bool const value) { use_stats = value; @@ -1368,10 +1454,10 @@ class SDPA_FP8_attributes : public Attributes { ragged_offset_QKV, ragged_offset_O }; - std::unordered_map> inputs; + std::map> inputs; enum class output_names { O, Stats, M, Zinv, AMax_S, AMax_O }; - std::unordered_map> outputs; + std::map> outputs; std::optional is_inference; bool padding_mask = false; @@ -1456,27 +1542,52 @@ class Conv_wgrad_attributes : public Attributes { friend class WgradNode; friend class Graph; - enum class input_names { DY, X }; - std::unordered_map> inputs; - - enum class output_names { DW }; - std::unordered_map> outputs; - - std::vector padding; + std::vector pre_padding; + std::vector post_padding; std::vector stride; std::vector dilation; public: - NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_wgrad_attributes, name, inputs, outputs, padding, stride, dilation) + enum class input_names { DY, X }; + std::map> inputs; + + enum class output_names { DW }; + std::map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_wgrad_attributes, + name, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation) std::vector - 
get_padding() const { - return padding; + get_pre_padding() const { + return pre_padding; + } + + std::vector + get_post_padding() const { + return post_padding; } Conv_wgrad_attributes& set_padding(std::vector value) { - padding = value; + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; return *this; } diff --git a/include/cudnn_frontend/node/batchnorm.h b/include/cudnn_frontend/node/batchnorm.h index 69caf4c1..c5c3a50a 100644 --- a/include/cudnn_frontend/node/batchnorm.h +++ b/include/cudnn_frontend/node/batchnorm.h @@ -21,6 +21,11 @@ class BatchNormNode : public INode { return Type::BATCHNORM; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm node " << attributes.name << "..." @@ -107,28 +112,29 @@ class BatchNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } // Special case in BN where peer stats is also an input but is not present in inputs map for (auto const& tensor : attributes.peer_stats) { if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -142,66 +148,72 @@ class BatchNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - std::vector peer_stats; - for (auto const& peer_stat : attributes.peer_stats) { - peer_stats.emplace_back(std::move(*(tensors[peer_stat->get_uid()]))); - } - - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - - batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) - .setNormFwdPhase(NormFwdPhase_t::TRAINING); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Batchnorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Batchnorm_attributes::output_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors[MEAN->second->get_uid()]), - *(tensors[INV_VARIANCE->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors[SCALE->second->get_uid()]), - *(tensors[BIAS->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, - Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, - Batchnorm_attributes::input_names::PREV_RUNNING_VAR); - batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors[PREV_RUNNING_MEAN->second->get_uid()]), - *(tensors[PREV_RUNNING_VAR->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, - Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, - Batchnorm_attributes::output_names::NEXT_RUNNING_VAR); - batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors[NEXT_RUNNING_MEAN->second->get_uid()]), - *(tensors[NEXT_RUNNING_VAR->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Batchnorm_attributes::input_names::EPSILON); - batchnorm_operation_builder.setEpsilonTensor(*(tensors[EPSILON->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, Batchnorm_attributes::input_names::MOMENTUM); - batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors[MOMENTUM->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_attributes::output_names::Y); - batchnorm_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); - - batchnorm_operation_builder.setPeerStatTensor(peer_stats); + std::vector peer_stats; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stats.emplace_back(std::move(*(tensors[peer_stat->get_uid()]))); + } + auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + + batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) + .setNormFwdPhase(NormFwdPhase_t::TRAINING); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Batchnorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Batchnorm_attributes::output_names::INV_VARIANCE); + 
batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors[MEAN->second->get_uid()]), + *(tensors[INV_VARIANCE->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors[SCALE->second->get_uid()]), + *(tensors[BIAS->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + Batchnorm_attributes::input_names::PREV_RUNNING_VAR); + batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors[PREV_RUNNING_MEAN->second->get_uid()]), + *(tensors[PREV_RUNNING_VAR->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + Batchnorm_attributes::output_names::NEXT_RUNNING_VAR); + batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors[NEXT_RUNNING_MEAN->second->get_uid()]), + *(tensors[NEXT_RUNNING_VAR->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Batchnorm_attributes::input_names::EPSILON); + batchnorm_operation_builder.setEpsilonTensor(*(tensors[EPSILON->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, Batchnorm_attributes::input_names::MOMENTUM); + batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors[MOMENTUM->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_attributes::output_names::Y); + batchnorm_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); + + batchnorm_operation_builder.setPeerStatTensor(peer_stats); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -213,6 +225,7 @@ class BatchNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BATCHNORM"})"_json); } }; diff --git a/include/cudnn_frontend/node/batchnorm_inference.h b/include/cudnn_frontend/node/batchnorm_inference.h index 243f6913..3ab531d2 100644 --- a/include/cudnn_frontend/node/batchnorm_inference.h +++ b/include/cudnn_frontend/node/batchnorm_inference.h @@ -75,21 +75,27 @@ class BatchnormInferenceNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchnormInferenceNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -103,39 +109,45 @@ class BatchnormInferenceNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchnormInferenceNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) + .setNormFwdPhase(NormFwdPhase_t::INFERENCE); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_inference_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_inference_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Batchnorm_inference_attributes::input_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_inference_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_inference_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_inference_attributes::output_names::Y); + batchnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - batchnorm_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM) - .setNormFwdPhase(NormFwdPhase_t::INFERENCE); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_inference_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_inference_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Batchnorm_inference_attributes::input_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_inference_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_inference_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_inference_attributes::output_names::Y); - batchnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ 
-147,6 +159,7 @@ class BatchnormInferenceNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BATCHNORM_INFERENCE"})"_json); } }; diff --git a/include/cudnn_frontend/node/bn_finalize.h b/include/cudnn_frontend/node/bn_finalize.h index cb73d0fa..1226d088 100644 --- a/include/cudnn_frontend/node/bn_finalize.h +++ b/include/cudnn_frontend/node/bn_finalize.h @@ -22,6 +22,11 @@ class BatchNormFinalizeNode : public INode { return Type::BN_FINALIZE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -74,21 +79,22 @@ class BatchNormFinalizeNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormFinalizeNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -103,68 +109,73 @@ class BatchNormFinalizeNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building BatchNormFinalizeNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the batchnorm operation. 
+ auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR); + batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT) + .setBNFinalizeMode(CUDNN_BN_FINALIZE_STATISTICS_TRAINING); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SUM, BN_finalize_attributes::input_names::SUM); + batchnorm_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SQ_SUM, BN_finalize_attributes::input_names::SQ_SUM); + batchnorm_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE, BN_finalize_attributes::output_names::EQ_SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, BN_finalize_attributes::output_names::EQ_BIAS); + batchnorm_operation_builder.setEqScaleAndBias(*(tensors.at(EQ_SCALE->second->get_uid())), + *(tensors.at(EQ_BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, BN_finalize_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, BN_finalize_attributes::output_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, BN_finalize_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, BN_finalize_attributes::input_names::BIAS); + batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + BN_finalize_attributes::input_names::PREV_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + BN_finalize_attributes::input_names::PREV_RUNNING_VAR); + batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors.at(PREV_RUNNING_MEAN->second->get_uid())), + *(tensors.at(PREV_RUNNING_VAR->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + BN_finalize_attributes::output_names::NEXT_RUNNING_VAR); + batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors.at(NEXT_RUNNING_MEAN->second->get_uid())), + *(tensors.at(NEXT_RUNNING_VAR->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, BN_finalize_attributes::input_names::EPSILON); + batchnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, BN_finalize_attributes::input_names::MOMENTUM); + batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors.at(MOMENTUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(ACCUM_COUNT, BN_finalize_attributes::input_names::ACCUM_COUNT); + batchnorm_operation_builder.setAccumCountTensor(*(tensors.at(ACCUM_COUNT->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the batchnorm operation. - auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR); - batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT) - .setBNFinalizeMode(CUDNN_BN_FINALIZE_STATISTICS_TRAINING); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SUM, BN_finalize_attributes::input_names::SUM); - batchnorm_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SQ_SUM, BN_finalize_attributes::input_names::SQ_SUM); - batchnorm_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE, BN_finalize_attributes::output_names::EQ_SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, BN_finalize_attributes::output_names::EQ_BIAS); - batchnorm_operation_builder.setEqScaleAndBias(*(tensors.at(EQ_SCALE->second->get_uid())), - *(tensors.at(EQ_BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, BN_finalize_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - BN_finalize_attributes::output_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, BN_finalize_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, BN_finalize_attributes::input_names::BIAS); - batchnorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, - BN_finalize_attributes::input_names::PREV_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, - BN_finalize_attributes::input_names::PREV_RUNNING_VAR); - batchnorm_operation_builder.setPrevRunningMeanAndVar(*(tensors.at(PREV_RUNNING_MEAN->second->get_uid())), - *(tensors.at(PREV_RUNNING_VAR->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, - BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, - BN_finalize_attributes::output_names::NEXT_RUNNING_VAR); - batchnorm_operation_builder.setNextRunningMeanAndVar(*(tensors.at(NEXT_RUNNING_MEAN->second->get_uid())), - *(tensors.at(NEXT_RUNNING_VAR->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, BN_finalize_attributes::input_names::EPSILON); - batchnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, BN_finalize_attributes::input_names::MOMENTUM); - batchnorm_operation_builder.setExpDecayFactorTensor(*(tensors.at(MOMENTUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(ACCUM_COUNT, BN_finalize_attributes::input_names::ACCUM_COUNT); - batchnorm_operation_builder.setAccumCountTensor(*(tensors.at(ACCUM_COUNT->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - 
operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -176,6 +187,7 @@ class BatchNormFinalizeNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "BN_FINALIZE"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_dgrad.h b/include/cudnn_frontend/node/conv_dgrad.h index 74295421..a597fa9c 100644 --- a/include/cudnn_frontend/node/conv_dgrad.h +++ b/include/cudnn_frontend/node/conv_dgrad.h @@ -31,6 +31,15 @@ class DgradNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_dgrad_attributes::output_names::DX); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -73,21 +82,27 @@ class DgradNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DgradNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -101,44 +116,50 @@ class DgradNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DgradNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // dgrad descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto dgrad_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the dgrad operation. 
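The convolution descriptors built above now take pre- and post-padding independently instead of reusing one padding vector for both sides. A standalone sketch of building such a descriptor (the function name and the asymmetric padding values are only illustrative; the builder calls mirror the ones used in the diff):

```
#include <cudnn_frontend.h>

#include <cstdint>
#include <vector>

// Build a 2D cross-correlation descriptor with asymmetric padding.
auto
make_conv_descriptor() {
    std::vector<int64_t> stride       = {1, 1};
    std::vector<int64_t> pre_padding  = {1, 1};  // top / left
    std::vector<int64_t> post_padding = {2, 2};  // bottom / right, may differ from pre_padding
    std::vector<int64_t> dilation     = {1, 1};
    int64_t const spatial_dim_count   = static_cast<int64_t>(pre_padding.size());

    return cudnn_frontend::ConvDescBuilder()
        .setComputeType(CUDNN_DATA_FLOAT)
        .setMathMode(CUDNN_CROSS_CORRELATION)
        .setSpatialDimCount(spatial_dim_count)
        .setSpatialStride(spatial_dim_count, stride.data())
        .setPrePadding(spatial_dim_count, pre_padding.data())
        .setPostPadding(spatial_dim_count, post_padding.data())
        .setDilation(spatial_dim_count, dilation.data())
        .build();
}
```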
+ auto&& dgrad_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Conv_dgrad_attributes::output_names::DX); + dgrad_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_dgrad_attributes::input_names::W); + dgrad_operation_builder.setwDesc(*(tensors.at(W->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_dgrad_attributes::input_names::DY); + dgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + dgrad_operation_builder.setcDesc(dgrad_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = dgrad_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // dgrad descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto dgrad_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the dgrad operation. 
- auto&& dgrad_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Conv_dgrad_attributes::output_names::DX); - dgrad_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_dgrad_attributes::input_names::W); - dgrad_operation_builder.setwDesc(*(tensors.at(W->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_dgrad_attributes::input_names::DY); - dgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - dgrad_operation_builder.setcDesc(dgrad_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = dgrad_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -150,6 +171,7 @@ class DgradNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "CONV_DGRAD"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_fprop.h b/include/cudnn_frontend/node/conv_fprop.h index 5b20a9db..35dcc231 100644 --- a/include/cudnn_frontend/node/conv_fprop.h +++ b/include/cudnn_frontend/node/conv_fprop.h @@ -21,6 +21,11 @@ class ConvolutionNode : public INode { return Type::CONVOLUTION; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -31,6 +36,15 @@ class ConvolutionNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_fprop_attributes::output_names::Y); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -54,16 +68,17 @@ class ConvolutionNode : public INode { // Only infer dims and strides if user did not set them if (y_tensor_dim.empty()) { y_tensor_dim.resize(x_tensor_dim.size()); - auto const& padding = attributes.get_padding(); - auto const& stride = attributes.get_stride(); - auto const& dilation = attributes.get_dilation(); + auto const& pre_padding = attributes.get_pre_padding(); + auto const& post_padding = attributes.get_post_padding(); + auto const& stride = attributes.get_stride(); + auto const& dilation = attributes.get_dilation(); // N y_tensor_dim[0] = x_tensor_dim[0]; // PQ for (size_t dim = 2; dim < x_tensor_dim.size(); ++dim) { - y_tensor_dim[dim] = - 1 + (x_tensor_dim[dim] - dilation[dim - 2] * (w_tensor_dim[dim] - 1) - 1 + 2 * padding[dim - 2]) / - stride[dim - 2]; + y_tensor_dim[dim] = 1 + (x_tensor_dim[dim] - dilation[dim - 2] * (w_tensor_dim[dim] - 1) - 1 + + pre_padding[dim - 2] + post_padding[dim - 2]) / + 
stride[dim - 2]; } // K y_tensor_dim[1] = w_tensor_dim[0]; @@ -89,21 +104,22 @@ class ConvolutionNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building ConvolutionNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -118,44 +134,50 @@ class ConvolutionNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ConvolutionNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // convolution descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto convolution_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the convolution operation. + auto&& convolution_operation_builder = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_fprop_attributes::input_names::X); + convolution_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_fprop_attributes::input_names::W); + convolution_operation_builder.setwDesc(*(tensors[W->second->get_uid()])); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Conv_fprop_attributes::output_names::Y); + convolution_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); + + convolution_operation_builder.setcDesc(convolution_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
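The inferred output extent above generalizes the usual convolution shape formula to asymmetric padding: per spatial dimension, `y = 1 + (x - dilation * (w - 1) - 1 + pre_pad + post_pad) / stride`. A self-contained helper with one worked value (the concrete numbers are only an example):

```
#include <cassert>
#include <cstdint>

// Output spatial extent of one convolution dimension with asymmetric padding.
int64_t
conv_output_dim(int64_t x, int64_t w, int64_t pre_pad, int64_t post_pad, int64_t stride, int64_t dilation) {
    return 1 + (x - dilation * (w - 1) - 1 + pre_pad + post_pad) / stride;
}

int
main() {
    // 3x3 filter, stride 2, dilation 1, padding 1 on each side of a 224-wide input:
    // 1 + (224 - 1*(3 - 1) - 1 + 1 + 1) / 2 = 1 + 223/2 = 112 (integer division).
    assert(conv_output_dim(224, 3, 1, 1, 2, 1) == 112);
    return 0;
}
```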
+ auto operation = convolution_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // convolution descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto convolution_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the convolution operation. - auto&& convolution_operation_builder = - cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_fprop_attributes::input_names::X); - convolution_operation_builder.setxDesc(*(tensors[X->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_fprop_attributes::input_names::W); - convolution_operation_builder.setwDesc(*(tensors[W->second->get_uid()])); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Conv_fprop_attributes::output_names::Y); - convolution_operation_builder.setyDesc(*(tensors[Y->second->get_uid()])); - - convolution_operation_builder.setcDesc(convolution_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = convolution_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -167,6 +189,7 @@ class ConvolutionNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "CONV_FPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/conv_wgrad.h b/include/cudnn_frontend/node/conv_wgrad.h index 8a6fb384..575be4bd 100644 --- a/include/cudnn_frontend/node/conv_wgrad.h +++ b/include/cudnn_frontend/node/conv_wgrad.h @@ -31,6 +31,15 @@ class WgradNode : public INode { CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Conv_wgrad_attributes::output_names::DW); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -73,21 +82,27 @@ class WgradNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t 
+ create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building WgradNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -101,44 +116,50 @@ class WgradNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building WgradNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // wgrad descriptor + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + auto wgrad_descriptor = cudnn_frontend::ConvDescBuilder() + .setComputeType(attributes.compute_data_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(spatial_dim_count) + .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) + .setPrePadding(spatial_dim_count, attributes.get_pre_padding().data()) + .setPostPadding(spatial_dim_count, attributes.get_post_padding().data()) + .setDilation(spatial_dim_count, attributes.get_dilation().data()) + .build(); + + // Create the wgrad operation. + auto&& wgrad_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_wgrad_attributes::input_names::X); + wgrad_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_wgrad_attributes::input_names::DY); + wgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DW, Conv_wgrad_attributes::output_names::DW); + wgrad_operation_builder.setdwDesc(*(tensors.at(DW->second->get_uid()))); + + wgrad_operation_builder.setcDesc(wgrad_descriptor).setAlpha(1.f).setBeta(0.f); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = wgrad_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // wgrad descriptor - int64_t const spatial_dim_count = attributes.get_padding().size(); - auto wgrad_descriptor = cudnn_frontend::ConvDescBuilder() - .setComputeType(attributes.compute_data_type) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setSpatialDimCount(spatial_dim_count) - .setSpatialStride(spatial_dim_count, attributes.get_stride().data()) - .setPrePadding(spatial_dim_count, attributes.get_padding().data()) - .setPostPadding(spatial_dim_count, attributes.get_padding().data()) - .setDilation(spatial_dim_count, attributes.get_dilation().data()) - .build(); - - // Create the wgrad operation. 
- auto&& wgrad_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_wgrad_attributes::input_names::X); - wgrad_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_wgrad_attributes::input_names::DY); - wgrad_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DW, Conv_wgrad_attributes::output_names::DW); - wgrad_operation_builder.setdwDesc(*(tensors.at(DW->second->get_uid()))); - - wgrad_operation_builder.setcDesc(wgrad_descriptor).setAlpha(1.f).setBeta(0.f); - auto operation = wgrad_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -150,6 +171,7 @@ class WgradNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "CONV_WGRAD"})"_json); } }; diff --git a/include/cudnn_frontend/node/dbn.h b/include/cudnn_frontend/node/dbn.h index 71e3b6df..b2df03db 100644 --- a/include/cudnn_frontend/node/dbn.h +++ b/include/cudnn_frontend/node/dbn.h @@ -22,6 +22,11 @@ class DBNNode : public INode { return Type::DBN; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -87,28 +92,29 @@ class DBNNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } // Special case in BN where peer stats is also an input but is not present in inputs map for (auto const& tensor : attributes.peer_stats) { if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -123,53 +129,59 @@ class DBNNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - std::vector peer_stats; - for (auto const& peer_stat : attributes.peer_stats) { - peer_stats.emplace_back(std::move(*(tensors.at(peer_stat->get_uid())))); - } + std::vector peer_stats; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stats.emplace_back(std::move(*(tensors.at(peer_stat->get_uid())))); + } - // Create the DBN operation. - auto&& DBN_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + // Create the DBN operation. + auto&& DBN_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - DBN_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM); + DBN_operation_builder.setNormalizationMode(NormMode_t::BATCH_NORM); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_backward_attributes::input_names::X); - DBN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_backward_attributes::input_names::X); + DBN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Batchnorm_backward_attributes::input_names::DY); - DBN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Batchnorm_backward_attributes::input_names::DY); + DBN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_backward_attributes::input_names::SCALE); - DBN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_backward_attributes::input_names::SCALE); + DBN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Batchnorm_backward_attributes::input_names::INV_VARIANCE); - DBN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Batchnorm_backward_attributes::input_names::INV_VARIANCE); + DBN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Batchnorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Batchnorm_backward_attributes::output_names::DBIAS); - DBN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Batchnorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Batchnorm_backward_attributes::output_names::DBIAS); + DBN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Batchnorm_backward_attributes::output_names::DX); - DBN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Batchnorm_backward_attributes::output_names::DX); + 
DBN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - DBN_operation_builder.setPeerStatTensor(peer_stats); + DBN_operation_builder.setPeerStatTensor(peer_stats); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DBN_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DBN_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -181,6 +193,7 @@ class DBNNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "DBN"})"_json); } }; diff --git a/include/cudnn_frontend/node/dbn_weight.h b/include/cudnn_frontend/node/dbn_weight.h index e4908025..dda9daee 100644 --- a/include/cudnn_frontend/node/dbn_weight.h +++ b/include/cudnn_frontend/node/dbn_weight.h @@ -79,6 +79,11 @@ class DBNWeightNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t post_validate_node() const override final { // Validate outputs @@ -89,21 +94,22 @@ class DBNWeightNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNWeightNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -118,49 +124,55 @@ class DBNWeightNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DBNWeightNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the batchnorm operation. 
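Tensor creation in each node now threads through the set of UIDs the user pre-assigned on tensor attributes: `collect_pre_assigned_uids()` gathers them via `get_prefilled_uids(...)`, and `create_cudnn_tensors(...)` receives them as `invalid_uids`, presumably so automatically assigned UIDs never collide with user-chosen ones. A standalone sketch of that allocation rule (the helper name and exact skipping logic are illustrative, not the library's implementation):

```
#include <cstdint>
#include <unordered_set>

// Hand out the next UID that is not already claimed by a pre-assigned tensor.
int64_t
next_free_uid(int64_t& uid, std::unordered_set<int64_t> const& invalid_uids) {
    while (invalid_uids.count(uid) != 0) {
        ++uid;  // skip UIDs the user already claimed
    }
    return uid++;  // assign the current value and advance past it
}
// e.g. with invalid_uids = {1, 2} and uid starting at 1, successive calls yield 3, 4, 5, ...
```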
+ auto&& batchnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR); + + batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_DY, DBN_weight_attributes::output_names::EQ_SCALE_DY); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_X, DBN_weight_attributes::output_names::EQ_SCALE_X); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, DBN_weight_attributes::output_names::EQ_BIAS); + batchnorm_operation_builder.setEqScalesAndBias(*(tensors.at(EQ_SCALE_DY->second->get_uid())), + *(tensors.at(EQ_SCALE_X->second->get_uid())), + *(tensors.at(EQ_BIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, DBN_weight_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, DBN_weight_attributes::input_names::INV_VARIANCE); + batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, DBN_weight_attributes::input_names::SCALE); + batchnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, DBN_weight_attributes::input_names::X); + batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, DBN_weight_attributes::input_names::DY); + batchnorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, DBN_weight_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, DBN_weight_attributes::output_names::DBIAS); + batchnorm_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = batchnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the batchnorm operation. 
- auto&& batchnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR); - - batchnorm_operation_builder.setComputeType(CUDNN_DATA_FLOAT); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_DY, DBN_weight_attributes::output_names::EQ_SCALE_DY); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_X, DBN_weight_attributes::output_names::EQ_SCALE_X); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, DBN_weight_attributes::output_names::EQ_BIAS); - batchnorm_operation_builder.setEqScalesAndBias(*(tensors.at(EQ_SCALE_DY->second->get_uid())), - *(tensors.at(EQ_SCALE_X->second->get_uid())), - *(tensors.at(EQ_BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, DBN_weight_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, DBN_weight_attributes::input_names::INV_VARIANCE); - batchnorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, DBN_weight_attributes::input_names::SCALE); - batchnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, DBN_weight_attributes::input_names::X); - batchnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, DBN_weight_attributes::input_names::DY); - batchnorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, DBN_weight_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, DBN_weight_attributes::output_names::DBIAS); - batchnorm_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); - auto operation = batchnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -172,6 +184,7 @@ class DBNWeightNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "DBN_WEIGHT"})"_json); } }; diff --git a/include/cudnn_frontend/node/dln.h b/include/cudnn_frontend/node/dln.h index b2f38e0f..9d4ebbb9 100644 --- a/include/cudnn_frontend/node/dln.h +++ b/include/cudnn_frontend/node/dln.h @@ -107,6 +107,11 @@ class DLNNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t post_validate_node() const override final { // Validate outputs @@ -117,26 +122,27 @@ class DLNNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DLNNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } if (epsilon) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(epsilon, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(epsilon, uid, tensors, invalid_uids)); } return {error_code_t::OK, ""}; @@ -150,51 +156,56 @@ class DLNNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DLNNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - // Create the DLN operation. - auto&& DLN_op_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + // Create the DLN operation. + auto&& DLN_op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - DLN_op_builder.setNormalizationMode(NormMode_t::LAYER_NORM); + DLN_op_builder.setNormalizationMode(NormMode_t::LAYER_NORM); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_backward_attributes::input_names::X); - DLN_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_backward_attributes::input_names::X); + DLN_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Layernorm_backward_attributes::input_names::DY); - DLN_op_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Layernorm_backward_attributes::input_names::DY); + DLN_op_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_backward_attributes::input_names::SCALE); - DLN_op_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_backward_attributes::input_names::SCALE); + DLN_op_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Layernorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Layernorm_backward_attributes::input_names::INV_VARIANCE); - DLN_op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Layernorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Layernorm_backward_attributes::input_names::INV_VARIANCE); + DLN_op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Layernorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Layernorm_backward_attributes::output_names::DBIAS); - DLN_op_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Layernorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, 
Layernorm_backward_attributes::output_names::DBIAS); + DLN_op_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); - DLN_op_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); + DLN_op_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - if (epsilon) { - DLN_op_builder.setEpsilonTensor(*(tensors.at(epsilon->get_uid()))); - uids_involved_in_operations.insert(epsilon->get_uid()); - } + if (epsilon) { + DLN_op_builder.setEpsilonTensor(*(tensors.at(epsilon->get_uid()))); + uids_involved_in_operations.insert(epsilon->get_uid()); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DLN_op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DLN_op_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -206,17 +217,14 @@ class DLNNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "LAYER_NORM_BPROP"})"_json); } error_t - pass_by_value_tensors_( - cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void*) const override final { + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { if (epsilon) { // can pass in any dummy value - tensor_to_pass_by_value.emplace(epsilon, 0.0f); + tensor_to_pass_by_value.emplace(epsilon->get_uid(), 0.0f); } return {error_code_t::OK, ""}; } diff --git a/include/cudnn_frontend/node/genstats.h b/include/cudnn_frontend/node/genstats.h index 2cd5f21a..2703dec6 100644 --- a/include/cudnn_frontend/node/genstats.h +++ b/include/cudnn_frontend/node/genstats.h @@ -21,6 +21,11 @@ class GenstatsNode : public INode { return Type::GENSTATS; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -80,21 +85,22 @@ class GenstatsNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building GenstatsNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -108,31 +114,36 @@ class GenstatsNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building GenstatsNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& genstats_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_GEN_STATS_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Genstats_attributes::input_names::X); + genstats_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + genstats_operation_builder.setGenStatsMode(CUDNN_GENSTATS_SUM_SQSUM); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SUM, Genstats_attributes::output_names::SUM); + genstats_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SQ_SUM, Genstats_attributes::output_names::SQ_SUM); + genstats_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = genstats_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto&& genstats_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_GEN_STATS_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Genstats_attributes::input_names::X); - genstats_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - genstats_operation_builder.setGenStatsMode(CUDNN_GENSTATS_SUM_SQSUM); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SUM, Genstats_attributes::output_names::SUM); - genstats_operation_builder.setSumDesc(*(tensors.at(SUM->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SQ_SUM, Genstats_attributes::output_names::SQ_SUM); - genstats_operation_builder.setSqSumDesc(*(tensors.at(SQ_SUM->second->get_uid()))); - auto operation = genstats_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -144,6 +155,7 @@ class GenstatsNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "GENSTATS"})"_json); } }; diff --git a/include/cudnn_frontend/node/instancenorm.h b/include/cudnn_frontend/node/instancenorm.h index 0a3d4c33..c8f1b075 100644 --- a/include/cudnn_frontend/node/instancenorm.h +++ b/include/cudnn_frontend/node/instancenorm.h @@ -103,21 +103,22 @@ class InstanceNormNode : public INode { } error_t - 
create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building InstanceNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -130,44 +131,50 @@ class InstanceNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building InstanceNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto&& op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - auto&& op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + op_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); - op_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); + op_builder.setNormFwdPhase(attributes.forward_phase); - op_builder.setNormFwdPhase(attributes.forward_phase); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_attributes::input_names::X); + op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_attributes::input_names::X); - op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Instancenorm_attributes::input_names::BIAS); + op_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), *(tensors.at(BIAS->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Instancenorm_attributes::input_names::BIAS); - op_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), *(tensors.at(BIAS->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Instancenorm_attributes::input_names::EPSILON); + op_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Instancenorm_attributes::input_names::EPSILON); - op_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Instancenorm_attributes::output_names::Y); + op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Instancenorm_attributes::output_names::Y); - op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Instancenorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Instancenorm_attributes::output_names::INV_VARIANCE); - op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - } + if 
(attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Instancenorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, + Instancenorm_attributes::output_names::INV_VARIANCE); + op_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = op_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -179,6 +186,12 @@ class InstanceNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM"})"_json); + } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); } }; @@ -284,21 +297,27 @@ class DINNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DINode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -312,46 +331,52 @@ class DINNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DINode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + // Create the DIN operation. 
+ auto&& DIN_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + + DIN_operation_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_backward_attributes::input_names::X); + DIN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Instancenorm_backward_attributes::input_names::DY); + DIN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_backward_attributes::input_names::SCALE); + DIN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Instancenorm_backward_attributes::input_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Instancenorm_backward_attributes::input_names::INV_VARIANCE); + DIN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Instancenorm_backward_attributes::output_names::DSCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Instancenorm_backward_attributes::output_names::DBIAS); + DIN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), + *(tensors.at(DBIAS->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Instancenorm_backward_attributes::output_names::DX); + DIN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = DIN_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - // Create the DIN operation. 
- auto&& DIN_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - - DIN_operation_builder.setNormalizationMode(NormMode_t::INSTANCE_NORM); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_backward_attributes::input_names::X); - DIN_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Instancenorm_backward_attributes::input_names::DY); - DIN_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_backward_attributes::input_names::SCALE); - DIN_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Instancenorm_backward_attributes::input_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Instancenorm_backward_attributes::input_names::INV_VARIANCE); - DIN_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Instancenorm_backward_attributes::output_names::DSCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Instancenorm_backward_attributes::output_names::DBIAS); - DIN_operation_builder.setDScaleAndDBias(*(tensors.at(DSCALE->second->get_uid())), - *(tensors.at(DBIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Instancenorm_backward_attributes::output_names::DX); - DIN_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - auto operation = DIN_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -363,6 +388,7 @@ class DINNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM_BPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/layernorm.h b/include/cudnn_frontend/node/layernorm.h index 845ecdd1..27e1ac7b 100644 --- a/include/cudnn_frontend/node/layernorm.h +++ b/include/cudnn_frontend/node/layernorm.h @@ -21,6 +21,11 @@ class LayerNormNode : public INode { return Type::LAYERNORM; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for layernorm node " << attributes.name << "..." @@ -147,21 +152,22 @@ class LayerNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building LayerNormNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -174,43 +180,48 @@ class LayerNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building LayerNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - auto&& layernorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - layernorm_operation_builder.setNormalizationMode(NormMode_t::LAYER_NORM) - .setNormFwdPhase(attributes.forward_phase); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_attributes::input_names::X); - layernorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_attributes::input_names::SCALE); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Layernorm_attributes::input_names::BIAS); - layernorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), - *(tensors.at(BIAS->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_attributes::input_names::EPSILON); - layernorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Layernorm_attributes::output_names::Y); - layernorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Layernorm_attributes::output_names::MEAN); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Layernorm_attributes::output_names::INV_VARIANCE); - layernorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), - *(tensors.at(INV_VARIANCE->second->get_uid()))); - } + auto&& layernorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + layernorm_operation_builder.setNormalizationMode(NormMode_t::LAYER_NORM) + .setNormFwdPhase(attributes.forward_phase); - auto operation = layernorm_operation_builder.build(); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_attributes::input_names::X); + layernorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - operations.push_back(std::make_shared(std::move(operation))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_attributes::input_names::SCALE); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Layernorm_attributes::input_names::BIAS); + layernorm_operation_builder.setScaleAndBias(*(tensors.at(SCALE->second->get_uid())), + *(tensors.at(BIAS->second->get_uid()))); -#ifndef NV_CUDNN_DISABLE_EXCEPTION + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_attributes::input_names::EPSILON); + layernorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Layernorm_attributes::output_names::Y); + layernorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + + if (attributes.forward_phase == 
NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Layernorm_attributes::output_names::MEAN); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Layernorm_attributes::output_names::INV_VARIANCE); + layernorm_operation_builder.setSavedMeanAndInvVar(*(tensors.at(MEAN->second->get_uid())), + *(tensors.at(INV_VARIANCE->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = layernorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = layernorm_operation_builder.build(); + operations.push_back(std::make_shared(std::move(operation))); } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -222,6 +233,7 @@ class LayerNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "LAYER_NORM"})"_json); } }; diff --git a/include/cudnn_frontend/node/matmul.h b/include/cudnn_frontend/node/matmul.h index 7c4b57a0..aa9fa959 100644 --- a/include/cudnn_frontend/node/matmul.h +++ b/include/cudnn_frontend/node/matmul.h @@ -21,6 +21,11 @@ class MatmulNode : public INode { return Type::MATMUL; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -86,21 +91,22 @@ class MatmulNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building MatmulNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -115,51 +121,57 @@ class MatmulNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building MatmulNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - // matmul descriptor - auto matmul_descriptor = cudnn_frontend::MatMulDescBuilder() - .setComputeType(attributes.compute_data_type) - .setPaddingValue(attributes.padding_value) - .build(); + // matmul descriptor + auto matmul_descriptor = cudnn_frontend::MatMulDescBuilder() + .setComputeType(attributes.compute_data_type) + .setPaddingValue(attributes.padding_value) + .build(); - auto&& matmul_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_MATMUL_DESCRIPTOR); + auto&& matmul_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_MATMUL_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(A, Matmul_attributes::input_names::A); - matmul_operation_builder.setaMatDesc(*tensors.at(A->second->get_uid())); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(A, Matmul_attributes::input_names::A); + matmul_operation_builder.setaMatDesc(*tensors.at(A->second->get_uid())); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(B, Matmul_attributes::input_names::B); - matmul_operation_builder.setbMatDesc(*tensors.at(B->second->get_uid())); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(B, Matmul_attributes::input_names::B); + matmul_operation_builder.setbMatDesc(*tensors.at(B->second->get_uid())); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(C, Matmul_attributes::output_names::C); - matmul_operation_builder.setcMatDesc(*tensors.at(C->second->get_uid())); - matmul_operation_builder.setmatmulDesc(matmul_descriptor); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(C, Matmul_attributes::output_names::C); + matmul_operation_builder.setcMatDesc(*tensors.at(C->second->get_uid())); + matmul_operation_builder.setmatmulDesc(matmul_descriptor); - auto M_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); - if ((M_override != attributes.inputs.end()) && (M_override->second != nullptr)) { - matmul_operation_builder.setmOverrideDesc(*tensors.at(M_override->second->get_uid())); - } + auto M_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); + if ((M_override != attributes.inputs.end()) && (M_override->second != nullptr)) { + matmul_operation_builder.setmOverrideDesc(*tensors.at(M_override->second->get_uid())); + } - auto N_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); - if ((N_override != attributes.inputs.end()) && (N_override->second != nullptr)) { - matmul_operation_builder.setnOverrideDesc(*tensors.at(N_override->second->get_uid())); - } + auto N_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); + if ((N_override != attributes.inputs.end()) && (N_override->second != nullptr)) { + matmul_operation_builder.setnOverrideDesc(*tensors.at(N_override->second->get_uid())); + } - auto K_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); - if ((K_override != attributes.inputs.end()) && (K_override->second != nullptr)) { - matmul_operation_builder.setkOverrideDesc(*tensors.at(K_override->second->get_uid())); - } + auto K_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); + if ((K_override != attributes.inputs.end()) && (K_override->second != nullptr)) { + matmul_operation_builder.setkOverrideDesc(*tensors.at(K_override->second->get_uid())); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
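The two comments above and the status check just below are one instance of the error-handling pattern this patch applies to every node's create_cudnn_operations(): configure the builder unconditionally, then either check the built descriptor's status (when NV_CUDNN_DISABLE_EXCEPTION is defined) or catch cudnnException and turn it into a returned error. A condensed, standalone sketch of that pattern is shown here for reference; it is illustrative only -- `finalize_operation`, `BuilderT`, and `OperationT` are placeholder names introduced for this sketch, not identifiers from the patch, which inlines this logic with each node's own builder.

```
// Illustrative sketch of the dual build path (not part of the patch).
// Assumes the cudnn_frontend::graph namespace, as in the node headers above.
// BuilderT is any cudnn_frontend operation builder; OperationT is the element
// type stored in the node's `operations` vector.
template <typename BuilderT, typename OperationT>
error_t
finalize_operation(BuilderT& builder, std::vector<std::shared_ptr<OperationT>>& operations) {
#ifdef NV_CUDNN_DISABLE_EXCEPTION
    // Exceptions are compiled out: build() cannot throw, so inspect the
    // descriptor status and convert a failure into a returned error.
    auto operation = builder.build();
    RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS,
                                   error_code_t::CUDNN_BACKEND_API_FAILED,
                                   operation.get_error());
    operations.push_back(std::make_shared<OperationT>(std::move(operation)));
#else
    // Exceptions are enabled: build() may throw cudnnException; catch it and
    // report the failure through the returned error instead of re-throwing.
    try {
        auto operation = builder.build();
        operations.push_back(std::make_shared<OperationT>(std::move(operation)));
    } catch (cudnn_frontend::cudnnException& e) {
        RETURN_CUDNN_FRONTEND_ERROR_IF(
            e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what());
    }
#endif
    return {error_code_t::OK, ""};
}
```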
+ auto operation = matmul_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = matmul_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -171,6 +183,7 @@ class MatmulNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "MATMUL"})"_json); } }; diff --git a/include/cudnn_frontend/node/pointwise.h b/include/cudnn_frontend/node/pointwise.h index b16beb40..861d98f7 100644 --- a/include/cudnn_frontend/node/pointwise.h +++ b/include/cudnn_frontend/node/pointwise.h @@ -21,6 +21,11 @@ class PointwiseNode : public INode { return Type::POINTWISE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -81,21 +86,22 @@ class PointwiseNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building PointwiseNode " << attributes.name << " tensors X:" << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -110,57 +116,63 @@ class PointwiseNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building PointwiseNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto pointwise_descriptor = cudnn_frontend::PointwiseDescBuilder() + .setAxis(attributes.get_axis().value_or(-1)) + .setReluLowerClipSlope(attributes.relu_lower_clip_slope.value_or(0.0)) + .setComputeType(attributes.compute_data_type) + .setMode(attributes.mode) + .build(); + + auto const port_count = get_pointwise_mode_port_count(attributes.mode); - auto pointwise_descriptor = cudnn_frontend::PointwiseDescBuilder() - .setAxis(attributes.get_axis().value_or(-1)) - .setReluLowerClipSlope(attributes.relu_lower_clip_slope.value_or(0.0)) - .setComputeType(attributes.compute_data_type) - .setMode(attributes.mode) - .build(); + auto&& pointwise_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_POINTWISE_DESCRIPTOR); + pointwise_operation_builder.setpwDesc(pointwise_descriptor); - auto const port_count = get_pointwise_mode_port_count(attributes.mode); + if (detail::is_activation_backward_mode(attributes.mode)) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + pointwise_operation_builder.setdyDesc(*(tensors.at(IN_0->second->get_uid()))); - auto&& pointwise_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_POINTWISE_DESCRIPTOR); - pointwise_operation_builder.setpwDesc(pointwise_descriptor); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); + pointwise_operation_builder.setxDesc(*(tensors.at(IN_1->second->get_uid()))); - if (detail::is_activation_backward_mode(attributes.mode)) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); - pointwise_operation_builder.setdyDesc(*(tensors.at(IN_0->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + pointwise_operation_builder.setdxDesc(*(tensors.at(OUT_0->second->get_uid()))); + } else { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + pointwise_operation_builder.setxDesc(*(tensors.at(IN_0->second->get_uid()))); + if (port_count >= 3) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); - pointwise_operation_builder.setxDesc(*(tensors.at(IN_1->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); - pointwise_operation_builder.setdxDesc(*(tensors.at(OUT_0->second->get_uid()))); - } else { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); - pointwise_operation_builder.setxDesc(*(tensors.at(IN_0->second->get_uid()))); - - if (port_count >= 3) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); - pointwise_operation_builder.setbDesc(*(tensors.at(IN_1->second->get_uid()))); - } - - if (port_count >= 4) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_2, Pointwise_attributes::input_names::IN_2); - pointwise_operation_builder.settDesc(*(tensors.at(IN_2->second->get_uid()))); - } - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); - pointwise_operation_builder.setyDesc(*(tensors.at(OUT_0->second->get_uid()))); + pointwise_operation_builder.setbDesc(*(tensors.at(IN_1->second->get_uid()))); } - auto operation = pointwise_operation_builder.build(); + if (port_count >= 4) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_2, Pointwise_attributes::input_names::IN_2); + 
pointwise_operation_builder.settDesc(*(tensors.at(IN_2->second->get_uid()))); + } - operations.push_back(std::make_shared(std::move(operation))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + pointwise_operation_builder.setyDesc(*(tensors.at(OUT_0->second->get_uid()))); + } -#ifndef NV_CUDNN_DISABLE_EXCEPTION +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = pointwise_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = pointwise_operation_builder.build(); + operations.push_back(std::make_shared(std::move(operation))); } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -172,6 +184,7 @@ class PointwiseNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "POINTWISE"})"_json); } }; diff --git a/include/cudnn_frontend/node/reduction.h b/include/cudnn_frontend/node/reduction.h index c11bc863..45bd3b1f 100644 --- a/include/cudnn_frontend/node/reduction.h +++ b/include/cudnn_frontend/node/reduction.h @@ -70,21 +70,27 @@ class ReductionNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building ReductionNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -98,33 +104,39 @@ class ReductionNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ReductionNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto reduction_descriptor = cudnn_frontend::ReductionDescBuilder() + .setComputeType(attributes.compute_data_type) + .setReductionOp(attributes.get_mode().value()) + .build(); + + auto&& reduction_operation_builder = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reduction_attributes::input_names::X); + reduction_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reduction_attributes::output_names::Y); + reduction_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + + reduction_operation_builder.setreductionDesc(reduction_descriptor); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = reduction_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch try { -#endif - - auto reduction_descriptor = cudnn_frontend::ReductionDescBuilder() - .setComputeType(attributes.compute_data_type) - .setReductionOp(attributes.get_mode().value()) - .build(); - - auto&& reduction_operation_builder = - cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reduction_attributes::input_names::X); - reduction_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reduction_attributes::output_names::Y); - reduction_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - - reduction_operation_builder.setreductionDesc(reduction_descriptor); - auto operation = reduction_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -136,6 +148,7 @@ class ReductionNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"({"tag": "REDUCTION"})"_json); } }; diff --git a/include/cudnn_frontend/node/reshape.h b/include/cudnn_frontend/node/reshape.h index ca6a9fce..f9d9686f 100644 --- a/include/cudnn_frontend/node/reshape.h +++ b/include/cudnn_frontend/node/reshape.h @@ -19,6 +19,11 @@ class ReshapeNode : public INode { return Type::RESHAPE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -82,21 +87,22 @@ class ReshapeNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building Reshape tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -110,28 +116,35 @@ class ReshapeNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building ReshapeNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - auto&& reshape_op_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR); + auto&& reshape_op_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reshape_attributes::input_names::X); - reshape_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reshape_attributes::input_names::X); + reshape_op_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reshape_attributes::output_names::Y); - reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reshape_attributes::output_names::Y); + reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - auto operation = reshape_op_builder.build(); + reshape_op_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = reshape_op_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { + auto operation = reshape_op_builder.build(); operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -140,6 +153,7 @@ class ReshapeNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RESHAPE"})"_json); } }; diff --git a/include/cudnn_frontend/node/rmsnorm.h b/include/cudnn_frontend/node/rmsnorm.h index 13380c0c..23cc23ad 100644 --- a/include/cudnn_frontend/node/rmsnorm.h +++ b/include/cudnn_frontend/node/rmsnorm.h @@ -88,25 +88,27 @@ class RMSNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building RMSNormNode tensors " << attributes.name << "..." 
<< std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; } + error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, @@ -115,46 +117,49 @@ class RMSNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building RMSNormNode operations " << attributes.name << "..." << std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - auto&& rmsnorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); + auto&& rmsnorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR); - rmsnorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM) - .setNormFwdPhase(attributes.forward_phase); + rmsnorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM).setNormFwdPhase(attributes.forward_phase); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_attributes::input_names::X); - rmsnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_attributes::input_names::X); + rmsnorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_attributes::input_names::SCALE); - rmsnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_attributes::input_names::SCALE); + rmsnorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Rmsnorm_attributes::input_names::EPSILON); - rmsnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Rmsnorm_attributes::input_names::EPSILON); + rmsnorm_operation_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rmsnorm_attributes::output_names::Y); - rmsnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rmsnorm_attributes::output_names::Y); + rmsnorm_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, - Rmsnorm_attributes::output_names::INV_VARIANCE); - rmsnorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); - } - - auto BIAS = attributes.inputs.find(Rmsnorm_attributes::input_names::BIAS); - if ((BIAS != attributes.inputs.end()) && (BIAS->second != nullptr)) { - rmsnorm_operation_builder.setBias(*(tensors.at(BIAS->second->get_uid()))); - } + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Rmsnorm_attributes::output_names::INV_VARIANCE); + rmsnorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); + } + auto BIAS = attributes.inputs.find(Rmsnorm_attributes::input_names::BIAS); + if ((BIAS != 
attributes.inputs.end()) && (BIAS->second != nullptr)) { + rmsnorm_operation_builder.setBias(*(tensors.at(BIAS->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto operation = rmsnorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = rmsnorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif @@ -166,6 +171,12 @@ class RMSNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RMS_NORM"})"_json); + } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); } }; @@ -195,6 +206,11 @@ class DRMSNormNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DRMSNorm node " << attributes.name << "..." @@ -271,21 +287,22 @@ class DRMSNormNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building DRMSNormNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } @@ -300,49 +317,52 @@ class DRMSNormNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building DRMSNormNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif + auto&& DRMSNorm_operation_builder = + cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); - auto&& DRMSNorm_operation_builder = - cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR); + DRMSNorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM); - DRMSNorm_operation_builder.setNormalizationMode(NormMode_t::RMS_NORM); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_backward_attributes::input_names::X); + DRMSNorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_backward_attributes::input_names::X); - DRMSNorm_operation_builder.setxDesc(*(tensors.at(X->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Rmsnorm_backward_attributes::input_names::DY); + DRMSNorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Rmsnorm_backward_attributes::input_names::DY); - DRMSNorm_operation_builder.setdyDesc(*(tensors.at(DY->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_backward_attributes::input_names::SCALE); + DRMSNorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_backward_attributes::input_names::SCALE); - DRMSNorm_operation_builder.setScale(*(tensors.at(SCALE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, Rmsnorm_backward_attributes::input_names::INV_VARIANCE); + DRMSNorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, - Rmsnorm_backward_attributes::input_names::INV_VARIANCE); - DRMSNorm_operation_builder.setSavedInvVar(*(tensors.at(INV_VARIANCE->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Rmsnorm_backward_attributes::output_names::DSCALE); + DRMSNorm_operation_builder.setDScale(*(tensors.at(DSCALE->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Rmsnorm_backward_attributes::output_names::DSCALE); - DRMSNorm_operation_builder.setDScale(*(tensors.at(DSCALE->second->get_uid()))); - - if (attributes.use_dbias.value()) { - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Rmsnorm_backward_attributes::output_names::DBIAS); - DRMSNorm_operation_builder.setDBias(*(tensors.at(DBIAS->second->get_uid()))); - } - - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Rmsnorm_backward_attributes::output_names::DX); - DRMSNorm_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); + if (attributes.use_dbias.value()) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Rmsnorm_backward_attributes::output_names::DBIAS); + DRMSNorm_operation_builder.setDBias(*(tensors.at(DBIAS->second->get_uid()))); + } + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Rmsnorm_backward_attributes::output_names::DX); + DRMSNorm_operation_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = DRMSNorm_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = DRMSNorm_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -351,6 +371,7 @@ class DRMSNormNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RMS_NORM_BPROP"})"_json); } }; diff --git a/include/cudnn_frontend/node/rng.h b/include/cudnn_frontend/node/rng.h index 4e4993a6..92939251 100644 --- a/include/cudnn_frontend/node/rng.h +++ b/include/cudnn_frontend/node/rng.h @@ -36,21 +36,27 @@ class RngNode : public INode { } error_t - create_cudnn_tensors(int64_t& uid, std::unordered_map>& tensors) - const override final { + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + error_t + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& tensors, + std::unordered_set const& invalid_uids) const override final { getLogger() << "[cudnn_frontend] INFO: " << "Building RngNode tensors " << attributes.name << "..." << std::endl; for (auto const& [name, tensor] : attributes.inputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } for (auto const& [name, tensor] : attributes.outputs) { (void)name; if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); } } return {error_code_t::OK, ""}; @@ -108,46 +114,51 @@ class RngNode : public INode { getLogger() << "[cudnn_frontend] INFO: " << "Building RngNode operations " << attributes.name << "..." 
<< std::endl; -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - - RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.get_distribution() != RngDistribution_t::BERNOULLI, - error_code_t::ATTRIBUTE_NOT_SET, - "no other distribution except bernoulli supported."); + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.get_distribution() != RngDistribution_t::BERNOULLI, + error_code_t::ATTRIBUTE_NOT_SET, + "no other distribution except bernoulli supported."); - auto rng_descriptor = cudnn_frontend::RngDescBuilder() - .setRngDistribution(attributes.get_distribution()) - .setBernoulliDistProbability(attributes.get_bernoulli_probability().value()) - .build(); + auto rng_descriptor = cudnn_frontend::RngDescBuilder() + .setRngDistribution(attributes.get_distribution()) + .setBernoulliDistProbability(attributes.get_bernoulli_probability().value()) + .build(); - auto&& Rng_operation_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RNG_DESCRIPTOR); + auto&& Rng_operation_builder = cudnn_frontend::OperationBuilder(DescriptorType_t::OPERATION_RNG_DESCRIPTOR); - CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rng_attributes::output_names::Y); - Rng_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rng_attributes::output_names::Y); + Rng_operation_builder.setyDesc(*(tensors.at(Y->second->get_uid()))); - Rng_operation_builder.setRngDesc(rng_descriptor); + Rng_operation_builder.setRngDesc(rng_descriptor); - if (attributes.seed.has_value()) { - Rng_operation_builder.setSeed(attributes.get_seed().value()); - } else { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Seed, Rng_attributes::input_names::Seed); - Rng_operation_builder.setSeedDesc(*(tensors.at(Seed->second->get_uid()))); + if (attributes.seed.has_value()) { + Rng_operation_builder.setSeed(attributes.get_seed().value()); + } else { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Seed, Rng_attributes::input_names::Seed); + Rng_operation_builder.setSeedDesc(*(tensors.at(Seed->second->get_uid()))); - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Offset, Rng_attributes::input_names::Offset); - Rng_operation_builder.setOffsetDesc(*(tensors.at(Offset->second->get_uid()))); - } + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Offset, Rng_attributes::input_names::Offset); + Rng_operation_builder.setOffsetDesc(*(tensors.at(Offset->second->get_uid()))); + } +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto operation = Rng_operation_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(operation.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + operation.get_error()); + operations.push_back(std::make_shared(std::move(operation))); +#else + // build() can throw + // wrap in try catch + try { auto operation = Rng_operation_builder.build(); - operations.push_back(std::make_shared(std::move(operation))); - -#ifndef NV_CUDNN_DISABLE_EXCEPTION } catch (cudnn_frontend::cudnnException& e) { - throw cudnnException(e.what(), e.getCudnnStatus()); + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); } #endif - auto const& non_virtual_uids = attributes.get_non_virtual_uids(); uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); return {error_code_t::OK, ""}; @@ -156,6 +167,7 @@ class RngNode : public INode { virtual void serialize(json& j) const override final { j = attributes; + j.update(R"( {"tag": "RNG"})"_json); } }; diff --git a/include/cudnn_frontend/node/scaled_dot_product_attention.h b/include/cudnn_frontend/node/scaled_dot_product_attention.h index 296472b5..46f73417 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_attention.h @@ -326,16 +326,12 @@ class ScaledDotProductAttentionNode : public INode { } virtual error_t - pass_by_value_tensors_( - cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void*) const override final { + pass_by_value_tensors_(std::map& tensor_to_pass_by_value) const override final { half dropout_scale_value = options.dropout_scale; - tensor_to_pass_by_value.emplace(options.inputs.Dropout_scale, dropout_scale_value); + tensor_to_pass_by_value.emplace(options.inputs.Dropout_scale->get_uid(), dropout_scale_value); float negative_inf_value = std::numeric_limits::min(); - tensor_to_pass_by_value.emplace(negative_inf, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf->get_uid(), negative_inf_value); return {error_code_t::OK, ""}; } diff --git a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h index b0629514..240eda36 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h @@ -37,6 +37,11 @@ class SDPANode : public INode { return Type::COMPOSITE; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -81,6 +86,7 @@ class SDPANode : public INode { #undef CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE // validate backend limitations for the operation + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; int64_t h_v = attributes.inputs.at(input_names::V)->get_dim()[1]; @@ -121,9 +127,10 @@ class SDPANode : public INode { bool const has_dropout_mask = (dropout_mask != attributes.inputs.end()) && (dropout_mask->second != nullptr); bool const has_dropout = attributes.dropout_probability.has_value() || has_dropout_mask; - RETURN_CUDNN_FRONTEND_ERROR_IF(has_dropout, - 
error_code_t::GRAPH_NOT_SUPPORTED, - "s_kv not a multiple of 64 is not supported with cudnn version below 9.0.0"); + RETURN_CUDNN_FRONTEND_ERROR_IF( + has_dropout, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_kv not a multiple of 64 with dropout enabled is not supported with cudnn version below 9.0.0"); } if (((s_kv % 64 != 0) || (d_qk % 64 != 0)) && (cudnnGetVersion() <= 8905)) { @@ -163,6 +170,14 @@ class SDPANode : public INode { error_code_t::ATTRIBUTE_NOT_SET, "Intermediate tensor data type needs to be set as internal tensors require it."); + if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && + (cudnnGetVersion() < 90000)) { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " + "with cudnn version below 9.0.0"); + } + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -530,56 +545,58 @@ class SDPANode : public INode { } virtual error_t - pass_by_value_tensors_( - cudnnHandle_t handle, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* node_workspace) const override final { + workspace_modifications_tensors_( + std::unordered_map>>& workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_abili_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { if (attributes.dropout_probability.has_value() && attributes.dropout_probability.value() != 0.0) { #if CUDNN_VERSION < 8903 half dropout_scale_value = __float2half(1.0f / (1.0f - attributes.dropout_probability.value())); #else float dropout_scale_value = (1.0f / (1.0f - attributes.dropout_probability.value())); #endif - tensor_to_pass_by_value.emplace(dropout_scale, dropout_scale_value); + tensor_to_pass_by_value.emplace(dropout_scale->get_uid(), dropout_scale_value); } if (negative_inf_padding) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); } if (WAR_scalar_max_seq_kv) { auto const& v_dim = attributes.inputs.at(input_names::V)->get_dim(); int32_t s_kv = static_cast(v_dim[2]); - tensor_to_pass_by_value.emplace(WAR_scalar_max_seq_kv, s_kv); + tensor_to_pass_by_value.emplace(WAR_scalar_max_seq_kv->get_uid(), s_kv); } if (negative_inf_causal) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal, negative_inf_value); - } - - if (attributes.alibi_mask) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); - int64_t const h = Q->second->get_dim()[1]; - auto h_alibi_slopes_vector = detail::get_abili_slope(h); - int64_t alibi_slopes_size = h * sizeof(float); - - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - node_workspace, h_alibi_slopes_vector.data(), alibi_slopes_size, cudaMemcpyHostToDevice, stream)); - tensor_to_pass_by_value.emplace(alibi_slopes, node_workspace); + 
tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); } if (attributes.attn_scale_value.has_value()) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second, attributes.attn_scale_value.value()); + tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); } return {error_code_t::OK, ""}; } + + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_FWD"})"_json); + } }; class SDPABackwardNode : public INode { @@ -657,10 +674,18 @@ class SDPABackwardNode : public INode { // validate backend limitations for the operation int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; int64_t h_v = attributes.inputs.at(input_names::V)->get_dim()[1]; int64_t d_qk = attributes.inputs.at(input_names::Q)->get_dim()[3]; + int64_t s_kv = attributes.inputs.at(input_names::V)->get_dim()[2]; int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + (s_q < 64) && cudnnGetVersion() < 90000, + error_code_t::GRAPH_NOT_SUPPORTED, + "Sequence length must be greater than or equal to 64 for cudnn version prior to v9.0.0"); + RETURN_CUDNN_FRONTEND_ERROR_IF((h_q % h_k != 0) || (h_q % h_v != 0), error_code_t::GRAPH_NOT_SUPPORTED, "For group-query attention, number of heads for key and query must be a factor " @@ -678,8 +703,9 @@ class SDPABackwardNode : public INode { "attn_scale with tensor and value cannot be set at the same time."); // validate options for bias mask - auto bias_mask = attributes.inputs.find(input_names::Bias); - if (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr) { + auto bias_mask = attributes.inputs.find(input_names::Bias); + bool const has_bias = (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr); + if (has_bias) { auto bias_mask_dtype = bias_mask->second->get_data_type(); RETURN_CUDNN_FRONTEND_ERROR_IF((bias_mask_dtype == DataType_t::BOOLEAN), error_code_t::GRAPH_NOT_SUPPORTED, @@ -716,6 +742,14 @@ class SDPABackwardNode : public INode { error_code_t::ATTRIBUTE_NOT_SET, "Intermediate tensor data type needs to be set as internal tensors require it."); + if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && + (cudnnGetVersion() < 90000)) { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " + "with cudnn version below 9.0.0"); + } + CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); return {error_code_t::OK, ""}; } @@ -729,6 +763,11 @@ class SDPABackwardNode : public INode { return {error_code_t::OK, ""}; } + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + error_t expand_and_infer_properties() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for SDPABackwardNode " << attributes.name @@ -848,7 +887,7 @@ class SDPABackwardNode : public INode { struct cudaDeviceProp prop; CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0)); - if (cudnnGetVersion() >= 8905 && prop.major >= 9) { + if ((cudnnGetVersion() >= 8905 && prop.major >= 9) || (cudnnGetVersion() >= 9000)) { // default upper limit for 
workspace 256MB int64_t max_dp_workspace_bytes = 256 * 1024 * 1024; @@ -1089,7 +1128,7 @@ class SDPABackwardNode : public INode { attributes.inputs[input_names::Stats], Pointwise_attributes().set_name("sub_s_m").set_mode(PointwiseMode_t::SUB)); - // WAR: Explicitly putting the padding value again after the stats have been loaded + // WAR for bug 4475073 by explicitly putting the padding value again after the stats have been loaded if (attributes.padding_mask && cudnnGetVersion() >= 90000) { auto row_idx_output = pointwise(last_output, Pointwise_attributes() @@ -1163,7 +1202,7 @@ class SDPABackwardNode : public INode { // as reshape + matmul last_output = reshape(last_output, Reshape_attributes().set_name("reshape_p")); last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); if (h_q == h_v) { // for MHA @@ -1183,7 +1222,7 @@ class SDPABackwardNode : public INode { .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); last_output->set_dim({b, h_q, s_kv, d_v}).set_stride({h_q * s_kv * d_v, s_kv * d_v, d_v, 1}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); reduction(last_output, Reduction_attributes().set_name("red_dV_head").set_mode(ReductionMode_t::ADD), attributes.outputs[output_names::dV]); @@ -1197,7 +1236,7 @@ class SDPABackwardNode : public INode { Matmul_attributes() .set_name("matmul_dO_VT") .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) - .set_k_override(attributes.inputs[input_names::SEQ_LEN_KV])); + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV])); last_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); // last_output = last_output(dP) * mask @@ -1243,7 +1282,7 @@ class SDPABackwardNode : public INode { // as reshape + matmul last_output = reshape(last_output, Reshape_attributes().set_name("reshape_dS")); last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); if (h_q == h_k) { // for MHA @@ -1263,7 +1302,7 @@ class SDPABackwardNode : public INode { .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); last_output->set_dim({b, h_q, s_kv, d_qk}).set_stride({h_q * s_kv * d_qk, s_kv * d_qk, d_qk, 1}); - last_output->set_data_type(context.get_io_data_type()); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); reduction(last_output, Reduction_attributes().set_name("red_dK_head").set_mode(ReductionMode_t::ADD), attributes.outputs[output_names::dK]); @@ -1280,6 +1319,10 @@ class SDPABackwardNode : public INode { last_output->set_dim({kt_dim[0], kt_dim[1], kt_dim[3], kt_dim[2]}) .set_stride({kt_stride[0], kt_stride[1], kt_stride[3], kt_stride[2]}); + if (attributes.inputs[input_names::K]->get_ragged_offset() != nullptr) { + last_output->set_ragged_offset(attributes.inputs[input_names::K]->get_ragged_offset()); + } + matmul(dS_output, last_output, Matmul_attributes() @@ -1316,45 +1359,55 @@ class SDPABackwardNode : public INode { return alibi_slopes_size_padded + dQ_accum_size + softmax_sum_size; } + virtual error_t + 
workspace_modifications_tensors_( + std::unordered_map>>& workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_abili_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + int64_t alibi_slopes_size_padded = (alibi_slopes_size + 15) & ~15; + offset = offset + alibi_slopes_size_padded; + } + + if (dQ_accum && !dQ_accum->get_is_virtual()) { + std::vector f_vec = {(float)dQ_accum_size}; + workspace_modifications.emplace(dQ_accum->get_uid(), std::make_tuple(1, offset, f_vec)); + offset = offset + dQ_accum_size; + } + + if (softmax_sum && !softmax_sum->get_is_virtual()) { + // There is no requirement for softmax_sum to be memset to 0 + std::vector f_vec = {}; + workspace_modifications.emplace(softmax_sum->get_uid(), std::make_tuple(2, offset, f_vec)); + } + + return {error_code_t::OK, ""}; + } + error_t - pass_by_value_tensors_( - cudnnHandle_t handle, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* node_workspace) const override final { + pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { using input_names = SDPA_backward_attributes::input_names; if (one_tensor) { - tensor_to_pass_by_value.emplace(one_tensor, 1.0f); + tensor_to_pass_by_value.emplace(one_tensor->get_uid(), 1.0f); } if (attributes.attn_scale_value.has_value()) { CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second, attributes.attn_scale_value.value()); - } - - if (attributes.alibi_mask) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); - int64_t const h_q = Q->second->get_dim()[1]; - auto alibi_slopes_vec = detail::get_abili_slope(h_q); - int64_t alibi_slopes_size_padded = (alibi_slopes_size + 15) & ~15; - - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - node_workspace, alibi_slopes_vec.data(), alibi_slopes_size, cudaMemcpyHostToDevice, stream)); - tensor_to_pass_by_value.emplace(alibi_slopes, node_workspace); - node_workspace = static_cast(node_workspace) + alibi_slopes_size_padded; + tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); } if (attributes.padding_mask) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); } if (attributes.causal_mask) { float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal, negative_inf_value); + tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); } if (attributes.dropout_probability.has_value()) { @@ -1362,27 +1415,20 @@ class SDPABackwardNode : public INode { float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale, input_names::Dropout_scale); - tensor_to_pass_by_value.emplace(Dropout_scale->second, dropout_scale_value); + tensor_to_pass_by_value.emplace(Dropout_scale->second->get_uid(), dropout_scale_value); CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale_inv, 
input_names::Dropout_scale_inv); - tensor_to_pass_by_value.emplace(Dropout_scale_inv->second, dropout_scale_inv_value); - } - - if (dQ_accum && !dQ_accum->get_is_virtual()) { - cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); - CHECK_CUDA_ERROR(cudaMemsetAsync(node_workspace, 0, dQ_accum_size, stream)); - tensor_to_pass_by_value.emplace(dQ_accum, node_workspace); - node_workspace = static_cast(node_workspace) + dQ_accum_size; - } - - if (softmax_sum && !softmax_sum->get_is_virtual()) { - // There is no requirement for softmax_sum to be memset to 0 - tensor_to_pass_by_value.emplace(softmax_sum, node_workspace); + tensor_to_pass_by_value.emplace(Dropout_scale_inv->second->get_uid(), dropout_scale_inv_value); } return {error_code_t::OK, ""}; } + + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_BWD"})"_json); + } }; -} // namespace cudnn_frontend::graph +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/node/softmax.h b/include/cudnn_frontend/node/softmax.h index f821a7bd..dbd4963f 100644 --- a/include/cudnn_frontend/node/softmax.h +++ b/include/cudnn_frontend/node/softmax.h @@ -122,5 +122,15 @@ class SoftmaxNode : public INode { return {error_code_t::OK, ""}; } + + error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { + return attributes.get_prefilled_uids(pre_assigned_uids); + } + + virtual void + serialize(json& j) const override final { + j = attributes; + } }; } // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend/node_interface.h b/include/cudnn_frontend/node_interface.h index bd63a020..5ec2dfb4 100644 --- a/include/cudnn_frontend/node_interface.h +++ b/include/cudnn_frontend/node_interface.h @@ -38,6 +38,10 @@ class INode : public ICudnn { detail::Context context; private: + std::unordered_map deserialized_pass_by_value; + std::unordered_map>> deserialized_workspace_modifications; + int64_t fe_workspace_size = 0; + std::shared_ptr output_tensor(std::string const& name) { auto tensor = std::make_shared(); @@ -56,15 +60,21 @@ class INode : public ICudnn { virtual int64_t get_fe_workspace_size_node() const { - // Mostly no FE nodes have require workspace - return 0; + // Mostly no FE nodes have require workspace initiailized to 0 + return fe_workspace_size; } int64_t - get_cudnn_workspace_size() const { - int64_t cudnn_workspace_size = get_cudnn_workspace_size_node(); + get_cudnn_workspace_size(int64_t plan_index = -1) const { + int64_t cudnn_workspace_size = 0; + + auto status = get_cudnn_workspace_size_node(plan_index, cudnn_workspace_size); + if (status.is_bad()) { + getLogger() << "[cudnn_frontend] ERROR: Querying workspace failed." 
<< std::endl; + } + for (auto const& sub_node : sub_nodes) { - cudnn_workspace_size = std::max(cudnn_workspace_size, sub_node->get_cudnn_workspace_size()); + cudnn_workspace_size = std::max(cudnn_workspace_size, sub_node->get_cudnn_workspace_size(plan_index)); } return cudnn_workspace_size; } @@ -88,27 +98,79 @@ class INode : public ICudnn { } virtual error_t - pass_by_value_tensors_(cudnnHandle_t, - std::unordered_map, void*> const&, - std::unordered_map, pass_by_values_t>&, - void*) const { + pass_by_value_tensors_(std::unordered_map& pass_by_values) const { + for (auto [uid, value] : deserialized_pass_by_value) { + pass_by_values.emplace(uid, value); + } return {error_code_t::OK, ""}; } error_t - gather_pass_by_value_tensors( - cudnnHandle_t const& handle, - std::unordered_map, void*> const& tensor_to_pointer_map, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value, - void* fe_workspace) const { - void* node_workspace = fe_workspace; - CHECK_CUDNN_FRONTEND_ERROR( - pass_by_value_tensors_(handle, tensor_to_pointer_map, tensor_to_pass_by_value, node_workspace)); - node_workspace = static_cast(node_workspace) + get_fe_workspace_size_node(); + run_auxiliary_kernels( + cudnnHandle_t handle, + void* fe_workspace, + std::unordered_map>>& workspace_modifications) const { + cudaStream_t stream; + CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); + char* workspace = static_cast(fe_workspace); + + for (auto [uid, data] : workspace_modifications) { + (void)uid; + if (std::get<0>(data) == 0) { + auto& vec_data = std::get<2>(data); + CHECK_CUDA_ERROR(cudaMemcpyAsync(workspace + std::get<1>(data), + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream)); + } else if (std::get<0>(data) == 1) { + int64_t memset_size = (int64_t)std::get<2>(data)[0]; + CHECK_CUDA_ERROR(cudaMemsetAsync(workspace + std::get<1>(data), 0, memset_size, stream)); + } + } + return {error_code_t::OK, ""}; + } + + error_t + gather_pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const { + CHECK_CUDNN_FRONTEND_ERROR(pass_by_value_tensors_(tensor_to_pass_by_value)); for (auto const& sub_node : sub_nodes) { - CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_pass_by_value_tensors( - handle, tensor_to_pointer_map, tensor_to_pass_by_value, node_workspace)); - node_workspace = static_cast(node_workspace) + sub_node->get_fe_workspace_size_node(); + CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + workspace_modifications_tensors_( + std::unordered_map>>& worskspace_modifications, + int64_t&) const { + for (auto [uid, value] : deserialized_workspace_modifications) { + worskspace_modifications.emplace(uid, value); + } + return {error_code_t::OK, ""}; + } + + error_t + gather_workspace_modifications( + std::unordered_map>>& worskspace_modifications, + int64_t& offset) const { + CHECK_CUDNN_FRONTEND_ERROR(workspace_modifications_tensors_(worskspace_modifications, offset)); + offset = get_fe_workspace_size_node(); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->gather_workspace_modifications(worskspace_modifications, offset)); + offset += sub_node->get_fe_workspace_size_node(); + } + return {error_code_t::OK, ""}; + } + + error_t + extend_tensor_map_with_workspace_tensors_( + std::unordered_map& tensor_to_pointer_map, + void* workspace, + std::unordered_map>> const& worskspace_modifications) + const { + for (auto const& [uid, data] : 
worskspace_modifications) { + tensor_to_pointer_map.emplace(uid, static_cast(workspace) + std::get<1>(data)); } return {error_code_t::OK, ""}; } @@ -116,16 +178,16 @@ class INode : public ICudnn { error_t extend_tensor_map_with_pass_by_value_tensors_( std::unordered_map& tensor_to_pointer_map, - std::unordered_map, pass_by_values_t>& tensor_to_pass_by_value) const { - for (auto& [tensor, value] : tensor_to_pass_by_value) { + std::unordered_map& tensor_to_pass_by_value) const { + for (auto& [uid, value] : tensor_to_pass_by_value) { if (half* half_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), half_value_ptr); + tensor_to_pointer_map.emplace(uid, half_value_ptr); } else if (int32_t* int32_t_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), int32_t_value_ptr); + tensor_to_pointer_map.emplace(uid, int32_t_value_ptr); } else if (float* float_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), float_value_ptr); + tensor_to_pointer_map.emplace(uid, float_value_ptr); } else if (void** void_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(tensor->get_uid(), *void_value_ptr); + tensor_to_pointer_map.emplace(uid, *void_value_ptr); } else { RETURN_CUDNN_FRONTEND_ERROR_IF( true, error_code_t::INVALID_VARIANT_PACK, "Unexpected type for pass by value tensor."); @@ -242,11 +304,11 @@ class INode : public ICudnn { // Creates cudnn tensors for each node (and its sub nodes) virtual error_t - create_cudnn_tensors( - int64_t& uid, - std::unordered_map>& uid_to_backend_tensors) const { + create_cudnn_tensors(int64_t& uid, + std::unordered_map>& uid_to_backend_tensors, + std::unordered_set const& invalid_uids) const { for (auto const& sub_node : sub_nodes) { - CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid, uid_to_backend_tensors)); + CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid, uid_to_backend_tensors, invalid_uids)); } return {error_code_t::OK, ""}; } @@ -265,6 +327,14 @@ class INode : public ICudnn { return {error_code_t::OK, ""}; } + virtual error_t + collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const { + for (auto const& sub_node : sub_nodes) { + auto x = sub_node->collect_pre_assigned_uids(pre_assigned_uids); + } + return {error_code_t::OK, ""}; + } + // An implicitly topological-sorted vector of sub nodes. // The sorted order is a side effect of functional API. std::vector> sub_nodes; @@ -318,8 +388,14 @@ class INode : public ICudnn { // TODO: Maybe just use uid_to_tensors size as uid each time? int64_t uid = 1; + std::unordered_set pre_assigned_uids; + CHECK_CUDNN_FRONTEND_ERROR(collect_pre_assigned_uids(pre_assigned_uids)); + while (pre_assigned_uids.find(uid) != pre_assigned_uids.end()) { + uid++; + } + // Lower each sub node to cudnn backend. - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid, uid_to_tensors)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid, uid_to_tensors, pre_assigned_uids)); // INode needs to keep track of all uids that an operation graph uses. // This is because cudnn backend will not accept extra tensors in variant pack. 
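The hunks above reserve user-pinned tensor UIDs before auto-assignment: the running counter is advanced past anything in `pre_assigned_uids`, and `create_cudnn_tensors` is told which values are off-limits. As a rough standalone illustration of that allocation idea (the helper and its names are invented here, not part of the library):

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

// Hypothetical helper: return the next UID that is not already reserved.
static int64_t next_free_uid(int64_t uid, const std::unordered_set<int64_t>& reserved) {
    while (reserved.count(uid) != 0) {
        ++uid;
    }
    return uid;
}

int main() {
    // Suppose the user pre-assigned UIDs 1, 2 and 5 to some tensors.
    std::unordered_set<int64_t> reserved = {1, 2, 5};

    // Auto-assign UIDs to three more tensors, skipping the reserved values.
    std::vector<int64_t> assigned;
    int64_t uid = 1;
    for (int i = 0; i < 3; ++i) {
        uid = next_free_uid(uid, reserved);
        assigned.push_back(uid);
        ++uid;
    }

    for (int64_t u : assigned) {
        std::cout << u << " ";  // prints: 3 4 6
    }
    std::cout << "\n";
    return 0;
}
```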
@@ -344,6 +420,14 @@ class INode : public ICudnn { return get_fe_workspace_size() + get_cudnn_workspace_size(); } + int64_t + get_workspace_size_plan_at_index(int64_t plan_index) const { + // There are two workspaces: + // - cudnn execution plan workspace + // - FE node workspace (example: alibiSlope for fmha) + return get_fe_workspace_size() + get_cudnn_workspace_size(plan_index); + } + int64_t get_autotune_workspace_size() const { // There are two workspaces: @@ -352,56 +436,249 @@ class INode : public ICudnn { return get_fe_workspace_size() + get_max_cudnn_workspace_size(); } + error_t + autotune(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace, + void* user_impl = nullptr) { + // Add pass_by_value data pointers to tensor_uid_to_pointer map + // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during + // execute. + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + for (auto& plan_list : plans) { + CHECK_CUDNN_FRONTEND_ERROR( + plan_list.autotune(handle, tensor_uid_to_pointer_map, cudnn_workspace, user_impl)); + } + return {error_code_t::OK, ""}; + } + + error_t + autotune(cudnnHandle_t handle, + std::unordered_map, void*>& tensor_to_pointer_map, + void* workspace, + void* user_impl = nullptr) { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const& [tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return autotune(handle, tensor_uid_to_pointer_map, workspace, user_impl); + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map, void*>& tensor_to_pointer_map, + void* workspace, + int64_t plan_index) const { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const& [tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return execute_plan_at_index(handle, tensor_uid_to_pointer_map, workspace, plan_index); + } + error_t execute(cudnnHandle_t handle, - std::unordered_map, void*> const& tensor_to_pointer_map, + std::unordered_map, void*>& tensor_to_pointer_map, void* workspace) const { + // First get all the uids from the map std::unordered_map tensor_uid_to_pointer_map; for (auto const& [tensor, pointer] : tensor_to_pointer_map) { tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); } - std::unordered_map, pass_by_values_t> tensor_to_pass_by_value; - void* fe_workspace = workspace; - void* cudnn_workspace = static_cast(fe_workspace) + get_fe_workspace_size(); + return execute(handle, 
tensor_uid_to_pointer_map, workspace); + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace, + int64_t plan_index) const { + // Add pass_by_value data pointers to uid_to_pointer map + // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during + // execute. + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); CHECK_CUDNN_FRONTEND_ERROR( - gather_pass_by_value_tensors(handle, tensor_to_pointer_map, tensor_to_pass_by_value, fe_workspace)); + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); - // Add pass_by_value data pointers to tensor_uid_to_pointer map + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + CHECK_CUDNN_FRONTEND_ERROR( + execute_cudnn_plans_with_uid(handle, tensor_uid_to_pointer_map, cudnn_workspace, plan_index)); + + return {error_code_t::OK, ""}; + } + + error_t + execute(cudnnHandle_t handle, + std::unordered_map& tensor_uid_to_pointer_map, + void* workspace) const { + // Add pass_by_value data pointers to uid_to_pointer map // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during - // execute - for (auto& [tensor, value] : tensor_to_pass_by_value) { - if (half* half_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), half_value_ptr); - } else if (int32_t* int32_t_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), int32_t_value_ptr); - } else if (float* float_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), float_value_ptr); - } else if (void** void_value_ptr = std::get_if(&value)) { - tensor_uid_to_pointer_map.emplace(tensor->get_uid(), *void_value_ptr); - } else { - RETURN_CUDNN_FRONTEND_ERROR_IF( - true, error_code_t::INVALID_VARIANT_PACK, "Execute unexpected type for pass by value tensor."); + // execute. 
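As the comments in these hunks note, the user-provided buffer is treated as two regions: a frontend-managed prefix (alibi slopes, accumulators, memsets) followed by the space handed to the cuDNN execution plan, which is why execution offsets the pointer by `get_fe_workspace_size()` before running the backend plans. A small self-contained sketch of that layout arithmetic, with made-up sizes:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Illustrative sizes, in bytes.
    const int64_t fe_workspace_size    = 4096;       // frontend scratch (host->device copies, memsets)
    const int64_t cudnn_workspace_size = 256 << 10;  // what the selected execution plan reports

    const int64_t total = fe_workspace_size + cudnn_workspace_size;
    std::vector<std::byte> workspace(static_cast<size_t>(total));

    // The frontend uses the prefix of the buffer...
    std::byte* fe_region = workspace.data();
    // ...and the cuDNN backend gets whatever follows it.
    std::byte* cudnn_region = workspace.data() + fe_workspace_size;

    std::cout << "total bytes: " << total << "\n"
              << "fe region at offset 0, cudnn region at offset "
              << (cudnn_region - fe_region) << "\n";
    return 0;
}
```

Querying a plan-specific size only changes how large the second region has to be; the split itself stays the same.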
+ std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, tensor_to_pass_by_value)); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_workspace_tensors_(tensor_uid_to_pointer_map, workspace, workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void* cudnn_workspace = static_cast(workspace) + get_fe_workspace_size(); + + CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plans_with_uid(handle, tensor_uid_to_pointer_map, cudnn_workspace)); + + return {error_code_t::OK, ""}; + } + + error_t + deserialize(cudnnHandle_t handle, std::vector const& data) { + json j = json::from_ubjson(data); + auto serialized_plans = j["cudnn_backend_data"]; + if (serialized_plans.size() == 0) { + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "No plans in the serialized json"}; + } + + auto index = 0; + for (auto const& serialized_plan : serialized_plans) { + Execution_plan_list plan_list; + CHECK_CUDNN_FRONTEND_ERROR(plan_list.build_plans(handle, serialized_plan)); + plans.emplace_back(std::move(plan_list)); + std::unordered_set&& opgraph_variant_packs = j["variant_pack_uids"][index]; + variant_pack_uids.emplace_back(opgraph_variant_packs); + index++; + } + + std::unordered_map integer_pass_by_values; + std::unordered_map half_pass_by_values; + std::unordered_map float_pass_by_values; + + auto pass_by_value_tensors = j["pass_by_values"]; + for (auto i = 0u; i < pass_by_value_tensors.size(); i++) { + if (i == 0) { + integer_pass_by_values = pass_by_value_tensors[i].get>(); + } else if (i == 1) { + half_pass_by_values = pass_by_value_tensors[i].get>(); + } else if (i == 2) { + float_pass_by_values = pass_by_value_tensors[i].get>(); + } + } + + for (auto const& [uid, value] : integer_pass_by_values) { + deserialized_pass_by_value.emplace(uid, value); + } + for (auto const& [uid, value] : half_pass_by_values) { + deserialized_pass_by_value.emplace(uid, __float2half(value)); + } + for (auto const& [uid, value] : float_pass_by_values) { + deserialized_pass_by_value.emplace(uid, value); + } + + deserialized_workspace_modifications = j["workspace_modifications"]; + + fe_workspace_size = j["fe_workspace_size"]; + + return {error_code_t::OK, ""}; + } + + error_t + serialize(std::vector& data) const { + json j; + serialize(j); + j["cudnn_backend_data"]; + int index = 0; + for (auto& plan_list : plans) { + auto const candidate = plan_list.candidate; + auto execution_plan = plan_list.execution_plans[candidate]; + if (execution_plan != nullptr) { + auto serialized_plan = execution_plan->getJsonRepresentation(); + j["cudnn_backend_data"].push_back(serialized_plan); + j["variant_pack_uids"].push_back(variant_pack_uids[index]); + index++; } } - CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plans(handle, tensor_uid_to_pointer_map, cudnn_workspace)); + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); + + j["pass_by_values"]; + std::unordered_map integer_pass_by_values; 
+ std::unordered_map half_pass_by_values; + std::unordered_map float_pass_by_values; + // std::unordered_map void_ptr_pass_by_values; + for (auto const& [uid, pass_by_value] : tensor_to_pass_by_value) { + if (pass_by_value.index() == 0) { + integer_pass_by_values.emplace(uid, std::get<0>(pass_by_value)); + } else if (pass_by_value.index() == 1) { + half_pass_by_values.emplace(uid, __half2float(std::get<1>(pass_by_value))); + } else if (pass_by_value.index() == 2) { + float_pass_by_values.emplace(uid, std::get<2>(pass_by_value)); + } + } + // json j = half_pass_by_values; + j["pass_by_values"].push_back(integer_pass_by_values); + j["pass_by_values"].push_back(half_pass_by_values); + j["pass_by_values"].push_back(float_pass_by_values); + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); + + j["workspace_modifications"] = workspace_modifications; + j["fe_workspace_size"] = get_fe_workspace_size(); + + data = json::to_ubjson(j); return {error_code_t::OK, ""}; } INode(detail::Context const& context) : context(context) {} + // Make sure each node implements a public serialize function virtual void - serialize(json& j) const { - j["nodes"]; - for (auto const& sub_node : sub_nodes) { - json j_sub_node; - sub_node->serialize(j_sub_node); - j["nodes"].push_back(j_sub_node); - } - }; + serialize(json& j) const = 0; size_t key() { diff --git a/include/cudnn_frontend/plans.h b/include/cudnn_frontend/plans.h index 7f1e13ad..e9d66b9f 100644 --- a/include/cudnn_frontend/plans.h +++ b/include/cudnn_frontend/plans.h @@ -1,14 +1,68 @@ #pragma once +#include #include #include #include "../cudnn_frontend_EngineConfig.h" #include "../cudnn_frontend_Logging.h" +#include "graph_helpers.h" namespace cudnn_frontend { namespace detail { + +inline error_t +execute(cudnnHandle_t handle, + ExecutionPlan* plan, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr) { + // TODO: below line fails with MSVC. warning C4127: conditional expression is constant + // RETURN_CUDNN_FRONTEND_ERROR_IF(!plan, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); + getLogger() << "[cudnn_frontend] INFO: Executing " << plan->getTag() << "..." << std::endl; + + auto&& variant_pack_builder = VariantPackBuilder(); + variant_pack_builder.setDataPointers(device_ptrs.size(), device_ptrs.data()) + .setUids(uids.size(), uids.data()) + .setWorkspacePointer(workspace_ptr); + + cudnnBackendDescriptor_t raw_variant_pack = nullptr; +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto variant_pack = variant_pack_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::INVALID_VARIANT_PACK, + variant_pack.get_error()); + raw_variant_pack = variant_pack.get_raw_desc(); +#else + // build() can throw + // wrap in try catch + try { + auto variant_pack = variant_pack_builder.build(); + raw_variant_pack = variant_pack.get_raw_desc(); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::INVALID_VARIANT_PACK, e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". 
"; + getLogger() << error_code_t::INVALID_VARIANT_PACK << " because variant packing building failed at " << __FILE__ + << ":" << __LINE__ << "\n"; + return {error_code_t::INVALID_VARIANT_PACK, e.what()}; + } +#endif + + auto status = cudnnBackendExecute(handle, plan->get_raw_desc(), raw_variant_pack); + if (status != CUDNN_STATUS_SUCCESS) { + std::string message = "[cudnn_frontend] ERROR: Graph execution failed."; + return {error_code_t::GRAPH_EXECUTION_FAILED, message}; + } + getLogger() << "[cudnn_frontend] INFO: Executed " << plan->getTag() << "." << std::endl; + + return {error_code_t::OK, ""}; +} + inline error_t query_cudnn_heuristics_impl(std::shared_ptr const& operation_graph, cudnn_frontend::EngineConfigList& configs, @@ -17,7 +71,26 @@ query_cudnn_heuristics_impl(std::shared_ptr const& operation_ getLogger() << "[cudnn_frontend] INFO: " << " Getting plan from heuristics for " << operation_graph_tag << " ..." << std::endl; - auto statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); + std::vector statuses; +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); +#else + // build() can throw + // wrap in try catch + try { + statuses = cudnn_frontend::get_heuristics_list(modes, *operation_graph, allowAllConfig, configs, true); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::HEURISTIC_QUERY_FAILED, e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". "; + getLogger() << error_code_t::HEURISTIC_QUERY_FAILED << " because querying heuristics failed at " << __FILE__ + << ":" << __LINE__ << "\n"; + return {error_code_t::HEURISTIC_QUERY_FAILED, e.what()}; + } +#endif getLogger() << "[cudnn_frontend] INFO: get_heuristics_list statuses: "; for (size_t i = 0; i < statuses.size(); i++) { @@ -68,31 +141,73 @@ query_heuristics(std::vector> const& operatio inline error_t create_cudnn_execution_plan(std::shared_ptr& plan, - ManagedOpaqueDescriptor const& config, - std::string const& operation_graph_tag, + std::string const& serialized_data, cudnnHandle_t handle) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setHandle(handle); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto built_plan = plan_builder.loadFromJson(serialized_data); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch try { + auto built_plan = plan_builder.loadFromJson(serialized_data); + plan = std::make_shared(std::move(built_plan)); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". 
"; + getLogger() << error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; + } #endif - auto built_plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(config, operation_graph_tag) - .build(); - if (built_plan.get_status() != CUDNN_STATUS_SUCCESS) { - getLogger() << "[cudnn_frontend] ERROR: " - << "Config failed with " << built_plan.get_error() << std::endl; - return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Couldn't build plan from Config."}; - } - getLogger() << "[cudnn_frontend] INFO: Config succeeded! Plan has built!\n"; - getLogger() << "[cudnn_frontend] INFO: " << built_plan.describe() << std::endl; - plan = std::make_shared(std::move(built_plan)); + return {error_code_t::OK, ""}; +} -#ifndef NV_CUDNN_DISABLE_EXCEPTION +inline error_t +create_cudnn_execution_plan(std::shared_ptr& plan, + ManagedOpaqueDescriptor const& config, + std::string const& operation_graph_tag, + cudnnHandle_t handle) { + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setHandle(handle).setEngineConfig(config, operation_graph_tag); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto built_plan = plan_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch + try { + auto built_plan = plan_builder.build(); + plan = std::make_shared(std::move(built_plan)); } catch (cudnn_frontend::cudnnException& e) { - getLogger() << "[cudnn_frontend] ERROR: " - << "Config failed with " << e.getCudnnStatus() << " " << e.what() << std::endl; - return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Couldn't build plan from Config."}; + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + getLogger() << "[cudnn_frontend] ERROR: " << e.what() << ". "; + getLogger() << error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; } #endif @@ -104,18 +219,21 @@ create_cudnn_execution_plan(std::shared_ptr& plan, namespace graph { class Execution_plan_list { std::string operation_tag; - EngineConfigList engine_configs; std::vector> numeric_notes; std::vector> behavior_notes; - std::vector filtered_indices; + int64_t max_workspace_allowed = std::numeric_limits::max(); - std::shared_ptr candidate; + EngineConfigList engine_configs; public: std::vector> - execution_plans; // Filtered engine configs that have been made as plans + execution_plans; // a built plan corresponding to each engine config, irrespective of whether config is + // selected or deselected. 
+ + // Stores position of best plan in above vector of execution plan + int64_t candidate = -1; void set_tag(std::string const& tag) { @@ -135,7 +253,10 @@ class Execution_plan_list { query_properties() { numeric_notes.reserve(engine_configs.size()); behavior_notes.reserve(engine_configs.size()); - filtered_indices.resize(engine_configs.size()); + + filtered_indices.resize(engine_configs.size(), 0); + execution_plans.resize(engine_configs.size()); + for (auto& engine_config : engine_configs) { int64_t elem_count = 0; std::vector numerics; @@ -200,10 +321,17 @@ class Execution_plan_list { } error_t - filter_out_numeric_notes(std::vector const& notes) { - for (auto note : notes) { + deselect_numeric_notes(std::vector const& notes) { + for (auto& note : notes) { + cudnnBackendNumericalNote_t backend_note; + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::convert_to_cudnn_type(note, backend_note) != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Unexpected behaviour note provided."); + for (auto i = 0u; i < engine_configs.size(); i++) { - if (std::find(numeric_notes[i].begin(), numeric_notes[i].end(), note) != numeric_notes[i].end()) { + if (std::find(numeric_notes[i].begin(), numeric_notes[i].end(), backend_note) != + numeric_notes[i].end()) { filtered_indices[i] = true; } } @@ -212,10 +340,17 @@ class Execution_plan_list { } error_t - filter_out_behavior_notes(std::vector const& notes) { - for (auto note : notes) { + deselect_behavior_notes(std::vector const& notes) { + for (auto& note : notes) { + cudnnBackendBehaviorNote_t backend_note; + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::convert_to_cudnn_type(note, backend_note) != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Unexpected behaviour note provided."); + for (auto i = 0u; i < engine_configs.size(); i++) { - if (std::find(behavior_notes[i].begin(), behavior_notes[i].end(), note) != behavior_notes[i].end()) { + if (std::find(behavior_notes[i].begin(), behavior_notes[i].end(), backend_note) != + behavior_notes[i].end()) { filtered_indices[i] = true; } } @@ -245,76 +380,117 @@ class Execution_plan_list { error_t check_support(cudnnHandle_t handle) { - auto const& configs = get_filtered_engine_configs(); - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); - - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - RETURN_CUDNN_FRONTEND_ERROR_IF(execution_plans.size(), - error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, - "[cudnn_frontend] Check support or build called already."); - - // No plans should be pushed here. - // But check_support in v8 incurs compilation cost. - // If not pushed, build_plans will incur compilation cost again. 
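`deselect_numeric_notes` and `deselect_behavior_notes` above convert the frontend note enums to backend notes and then flag matching positions in `filtered_indices`; later build and check loops simply skip the flagged configs. A minimal standalone model of that filtering, using plain strings in place of the real note enums:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Per-config notes reported by a hypothetical heuristics query.
    std::vector<std::vector<std::string>> numeric_notes = {
        {"TENSOR_CORE"},                                  // config 0
        {"DOWN_CONVERT_INPUTS"},                          // config 1
        {"TENSOR_CORE", "REDUCED_PRECISION_REDUCTION"},   // config 2
    };

    // One flag per config; true means "deselected".
    std::vector<bool> filtered(numeric_notes.size(), false);

    // Deselect every config that carries a note the user rejects.
    const std::vector<std::string> rejected = {"DOWN_CONVERT_INPUTS"};
    for (const auto& note : rejected) {
        for (std::size_t i = 0; i < numeric_notes.size(); ++i) {
            const auto& notes = numeric_notes[i];
            if (std::find(notes.begin(), notes.end(), note) != notes.end()) {
                filtered[i] = true;
            }
        }
    }

    // Later stages iterate the configs and skip the deselected ones.
    for (std::size_t i = 0; i < filtered.size(); ++i) {
        std::cout << "config " << i << (filtered[i] ? ": skipped\n" : ": eligible\n");
    }
    return 0;
}
```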
- // TODO: Uncomment after https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=4299195&cmtNo= - // if(cudnnGetVersion() < 9100) - { execution_plans.push_back(std::move(plan)); } + for (auto i = 0u; i < engine_configs.size(); i++) { + if (filtered_indices[i]) { + getLogger() << "[cudnn_frontend] INFO: Deselecting execution plan at position " << i << std::endl; + continue; + } + + auto const& config = engine_configs[i]; + auto fe_status = detail::create_cudnn_execution_plan(execution_plans[i], config, operation_tag, handle); + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << i << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + // If a plan is built successfully, set it as a candidate + if (fe_status.is_good()) { + // Filter out execution plans with workspace greater than whats available from user + if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) { + filtered_indices[i] = true; + getLogger() << "[cudnn_frontend] INFO: Deselecting execution plan at position " << i << std::endl; + continue; + } + + candidate = static_cast(i); return {error_code_t::OK, ""}; } } + // No plans were able to be built. Return error. return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "[cudnn_frontend] Error: No execution plans built successfully."}; } + error_t + build_plans(cudnnHandle_t handle, std::string const& json) { + execution_plans.resize(1); + auto const& fe_status = detail::create_cudnn_execution_plan(execution_plans[0], json, handle); + + if (fe_status.is_good()) { + candidate = 0; + } + + return fe_status; + } + + error_t + build_plan_at_index(cudnnHandle_t handle, int64_t index) { + RETURN_CUDNN_FRONTEND_ERROR_IF(filtered_indices[index] == true, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "Chosen plan index has been deselected."); + + if (execution_plans[index] != nullptr && execution_plans[index]->getWorkspaceSize() <= max_workspace_allowed) { + return {error_code_t::OK, ""}; + }; + + auto fe_status = + detail::create_cudnn_execution_plan(execution_plans[index], engine_configs[index], operation_tag, handle); + + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << index << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + // Sets candidate in case user does not call execute with plan_index later. + if (fe_status.is_good()) { + if (execution_plans[index]->getWorkspaceSize() <= max_workspace_allowed) { + candidate = index; + } else { + filtered_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Workspace size is too large."}; + } + } + + return fe_status; + } + error_t build_plans(cudnnHandle_t handle, BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { RETURN_CUDNN_FRONTEND_ERROR_IF(do_multithreaded_builds, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, "Doing multithreaded builds is not yet supported."); - auto const& configs = get_filtered_engine_configs(); - - switch (policy) { - case BuildPlanPolicy_t::HEURISTICS_CHOICE: - // short circuit in case a plan was already created. - // This happens as check_support for v8 builds a plan. - // Should not happen in v9. - // TODO: Uncomment after https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=4299195&cmtNo= - // if(cudnnGetVersion() < 9100) - { - if (execution_plans.size() > 0) { - return {error_code_t::OK, ""}; - } - } + // short circuit in case a plan was already created. 
+ // This happens as check_support for v8 builds a plan. + if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE && candidate != -1) { + return {error_code_t::OK, ""}; + } - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); + for (auto i = 0u; i < engine_configs.size(); i++) { + if (filtered_indices[i]) { + getLogger() << "[cudnn_frontend] INFO: Skipping deselected engine plan at index " << i << std::endl; + continue; + } - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - execution_plans.push_back(std::move(plan)); - break; - } + auto fe_status = + detail::create_cudnn_execution_plan(execution_plans[i], engine_configs[i], operation_tag, handle); + getLogger() << "[cudnn_frontend] INFO: Building plan at index " << i << " gave " << fe_status.get_code() + << " with message: " << fe_status.get_message() << std::endl; + + if (fe_status.is_good()) { + if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) { + filtered_indices[i] = true; + continue; } - break; - case BuildPlanPolicy_t::ALL: - for (auto const& config : configs) { - std::shared_ptr plan; - auto const& fe_status = detail::create_cudnn_execution_plan(plan, config, operation_tag, handle); - - if (fe_status.is_good() && plan->getWorkspaceSize() <= max_workspace_allowed) { - execution_plans.push_back(std::move(plan)); - } + // Only set the candidate the first time, as the order of iteration is from highest to lowest priority + if (candidate == -1) { + candidate = static_cast(i); } - break; - } - RETURN_CUDNN_FRONTEND_ERROR_IF(execution_plans.empty(), - error_code_t::GRAPH_NOT_SUPPORTED, - "No execution plans finalized successfully. Hence, not supported."); + // Return from this function as first successfully built plan is found. + if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE) { + return {error_code_t::OK, ""}; + } + } + } return {error_code_t::OK, ""}; } @@ -328,34 +504,20 @@ class Execution_plan_list { return max_size; } - std::shared_ptr - get_best_candidate() const { - if (execution_plans.empty()) return nullptr; - return execution_plans.front(); - } - static error_t autotune_default_impl(std::vector>& execution_plans, cudnnHandle_t handle, - std::unordered_map, void*> variants, - void* workspace, + std::unordered_map const& tensor_to_pointer_map, + void* workspace_ptr, void*) { // Create the variant pack for all the plans to use. 
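The rewritten `build_plans` walks the heuristics-ordered configs, skips deselected entries, records the first successful build as `candidate`, and returns early under HEURISTICS_CHOICE while continuing under ALL. A stripped-down sketch of that control flow (the policy enum and the per-config build outcomes are mocked):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

enum class Policy { HEURISTICS_CHOICE, ALL };

// Mocked build outcome: true means the plan built and fits the workspace limit.
static bool mock_build(std::size_t index) { return index != 0; }  // pretend config 0 fails

static int64_t build_plans(Policy policy, std::vector<bool>& deselected, std::vector<bool>& built) {
    int64_t candidate = -1;
    for (std::size_t i = 0; i < deselected.size(); ++i) {
        if (deselected[i]) continue;   // user filtered this config out
        if (!mock_build(i)) continue;  // build failed or workspace too large
        built[i] = true;
        if (candidate == -1) candidate = static_cast<int64_t>(i);  // heuristics order: first success wins
        if (policy == Policy::HEURISTICS_CHOICE) break;            // stop at the first usable plan
    }
    return candidate;  // under ALL, every remaining config is built before returning
}

int main() {
    std::vector<bool> deselected = {false, false, false, true};  // config 3 was deselected earlier
    std::vector<bool> built(deselected.size(), false);

    int64_t candidate = build_plans(Policy::HEURISTICS_CHOICE, deselected, built);
    std::cout << "candidate index: " << candidate << "\n";  // prints 1
    return 0;
}
```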
std::vector uids; std::vector ptrs; - for (auto it : variants) { - if (it.first != nullptr) { - uids.push_back(it.first->get_uid()); - ptrs.push_back(it.second); - } + for (auto it : tensor_to_pointer_map) { + uids.push_back(it.first); + ptrs.push_back(it.second); } - auto variantPack = VariantPackBuilder() - .setDataPointers(ptrs.size(), ptrs.data()) - .setUids(uids.size(), uids.data()) - .setWorkspacePointer(workspace) - .build(); - std::vector> time_sorted_plans; auto plan_cmp = [](std::shared_ptr a, std::shared_ptr b) { @@ -381,19 +543,14 @@ class Execution_plan_list { float min_time_ms = std::numeric_limits::max(); // Warm-up run - auto warmup_status = cudnnBackendExecute(handle, plan->get_raw_desc(), variantPack.get_raw_desc()); - if (warmup_status != CUDNN_STATUS_SUCCESS) { - getLogger() << "[cudnn_frontend] Plan " << plan->getTag() << " failed with " << to_string(warmup_status) - << std::endl; - continue; - } + CHECK_CUDNN_FRONTEND_ERROR(detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr)); successful_plan_count++; cudaDeviceSynchronize(); for (int i = 0; i < maxIterCount; i++) { cudaEventRecord(start, stream); - cudnnBackendExecute(handle, plan->get_raw_desc(), variantPack.get_raw_desc()); + auto status = detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr); cudaEventRecord(stop, stream); cudaEventSynchronize(stop); @@ -427,17 +584,17 @@ class Execution_plan_list { std::function>&, cudnnHandle_t, - std::unordered_map, void*>, + std::unordered_map const&, void*, void*)> autotune_impl = &Execution_plan_list::autotune_default_impl; error_t autotune(cudnnHandle_t handle, - std::unordered_map, void*> variants, + std::unordered_map const& tensor_to_pointer_map, void* workspace, void* user_impl = nullptr) { - auto error = autotune_impl(execution_plans, handle, variants, workspace, user_impl); + auto error = autotune_impl(execution_plans, handle, tensor_to_pointer_map, workspace, user_impl); return error; } }; diff --git a/include/cudnn_frontend/utils/serialize.h b/include/cudnn_frontend/utils/serialize.h new file mode 100644 index 00000000..2a59f42f --- /dev/null +++ b/include/cudnn_frontend/utils/serialize.h @@ -0,0 +1,328 @@ +#pragma once + +#include "../graph_properties.h" +#include "../graph_helpers.h" + +namespace cudnn_frontend::graph { + +NLOHMANN_JSON_SERIALIZE_ENUM(BN_finalize_attributes::input_names, + { + {BN_finalize_attributes::input_names::SUM, "SUM"}, + {BN_finalize_attributes::input_names::SQ_SUM, "SQ_SUM"}, + {BN_finalize_attributes::input_names::SCALE, "SCALE"}, + {BN_finalize_attributes::input_names::BIAS, "BIAS"}, + {BN_finalize_attributes::input_names::EPSILON, "EPSILON"}, + {BN_finalize_attributes::input_names::ACCUM_COUNT, "ACCUM_COUNT"}, + {BN_finalize_attributes::input_names::PREV_RUNNING_MEAN, "PREV_RUNNING_MEAN"}, + {BN_finalize_attributes::input_names::PREV_RUNNING_VAR, "PREV_RUNNING_VAR"}, + {BN_finalize_attributes::input_names::MOMENTUM, "MOMENTUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(BN_finalize_attributes::output_names, + { + {BN_finalize_attributes::output_names::EQ_SCALE, "EQ_SCALE"}, + {BN_finalize_attributes::output_names::EQ_BIAS, "EQ_BIAS"}, + {BN_finalize_attributes::output_names::MEAN, "MEAN"}, + {BN_finalize_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + {BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN, "NEXT_RUNNING_MEAN"}, + {BN_finalize_attributes::output_names::NEXT_RUNNING_VAR, "NEXT_RUNNING_VAR"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_attributes::input_names, + { + 
{Batchnorm_attributes::input_names::X, "X"}, + {Batchnorm_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_attributes::input_names::BIAS, "BIAS"}, + {Batchnorm_attributes::input_names::EPSILON, "EPSILON"}, + {Batchnorm_attributes::input_names::PREV_RUNNING_MEAN, "PREV_RUNNING_MEAN"}, + {Batchnorm_attributes::input_names::PREV_RUNNING_VAR, "PREV_RUNNING_VAR"}, + {Batchnorm_attributes::input_names::MOMENTUM, "MOMENTUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_attributes::output_names, + { + {Batchnorm_attributes::output_names::Y, "Y"}, + {Batchnorm_attributes::output_names::MEAN, "MEAN"}, + {Batchnorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + {Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN, "NEXT_RUNNING_MEAN"}, + {Batchnorm_attributes::output_names::NEXT_RUNNING_VAR, "NEXT_RUNNING_VAR"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_backward_attributes::input_names, + { + {Batchnorm_backward_attributes::input_names::DY, "DY"}, + {Batchnorm_backward_attributes::input_names::X, "X"}, + {Batchnorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Batchnorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_backward_attributes::output_names, + { + {Batchnorm_backward_attributes::output_names::DX, "DX"}, + {Batchnorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Batchnorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_inference_attributes::input_names, + { + {Batchnorm_inference_attributes::input_names::X, "X"}, + {Batchnorm_inference_attributes::input_names::SCALE, "SCALE"}, + {Batchnorm_inference_attributes::input_names::BIAS, "BIAS"}, + {Batchnorm_inference_attributes::input_names::MEAN, "MEAN"}, + {Batchnorm_inference_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Batchnorm_inference_attributes::output_names, + {{Batchnorm_inference_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_dgrad_attributes::input_names, + { + {Conv_dgrad_attributes::input_names::W, "W"}, + {Conv_dgrad_attributes::input_names::DY, "DY"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_dgrad_attributes::output_names, + { + {Conv_dgrad_attributes::output_names::DX, "DX"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_fprop_attributes::input_names, + { + {Conv_fprop_attributes::input_names::X, "X"}, + {Conv_fprop_attributes::input_names::W, "W"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_fprop_attributes::output_names, + { + {Conv_fprop_attributes::output_names::Y, "Y"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_wgrad_attributes::input_names, + { + {Conv_wgrad_attributes::input_names::DY, "DY"}, + {Conv_wgrad_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Conv_wgrad_attributes::output_names, + { + {Conv_wgrad_attributes::output_names::DW, "DW"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(DBN_weight_attributes::input_names, + { + {DBN_weight_attributes::input_names::DY, "DY"}, + {DBN_weight_attributes::input_names::X, "X"}, + {DBN_weight_attributes::input_names::SCALE, "SCALE"}, + {DBN_weight_attributes::input_names::MEAN, "MEAN"}, + {DBN_weight_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(DBN_weight_attributes::output_names, + { + {DBN_weight_attributes::output_names::DSCALE, "DSCALE"}, + {DBN_weight_attributes::output_names::DBIAS, "DBIAS"}, + 
{DBN_weight_attributes::output_names::EQ_BIAS, "EQ_BIAS"}, + {DBN_weight_attributes::output_names::EQ_SCALE_DY, "EQ_SCALE_DY"}, + {DBN_weight_attributes::output_names::EQ_SCALE_X, "EQ_SCALE_X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Genstats_attributes::input_names, + { + {Genstats_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Genstats_attributes::output_names, + { + {Genstats_attributes::output_names::SUM, "SUM"}, + {Genstats_attributes::output_names::SQ_SUM, "SQ_SUM"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_attributes::input_names, + { + {Instancenorm_attributes::input_names::X, "X"}, + {Instancenorm_attributes::input_names::SCALE, "SCALE"}, + {Instancenorm_attributes::input_names::BIAS, "BIAS"}, + {Instancenorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_attributes::output_names, + { + {Instancenorm_attributes::output_names::Y, "Y"}, + {Instancenorm_attributes::output_names::MEAN, "MEAN"}, + {Instancenorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_backward_attributes::input_names, + { + {Instancenorm_backward_attributes::input_names::DY, "DY"}, + {Instancenorm_backward_attributes::input_names::X, "X"}, + {Instancenorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Instancenorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Instancenorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Instancenorm_backward_attributes::output_names, + { + {Instancenorm_backward_attributes::output_names::DX, "DX"}, + {Instancenorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Instancenorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_attributes::input_names, + { + {Layernorm_attributes::input_names::X, "X"}, + {Layernorm_attributes::input_names::SCALE, "SCALE"}, + {Layernorm_attributes::input_names::BIAS, "BIAS"}, + {Layernorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_attributes::output_names, + { + {Layernorm_attributes::output_names::Y, "Y"}, + {Layernorm_attributes::output_names::MEAN, "MEAN"}, + {Layernorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_backward_attributes::input_names, + { + {Layernorm_backward_attributes::input_names::DY, "DY"}, + {Layernorm_backward_attributes::input_names::X, "X"}, + {Layernorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Layernorm_backward_attributes::input_names::MEAN, "MEAN"}, + {Layernorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Layernorm_backward_attributes::output_names, + { + {Layernorm_backward_attributes::output_names::DX, "DX"}, + {Layernorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Layernorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Matmul_attributes::input_names, + { + {Matmul_attributes::input_names::A, "A"}, + {Matmul_attributes::input_names::B, "B"}, + {Matmul_attributes::input_names::M_override, "M_override"}, + {Matmul_attributes::input_names::N_override, "N_override"}, + {Matmul_attributes::input_names::K_override, "K_override"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Matmul_attributes::output_names, + { + {Matmul_attributes::output_names::C, "C"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Pointwise_attributes::input_names, + { + 
{Pointwise_attributes::input_names::IN_0, "IN_0"}, + {Pointwise_attributes::input_names::IN_1, "IN_1"}, + {Pointwise_attributes::input_names::IN_2, "IN_2"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Pointwise_attributes::output_names, + { + {Pointwise_attributes::output_names::OUT_0, "OUT_0"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reduction_attributes::input_names, + { + {Reduction_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reduction_attributes::output_names, {{Reduction_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reshape_attributes::input_names, + { + {Reshape_attributes::input_names::X, "X"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Reshape_attributes::output_names, {{Reshape_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_attributes::input_names, + { + {Rmsnorm_attributes::input_names::X, "X"}, + {Rmsnorm_attributes::input_names::SCALE, "SCALE"}, + {Rmsnorm_attributes::input_names::BIAS, "BIAS"}, + {Rmsnorm_attributes::input_names::EPSILON, "EPSILON"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_attributes::output_names, + { + {Rmsnorm_attributes::output_names::Y, "Y"}, + {Rmsnorm_attributes::output_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_backward_attributes::input_names, + { + {Rmsnorm_backward_attributes::input_names::DY, "DY"}, + {Rmsnorm_backward_attributes::input_names::X, "X"}, + {Rmsnorm_backward_attributes::input_names::SCALE, "SCALE"}, + {Rmsnorm_backward_attributes::input_names::INV_VARIANCE, "INV_VARIANCE"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rmsnorm_backward_attributes::output_names, + { + {Rmsnorm_backward_attributes::output_names::DX, "DX"}, + {Rmsnorm_backward_attributes::output_names::DSCALE, "DSCALE"}, + {Rmsnorm_backward_attributes::output_names::DBIAS, "DBIAS"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rng_attributes::input_names, + { + {Rng_attributes::input_names::Seed, "Seed"}, + {Rng_attributes::input_names::Offset, "Offset"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(Rng_attributes::output_names, {{Rng_attributes::output_names::Y, "Y"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_attributes::input_names, + { + {SDPA_attributes::input_names::Q, "Q"}, + {SDPA_attributes::input_names::K, "K"}, + {SDPA_attributes::input_names::V, "V"}, + {SDPA_attributes::input_names::Attn_scale, "Attn_scale"}, + {SDPA_attributes::input_names::Bias, "Bias"}, + {SDPA_attributes::input_names::SEQ_LEN_Q, "SEQ_LEN_Q"}, + {SDPA_attributes::input_names::SEQ_LEN_KV, "SEQ_LEN_KV"}, + {SDPA_attributes::input_names::Seed, "Seed"}, + {SDPA_attributes::input_names::Offset, "Offset"}, + {SDPA_attributes::input_names::Dropout_mask, "Dropout_mask"}, + {SDPA_attributes::input_names::Dropout_scale, "Dropout_scale"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_attributes::output_names, + {{SDPA_attributes::output_names::O, "O"}, + {SDPA_attributes::output_names::Stats, "Stats"}, + {SDPA_attributes::output_names::RNG_DUMP, "RNG_DUMP"}}) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_backward_attributes::input_names, + { + {SDPA_backward_attributes::input_names::Q, "Q"}, + {SDPA_backward_attributes::input_names::K, "K"}, + {SDPA_backward_attributes::input_names::V, "V"}, + {SDPA_backward_attributes::input_names::O, "O"}, + {SDPA_backward_attributes::input_names::dO, "dO"}, + {SDPA_backward_attributes::input_names::Stats, "Stats"}, + {SDPA_backward_attributes::input_names::Attn_scale, "Attn_scale"}, + {SDPA_backward_attributes::input_names::Bias, "Bias"}, + {SDPA_backward_attributes::input_names::SEQ_LEN_Q, 
"SEQ_LEN_Q"}, + {SDPA_backward_attributes::input_names::SEQ_LEN_KV, "SEQ_LEN_KV"}, + {SDPA_backward_attributes::input_names::Seed, "Seed"}, + {SDPA_backward_attributes::input_names::Offset, "Offset"}, + {SDPA_backward_attributes::input_names::Dropout_mask, "Dropout_mask"}, + {SDPA_backward_attributes::input_names::Dropout_scale, "Dropout_scale"}, + {SDPA_backward_attributes::input_names::Dropout_scale_inv, "Dropout_scale_inv"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(SDPA_backward_attributes::output_names, + { + {SDPA_backward_attributes::output_names::dQ, "dQ"}, + {SDPA_backward_attributes::output_names::dK, "dK"}, + {SDPA_backward_attributes::output_names::dV, "dV"}, + {SDPA_backward_attributes::output_names::dBias, "dBias"}, + {SDPA_backward_attributes::output_names::RNG_DUMP, "RNG_DUMP"}, + }) + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/include/cudnn_frontend_Heuristics.h b/include/cudnn_frontend_Heuristics.h index 187003a2..e404951b 100644 --- a/include/cudnn_frontend_Heuristics.h +++ b/include/cudnn_frontend_Heuristics.h @@ -96,10 +96,11 @@ class EngineHeuristics_v8 : public BackendDescriptor { count, &result, heuristic_results_.data()); - if (status != CUDNN_STATUS_SUCCESS) { + if (status != CUDNN_STATUS_SUCCESS || result < 1) { set_error_and_throw_exception( this, status, "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: GetAttribute CUDNN_ATTR_ENGINEHEUR_RESULTS Failed"); }; + m_heuristic_results.resize(result); return m_heuristic_results; } diff --git a/include/cudnn_frontend_utils.h b/include/cudnn_frontend_utils.h index b0bacfc3..5454a71a 100644 --- a/include/cudnn_frontend_utils.h +++ b/include/cudnn_frontend_utils.h @@ -84,7 +84,6 @@ namespace cudnn_frontend { /// Detailed feature_vector. Generally the Tensor and Operation properties using feature_vector_t = std::vector; -#ifndef NV_CUDNN_DISABLE_EXCEPTION class cudnnException : public std::runtime_error { public: cudnnException(const char* message, cudnnStatus_t status) throw() : std::runtime_error(message) { @@ -101,7 +100,6 @@ class cudnnException : public std::runtime_error { cudnnStatus_t error_status; }; -#endif static inline bool AllowAll(cudnnBackendDescriptor_t engine_config) { @@ -109,28 +107,14 @@ AllowAll(cudnnBackendDescriptor_t engine_config) { return false; } -static inline void -throw_if(std::function expr, const char* message, cudnnStatus_t status) { - if (expr()) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION - throw cudnnException(message, status); -#endif - } -} -static inline void -throw_if(bool expr, const char* message, cudnnStatus_t status) { - if (expr) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION - throw cudnnException(message, status); -#endif - } -} - static inline std::string to_string(cudnnStatus_t const status) { return cudnnGetErrorString(status); } +#ifndef NV_CUDNN_DISABLE_EXCEPTION +[[noreturn]] +#endif static inline void set_error_and_throw_exception(BackendDescriptor const* desc, cudnnStatus_t status, const char* message) { if (desc != nullptr) { @@ -620,6 +604,7 @@ get_pointwise_mode_port_count(PointwiseMode_t const& mode) { case PointwiseMode_t::LOGICAL_OR: case PointwiseMode_t::MIN: case PointwiseMode_t::MAX: + case PointwiseMode_t::MOD: case PointwiseMode_t::RELU_BWD: case PointwiseMode_t::TANH_BWD: case PointwiseMode_t::SIGMOID_BWD: @@ -642,7 +627,6 @@ get_pointwise_mode_port_count(PointwiseMode_t const& mode) { case PointwiseMode_t::EXP: case PointwiseMode_t::LOG: case PointwiseMode_t::NEG: - case PointwiseMode_t::MOD: case PointwiseMode_t::ABS: case PointwiseMode_t::CEIL: case 
PointwiseMode_t::FLOOR: @@ -785,7 +769,7 @@ get_abili_slope(int64_t const n_heads) { #pragma warning(push) #pragma warning(disable : 4244) // this could be ommited with c++17 and contexpr #endif - int n = 1 << static_cast(log2f(n_heads)); + int n = 1 << static_cast(log2(static_cast(n_heads))); #ifdef _MSC_VER #pragma warning(pop) #endif @@ -794,12 +778,12 @@ get_abili_slope(int64_t const n_heads) { } for (int i = 0; i < 2 * (n_heads - n); i += 2) { - slope.push_back((float)(i + 1.0f) * 0.5f); + slope.push_back(static_cast(i + 1) * 0.5f); } for (float& elem : slope) { - elem *= -8.0; - elem /= n; + elem *= -8.0f; + elem /= static_cast(n); elem = powf(2.0, elem); } diff --git a/python_bindings/properties.cpp b/python_bindings/properties.cpp index de2e9b4d..5fd700bb 100644 --- a/python_bindings/properties.cpp +++ b/python_bindings/properties.cpp @@ -90,6 +90,7 @@ init_properties(py::module_& m) { .def("set_is_pass_by_value", &cudnn_frontend::graph::Tensor_attributes::set_is_pass_by_value) .def("get_uid", &cudnn_frontend::graph::Tensor_attributes::get_uid) .def("set_uid", &cudnn_frontend::graph::Tensor_attributes::set_uid) + .def("set_ragged_offset", &cudnn_frontend::graph::Tensor_attributes::set_ragged_offset) .def("__repr__", [](cudnn_frontend::graph::Tensor_attributes const& props) { std::ostringstream out; out << json{props}; diff --git a/python_bindings/pygraph/pointwise.cpp b/python_bindings/pygraph/pointwise.cpp index d5e8b486..5dabc23a 100644 --- a/python_bindings/pygraph/pointwise.cpp +++ b/python_bindings/pygraph/pointwise.cpp @@ -301,7 +301,7 @@ init_pygraph_pointwise_submodule(py::class_& m) { )pbdoc"); m.def("tanh", &PyGraph::pointwise_unary, - py::arg("input0"), + py::arg("input"), py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v("name", ""), R"pbdoc( diff --git a/python_bindings/pygraph/pygraph.cpp b/python_bindings/pygraph/pygraph.cpp index 015d15ef..2c516b78 100644 --- a/python_bindings/pygraph/pygraph.cpp +++ b/python_bindings/pygraph/pygraph.cpp @@ -107,6 +107,7 @@ PyGraph::tensor(std::vector const& dim, cudnn_frontend::DataType_t const& data_type, bool const& is_virtual, bool const& is_pass_by_value, + std::shared_ptr const& ragged_offset, std::string const& name) { auto props = cudnn_frontend::graph::Tensor_attributes() .set_data_type(data_type) @@ -114,6 +115,7 @@ PyGraph::tensor(std::vector const& dim, .set_is_pass_by_value(is_pass_by_value) .set_dim(dim) .set_stride(stride) + .set_ragged_offset(ragged_offset) .set_name(name); return graph.tensor(props); @@ -168,13 +170,15 @@ PyGraph::tensor_like(py::object const& pyobj) { std::shared_ptr PyGraph::conv_fprop(std::shared_ptr& image, std::shared_ptr& weight, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& name) { auto attributes = cudnn_frontend::graph::Conv_fprop_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -187,13 +191,15 @@ PyGraph::conv_fprop(std::shared_ptr& i std::shared_ptr PyGraph::conv_dgrad(std::shared_ptr& loss, std::shared_ptr& filter, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& 
name) { auto attributes = cudnn_frontend::graph::Conv_dgrad_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -205,13 +211,15 @@ PyGraph::conv_dgrad(std::shared_ptr& l std::shared_ptr PyGraph::conv_wgrad(std::shared_ptr& image, std::shared_ptr& loss, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, std::string const& name) { auto attributes = cudnn_frontend::graph::Conv_wgrad_attributes() - .set_padding(padding) + .set_pre_padding(pre_padding) + .set_post_padding(post_padding) .set_stride(stride) .set_dilation(dilation) .set_compute_data_type(compute_data_type) @@ -331,6 +339,22 @@ PyGraph::execute(std::unordered_map var_pack, py::object workspace) { + std::unordered_map var_pack_; + for (auto const& [uid, pyobject] : var_pack) { + var_pack_.emplace(uid, extract_data_pointer(pyobject)); + } + + void* workspace_ptr = extract_data_pointer(workspace); + + // TODO: Probably concatenate in a macro? + auto status = graph.execute(handle, var_pack_, workspace_ptr); + throw_if(status.is_bad(), status.get_code(), status.get_message()); + + return; +} + std::vector default_vector(void) { return {}; @@ -363,6 +387,7 @@ init_pygraph_submodule(py::module_& m) { py::arg_v("data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v{"is_virtual", false}, py::arg_v{"is_pass_by_value", false}, + py::arg_v{"ragged_offset", nullptr}, py::arg_v("name", ""), R"pbdoc( Create a tensor. @@ -373,6 +398,7 @@ init_pygraph_submodule(py::module_& m) { data_type (cudnn.data_type): The data type of the tensor. Default is cudnn.data_type.NOT_SET. is_virtual (bool): Flag indicating if the tensor is virtual. Default is False. is_pass_by_value (bool): Flag indicating if the tensor is passed by value. Default is False. + ragged_offset (cudnn_tensor): The ragged offset tensor. Default is nullptr. name (Optional[str]): The name of the tensor. Returns: @@ -383,11 +409,31 @@ init_pygraph_submodule(py::module_& m) { py::arg("input"), py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), py::arg_v("name", "")) + .def( + "conv_fprop", + [](PyGraph& self, + std::shared_ptr& image, + std::shared_ptr& weight, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_fprop(image, weight, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("image"), + py::arg("weight"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_fprop", &PyGraph::conv_fprop, py::arg("image"), py::arg("weight"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -398,7 +444,8 @@ init_pygraph_submodule(py::module_& m) { Args: image (cudnn_tensor): The image tensor. weight (cudnn_tensor): The weight tensor. - padding (Optional[List[int]]): The padding values for the operation. 
Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[cudnn.data_type]): The data type for computation. Default is NOT_SET. @@ -407,11 +454,31 @@ init_pygraph_submodule(py::module_& m) { Returns: cudnn_tensor: The created tensor. )pbdoc") + .def( + "conv_wgrad", + [](PyGraph& self, + std::shared_ptr& image, + std::shared_ptr& loss, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_wgrad(image, loss, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("image"), + py::arg("loss"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_wgrad", &PyGraph::conv_wgrad, py::arg("image"), py::arg("loss"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -422,8 +489,8 @@ init_pygraph_submodule(py::module_& m) { Args: image (cudnn_tensor): The image tensor. loss (cudnn_tensor): The loss tensor. - padding (Optional[List[int]]): The padding values for the operation. Default is an empty list. - stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[cudnn.data_type]): The data type for computation. Default is NOT_SET. name (Optional[str]): A name for the operation to be performed. @@ -431,11 +498,31 @@ init_pygraph_submodule(py::module_& m) { Returns: cudnn_tensor: The created tensor. 
)pbdoc") + .def( + "conv_dgrad", + [](PyGraph& self, + std::shared_ptr& loss, + std::shared_ptr& filter, + std::vector const& padding, + std::vector const& stride, + std::vector const& dilation, + cudnn_frontend::DataType_t const& compute_data_type, + std::string const& name) { + return self.conv_dgrad(loss, filter, padding, padding, stride, dilation, compute_data_type, name); + }, + py::arg("loss"), + py::arg("filter"), + py::arg_v{"padding", default_vector()}, + py::arg_v{"stride", default_vector()}, + py::arg_v{"dilation", default_vector()}, + py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), + py::arg_v("name", "")) .def("conv_dgrad", &PyGraph::conv_dgrad, py::arg("loss"), py::arg("filter"), - py::arg_v{"padding", default_vector()}, + py::arg_v{"pre_padding", default_vector()}, + py::arg_v{"post_padding", default_vector()}, py::arg_v{"stride", default_vector()}, py::arg_v{"dilation", default_vector()}, py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET), @@ -446,7 +533,8 @@ init_pygraph_submodule(py::module_& m) { Args: loss (cudnn_tensor): The loss tensor. filter (cudnn_tensor): The filter tensor. - padding (Optional[List[int]]): The padding values for the operation. Default is an empty list. + pre_padding (Optional[List[int]]): The pre padding values for the operation. Default is an empty list. + post_padding (Optional[List[int]]): The post padding values for the operation. Default is an empty list. stride (Optional[List[int]]): The stride values for the operation. Default is an empty list. dilation (Optional[List[int]]): The dilation values for the operation. Default is an empty list. compute_data_type (Optional[pycudnn.data_type]): The data type for computation. Default is NOT_SET. @@ -505,7 +593,13 @@ init_pygraph_submodule(py::module_& m) { py::arg("policy") = cudnn_frontend::BuildPlanPolicy_t::HEURISTICS_CHOICE) .def("build", &PyGraph::build) .def("get_workspace_size", &PyGraph::get_workspace_size) - .def("execute", &PyGraph::execute) + .def( + "execute", + static_cast, py::object>, py::object)>( + &PyGraph::execute)) + .def("execute", + static_cast, py::object)>(&PyGraph::execute)) .def("__repr__", [](PyGraph const& pygraph) { std::stringstream ss; json j = pygraph.graph; diff --git a/python_bindings/pygraph/pygraph.h b/python_bindings/pygraph/pygraph.h index cd5dc6ea..3d2dd86e 100644 --- a/python_bindings/pygraph/pygraph.h +++ b/python_bindings/pygraph/pygraph.h @@ -71,6 +71,7 @@ class PyGraph { cudnn_frontend::DataType_t const& data_type, bool const& is_virtual, bool const& is_pass_by_value, + std::shared_ptr const& ragged_offset, std::string const& name); std::shared_ptr @@ -131,7 +132,8 @@ class PyGraph { std::shared_ptr conv_fprop(std::shared_ptr& image, std::shared_ptr& weight, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -140,7 +142,8 @@ class PyGraph { std::shared_ptr conv_dgrad(std::shared_ptr& loss, std::shared_ptr& filter, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -149,7 +152,8 @@ class PyGraph { std::shared_ptr conv_wgrad(std::shared_ptr& image, std::shared_ptr& loss, - std::vector const& padding, + std::vector const& pre_padding, + std::vector const& post_padding, std::vector const& stride, std::vector 
const& dilation, cudnn_frontend::DataType_t const& compute_data_type, @@ -306,6 +310,9 @@ class PyGraph { execute(std::unordered_map, py::object> var_pack, py::object workspace); + void + execute(std::unordered_map var_pack, py::object workspace); + void deselect_numeric_notes(std::vector const& notes) { graph.deselect_numeric_notes(notes); diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 990d8fbb..95fbaa77 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -21,6 +21,8 @@ add_executable( cpp/layernorm.cpp cpp/rmsnorm.cpp cpp/wgrads.cpp + cpp/serialization.cpp + cpp/pointwise.cpp legacy_samples/conv_sample.cpp legacy_samples/resnet_test_list.cpp diff --git a/samples/README.md b/samples/README.md index a5821258..71deae82 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,4 +1,10 @@ -This directory contains several samples for you to see how we envision using the CUDNN Frontend API. +# FE - Programming Samples +## Python Interface Samples +Samples leveraging FE's Python interface are located in [samples/python](/samples/python/). -For questions or to provide feedback, please contact cuDNN@nvidia.com. +## C++ Interface Samples +Samples leveraging FE's C++ interface are located in [samples/cpp](/samples/cpp/). + +## [Deprecated] C++ v0.x Interface Samples +Samples leveraging FE's C++ 0.x interface are located in [samples/legacy_samples](/samples/legacy_samples/). diff --git a/samples/cpp/convolutions.cpp b/samples/cpp/convolutions.cpp index 514da386..51fc5816 100644 --- a/samples/cpp/convolutions.cpp +++ b/samples/cpp/convolutions.cpp @@ -28,7 +28,11 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { namespace fe = cudnn_frontend; - int64_t n = 16, c = 128, h = 56, w = 56, k = 256, r = 3, s = 3; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by currend cudnn version"); + } + + int64_t n = 16, c = 128, h = 64, w = 64, k = 256, r = 1, s = 1; auto build_new_graph = [=](cudnnHandle_t handle) { auto graph = std::make_shared(); @@ -44,8 +48,10 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { .set_dim({k, c, r, s}) .set_stride({c * r * s, 1, c * s, c})); - auto conv_options = - fe::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}); + auto conv_options = fe::graph::Conv_fprop_attributes() + .set_padding({0, 0}) + .set_stride({1, 1}) + .set_dilation({1, 1}); auto Y = graph->conv_fprop(X, W, conv_options); Y->set_output(true); @@ -73,10 +79,13 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") { Surface w_tensor(k * c * r * s, false); Surface y_tensor(n * k * h * w, false); // Should be p, q. 
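+        // The variant pack below is keyed by tensor UID (int64_t) rather than by the Tensor_attributes shared_ptr handles used in the other C++ samples; both forms of Graph::execute are exercised in this change.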
- std::unordered_map, void*> variant_pack = { - {X, x_tensor.devPtr}, {W, w_tensor.devPtr}, {Y, y_tensor.devPtr}}; + std::unordered_map variant_pack = { + {X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}}; Surface workspace(graph->get_workspace_size(), false); + + std::cout << *graph << std::endl; + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); } @@ -358,4 +367,4 @@ TEST_CASE("Conv with Int8 datatypes", "[conv][graph][caching]") { Surface workspace(graph->get_workspace_size(), false); REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/dgrads.cpp b/samples/cpp/dgrads.cpp index b3e66514..36a3654c 100644 --- a/samples/cpp/dgrads.cpp +++ b/samples/cpp/dgrads.cpp @@ -27,6 +27,9 @@ TEST_CASE("Convolution Dgrad", "[dgrad][graph]") { namespace fe = cudnn_frontend; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by current cudnn version"); + } fe::graph::Graph graph; graph.set_io_data_type(fe::DataType_t::HALF) .set_intermediate_data_type(fe::DataType_t::FLOAT) @@ -241,4 +244,4 @@ TEST_CASE("Dgrad Drelu DBNweight Graph", "[dgrad][graph]") { {drelu_output, drelu_output_tensor.devPtr}}; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/matmuls.cpp b/samples/cpp/matmuls.cpp index c0380b60..3c2cdf78 100644 --- a/samples/cpp/matmuls.cpp +++ b/samples/cpp/matmuls.cpp @@ -21,11 +21,17 @@ */ #include + +#include + #include "../utils/helpers.h" #include TEST_CASE("Matmul", "[matmul][graph]") { + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by current cudnn version"); + } namespace fe = cudnn_frontend; // matmul problem size @@ -69,6 +75,8 @@ TEST_CASE("Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good()); REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.check_support(handle).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); // Run cudnn graph @@ -80,6 +88,104 @@ TEST_CASE("Matmul", "[matmul][graph]") { checkCudnnErr(cudnnDestroy(handle)); } +TEST_CASE("Matmul fp8 precision", "[matmul][graph]") { + if (cudnnGetCudartVersion() < 12000) { + SKIP("Test requires cuda toolkit 12.0 or above"); + } + + if ((is_hopper_arch() && cudnnGetVersion() >= 90000) == false) { + SKIP("FP8 gemm not supported pre-Hopper or pre-cudnn-9.0.0"); + } + + namespace fe = cudnn_frontend; + // matmul problem size + int64_t const b = 16; + int64_t const m = 32; + int64_t const n = 64; + int64_t const k = 128; + + // Initialize input tensors with int8_t as proxy for fp8 + Surface A_gpu(b * m * k, false); + Surface B_gpu(b * k * n, false); + + Surface A_descale_gpu(1, false); + Surface B_descale_gpu(1, false); + + fe::graph::Graph graph{}; + + // Create the two non-virtual input tensors A and B. + // These are read from global memory.
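+    // FP8 recipe used below: A and B are declared as FP8_E4M3, the matmul accumulates in FLOAT, and the two pointwise MUL nodes apply the A/B descale scalars before the final BF16 output is written.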
+ auto A_attributes = fe::graph::Tensor_attributes() + .set_name("A") + .set_dim({b, m, k}) + .set_stride({m * k, k, 1}) + .set_data_type(fe::DataType_t::FP8_E4M3); + auto A = graph.tensor(A_attributes); + + auto B_attributes = fe::graph::Tensor_attributes() + .set_name("B") + .set_dim({b, k, n}) + .set_stride({k * n, 1, k}) + .set_data_type(fe::DataType_t::FP8_E4M3); + auto B = graph.tensor(B_attributes); + + auto A_descale_attributes = + fe::graph::Tensor_attributes().set_name("A").set_dim({1, 1, 1}).set_stride({1, 1, 1}).set_data_type( + fe::DataType_t::FLOAT); + auto B_descale_attributes = + fe::graph::Tensor_attributes().set_name("B").set_dim({1, 1, 1}).set_stride({1, 1, 1}).set_data_type( + fe::DataType_t::FLOAT); + + auto A_descale = graph.tensor(A_descale_attributes); + auto B_descale = graph.tensor(B_descale_attributes); + + auto matmul_attributes = + fe::graph::Matmul_attributes().set_name("GEMM").set_compute_data_type(fe::DataType_t::FLOAT); + auto C = graph.matmul(A, B, matmul_attributes); + C->set_data_type(fe::DataType_t::FLOAT); + + // Add scale_A operation + auto pw_0_attributes = fe::graph::Pointwise_attributes() + .set_name("pw0_Mul") + .set_mode(fe::PointwiseMode_t::MUL) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto C_after_pw_0 = graph.pointwise(C, A_descale, pw_0_attributes); + C_after_pw_0->set_data_type(fe::DataType_t::FLOAT); + + // Add descale_B operation + auto pw_1_attributes = fe::graph::Pointwise_attributes() + .set_name("pw1_Mul") + .set_mode(fe::PointwiseMode_t::MUL) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto C_after_pw_1 = graph.pointwise(C_after_pw_0, B_descale, pw_1_attributes); + C_after_pw_1->set_output(true).set_data_type(fe::DataType_t::BFLOAT16); + + REQUIRE(graph.validate().is_good()); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + + REQUIRE(graph.check_support(handle).is_good()); + + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + + Surface C_gpu(b * m * n, false); + Surface workspace(graph.get_workspace_size(), false); + std::unordered_map, void*> variant_pack = { + {A, A_gpu.devPtr}, + {B, B_gpu.devPtr}, + {C_after_pw_1, C_gpu.devPtr}, + {A_descale, A_descale_gpu.devPtr}, + {B_descale, B_descale_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + checkCudnnErr(cudnnDestroy(handle)); +} + TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { if (cudnnGetCudartVersion() < 12000) { SKIP("Test requires cuda toolkit 12.0 or above"); @@ -140,7 +246,7 @@ TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { if (is_hopper_arch() && cudnnGetVersion() >= 8906) { REQUIRE(graph.check_support(handle).is_good()); } else { - SKIP("int8_bf16 mixe precision gemm not supported pre-Hopper or pre-cudnn-8.9.6"); + SKIP("int8_bf16 mixed precision gemm not supported pre-Hopper or pre-cudnn-8.9.6"); } REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); @@ -151,6 +257,93 @@ TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") { Surface workspace(graph.get_workspace_size(), false); std::unordered_map, void*> variant_pack = { {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C, C_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + 
checkCudnnErr(cudnnDestroy(handle)); +} + +TEST_CASE("Int8 Matmul", "[matmul][graph]") { + if (cudnnGetCudartVersion() < 12000) { + SKIP("Test requires cuda toolkit 12.0 or above"); + } + namespace fe = cudnn_frontend; + + // matmul problem size + int64_t const b = 16; + int64_t const m = 32; + int64_t const n = 64; + int64_t const k = 128; + + // Initialize input tensors + Surface A_gpu(b * m * k, false); + // note: these are int8 tensors; the surface element type is used just for memory allocation + Surface B_gpu(b * k * n, false); + + // Make cudnn graph + fe::graph::Graph graph{}; + + // Create the two non-virtual input tensors A and B. + // These are read from global memory. + auto A_attributes = fe::graph::Tensor_attributes() + .set_name("A") + .set_dim({b, m, k}) + .set_stride({m * k, k, 1}) + .set_data_type(fe::DataType_t::INT8); + auto A = graph.tensor(A_attributes); + auto B_attributes = fe::graph::Tensor_attributes() + .set_name("B") + .set_dim({b, k, n}) + .set_stride({k * n, 1, n}) + .set_data_type(fe::DataType_t::INT8); + auto B = graph.tensor(B_attributes); + + auto Bias_attributes = cudnn_frontend::graph::Tensor_attributes() + .set_name("Bias") + .set_dim({b, m, n}) + .set_data_type(cudnn_frontend::DataType_t::FLOAT) + .set_stride({m * n, n, 1}); + auto Bias = graph.tensor(Bias_attributes); + + // Add MATMUL operation + auto matmul_attributes = cudnn_frontend::graph::Matmul_attributes() + .set_compute_data_type(cudnn_frontend::DataType_t::INT32) + .set_name("GEMM"); + auto C = graph.matmul(A, B, matmul_attributes); + C->set_data_type(cudnn_frontend::DataType_t::FLOAT); + + // Add ADD operation + auto add_attributes = cudnn_frontend::graph::Pointwise_attributes() + .set_name("pw1_add") + .set_mode(cudnn_frontend::PointwiseMode_t::ADD) + .set_compute_data_type(cudnn_frontend::DataType_t::FLOAT); + auto C_after_add = graph.pointwise(C, Bias, add_attributes); + C_after_add->set_output(true).set_data_type(cudnn_frontend::DataType_t::FLOAT); + REQUIRE(graph.validate().is_good()); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + + if (check_device_arch_newer_than("ampere") && cudnnGetVersion() >= 8906) { + REQUIRE(graph.check_support(handle).is_good()); + } else { + SKIP("int8 gemm not supported pre-Ampere or pre-cudnn-8.9.6"); + } + + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + + // Run cudnn graph + // note: the surface element types here are used just for memory allocation + Surface C_gpu(b * m * n, false); + Surface Bias_gpu(b * m * n, false); + Surface workspace(graph.get_workspace_size(), false); + std::unordered_map, void*> variant_pack = { + {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C_after_add, C_gpu.devPtr}, {Bias, Bias_gpu.devPtr}}; + + std::cout << graph.print() << std::endl; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); checkCudnnErr(cudnnDestroy(handle)); } @@ -207,6 +400,8 @@ TEST_CASE("Abs + Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good()); REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.check_support(handle).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); // Run cudnn graph @@ -286,14 +481,44 @@ TEST_CASE("Bias + Matmul", "[matmul][graph]") { REQUIRE(graph.build_operation_graph(handle).is_good());
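+    // The block that follows exercises the per-index plan APIs: get_execution_plan_count() reports how many candidate plans heuristics returned, build_plan_at_index() builds one candidate at a time, and execute_plan_at_index() / get_workspace_size_plan_at_index() run a specific candidate. Candidates that fail to build are expected to fail at execution as well.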
REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); - REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + int64_t plan_count = graph.get_execution_plan_count(); + + std::vector successful_plans; + std::vector unsuccessful_plans; + for (int64_t plan_index = 0; plan_index < plan_count; plan_index++) { + bool did_build_successfully = graph.build_plan_at_index(handle, plan_index).is_good(); + if (did_build_successfully) { + successful_plans.push_back(plan_index); + } else { + unsuccessful_plans.push_back(plan_index); + } + } // Run cudnn graph Surface C_gpu(b * m * n, false); - Surface workspace(graph.get_workspace_size(), false); std::unordered_map, void*> variant_pack = { {A, A_gpu.devPtr}, {B, B_gpu.devPtr}, {C, C_gpu.devPtr}, {Bias, Bias_gpu.devPtr}}; - REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + + // Run an unsuccessful plan and expect an error + std::vector random_unsuccessful; + std::sample(unsuccessful_plans.begin(), + unsuccessful_plans.end(), + std::back_inserter(random_unsuccessful), + 1, + std::mt19937{std::random_device{}()}); + if (random_unsuccessful.size()) { + REQUIRE(graph.execute_plan_at_index(handle, variant_pack, nullptr, random_unsuccessful.front()).is_bad()); + } + + // Run a successful plan and expect success + std::vector random_successful; + std::sample(successful_plans.begin(), + successful_plans.end(), + std::back_inserter(random_successful), + 1, + std::mt19937{std::random_device{}()}); + Surface workspace(graph.get_workspace_size_plan_at_index(random_successful.front()), false); + REQUIRE(graph.execute_plan_at_index(handle, variant_pack, workspace.devPtr, random_successful.front()).is_good()); checkCudnnErr(cudnnDestroy(handle)); } @@ -398,4 +623,4 @@ TEST_CASE("Matmul SBR Graph", "[matmul][graph]") { {O, y_tensor.devPtr}}; REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/cpp/pointwise.cpp b/samples/cpp/pointwise.cpp new file mode 100644 index 00000000..8137bf5a --- /dev/null +++ b/samples/cpp/pointwise.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE.
+ */ + +#include +#include "../utils/helpers.h" + +#include + +TEST_CASE("Reduction", "[reduction]") { + namespace fe = cudnn_frontend; + constexpr int n = 64; + if (cudnnGetVersion() < 8600) { + SKIP("TEST REQUIRES minimum cudnn version 8.6.0"); + } + Surface A_gpu(n * n * n * n, false); + fe::graph::Graph graph{}; + auto A = graph.tensor(fe::graph::Tensor_attributes() + .set_dim({n, n, n, n}) + .set_stride({n * n * n, 1, n * n, n}) + .set_data_type(fe::DataType_t::FLOAT)); + auto C = graph.reduction(A, + fe::graph::Reduction_attributes() + .set_mode(fe::ReductionMode_t::MAX) + .set_compute_data_type(fe::DataType_t::FLOAT)); + C->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({1, 1, 1, 1}); + REQUIRE(graph.validate().is_good()); + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + REQUIRE(graph.build_operation_graph(handle).is_good()); + REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good()); + REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good()); + Surface C_gpu(n * n * n * n, false); + std::unordered_map, void*> variant_pack = {{A, A_gpu.devPtr}, + {C, C_gpu.devPtr}}; + Surface workspace(graph.get_workspace_size(), false); + REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); + checkCudnnErr(cudnnDestroy(handle)); +} diff --git a/samples/cpp/serialization.cpp b/samples/cpp/serialization.cpp new file mode 100644 index 00000000..32651382 --- /dev/null +++ b/samples/cpp/serialization.cpp @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include "../utils/helpers.h" + +#include + +TEST_CASE("CSBR Graph with serialization", "[conv][graph][serialization]") { + enum UIDs { + x_tensor, + w_tensor, + y_tensor, + scale_tensor, + bias_tensor, + }; + +#if (CUDNN_VERSION < 8905) + SKIP("Serialization tests is not supported in cudnn versions prior to 8.9.5"); +#endif + + int64_t n = 8, c = 32, h = 16, w = 16, k = 64, r = 3, s = 3; + + cudnnHandle_t handle; // Handle to use during deserialize and execute + + checkCudnnErr(cudnnCreate(&handle)); + + auto build_and_validate_graph_helper = + [](int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) + -> std::shared_ptr { + auto graph = std::make_shared(); + graph->set_io_data_type(cudnn_frontend::DataType_t::HALF) + .set_intermediate_data_type(cudnn_frontend::DataType_t::FLOAT) + .set_compute_data_type(cudnn_frontend::DataType_t::FLOAT); + + auto X = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_name("image") + .set_uid(x_tensor) + .set_dim({n, c, h, w}) + .set_stride({c * h * w, 1, c * w, c})); + + auto W = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_uid(w_tensor) + .set_name("filter") + .set_dim({k, c, r, s}) + .set_stride({c * r * s, 1, c * s, c})); + + auto conv_options = + cudnn_frontend::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}); + auto conv_output = graph->conv_fprop(X, W, conv_options); + + auto S = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_uid(scale_tensor) + .set_name("scale") + .set_dim({1, k, 1, 1}) + .set_stride({k, 1, k, k})); + auto scale_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::MUL); + auto scale_output = graph->pointwise(conv_output, S, scale_options); + + auto B = graph->tensor(cudnn_frontend::graph::Tensor_attributes() + .set_name("bias") + .set_uid(bias_tensor) + .set_dim({1, k, 1, 1}) + .set_stride({k, 1, k, k})); + auto bias_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::ADD); + auto bias_output = graph->pointwise(scale_output, B, bias_options); + + auto relu_options = + cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::RELU_FWD); + auto Y = graph->pointwise(bias_output, relu_options); + Y->set_output(true).set_uid(y_tensor); + + REQUIRE(graph->validate().is_good()); + + return graph; + }; + + // Check support + + auto check_support = [build_and_validate_graph_helper]( + int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) -> bool { + cudnnHandle_t handle; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper(n, c, h, w, k, r, s); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + cudnnDestroy(handle); + + return true; + }; + + // Serialization Phase + + auto serialize = + [build_and_validate_graph_helper]( + int64_t n, int64_t c, int64_t h, int64_t w, int64_t k, int64_t r, int64_t s) -> std::vector { + cudnnHandle_t handle; + + std::vector serialized_data; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper(n, c, h, w, k, r, s); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + 
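+        // The serialize lambda finishes building the execution plans on a locally created handle and then captures the built graph into a byte vector; a different handle is used later to deserialize and execute it.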
REQUIRE(graph->check_support(handle).is_good()); + + REQUIRE(graph->build_plans(handle).is_good()); + + // Insert auto-tuning logic here + + REQUIRE(graph->serialize(serialized_data).is_good()); + + cudnnDestroy(handle); + + return serialized_data; + }; + + auto deserialize = [](cudnnHandle_t handle, + std::vector const& data) -> std::shared_ptr { + auto graph = std::make_shared(); + + REQUIRE(graph->deserialize(handle, data).is_good()); + + return graph; + }; + + // Check if the graph is supported + REQUIRE(check_support(n, c, h, w, k, r, s)); + + // Serialize the graph. + auto serialize_data = serialize(n, c, h, w, k, r, s); + + // Deserialize the graph and execute + auto graph = deserialize(handle, serialize_data); + + Surface x_device_memory(n * c * h * w, false); + Surface w_device_memory(k * c * r * s, false); + Surface s_device_memory(k, false); + Surface b_device_memory(k, false); + Surface y_device_memory(n * k * h * w, false); // Should be p, q. + + Surface workspace(graph->get_workspace_size(), false); + + std::unordered_map variant_pack = {{x_tensor, x_device_memory.devPtr}, + {w_tensor, w_device_memory.devPtr}, + {scale_tensor, s_device_memory.devPtr}, + {bias_tensor, b_device_memory.devPtr}, + {y_tensor, y_device_memory.devPtr}}; + + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); + + cudnnDestroy(handle); +} + +TEST_CASE("SDPA Graph with serialization", "[sdpa][graph][serialization]") { + int64_t b = 12; // batch size + int64_t h = 6; // number of heads + int64_t s_q = 1024; // q tensor is padded to this seq length + int64_t s_kv = 1024; // k and v tensors are padded to this seq length + int64_t d = 128; // hidden dim + +#if (CUDNN_VERSION < 8905) + SKIP("Serialization tests are not supported in cudnn versions prior to 8.9.5"); +#endif + + // Mode of sdpa operation + bool is_inference = true; + + // attention scale + bool is_attn_scale = true; + float attn_scale_cpu = 0.5f; + + // Dropout configuration + bool use_dropout_with_rng = true; + float dropout_probability = 0.1f; + + enum UIDs { uid_Q, uid_K, uid_V, uid_ATTN_SCALE, uid_SEED, uid_OFFSET, uid_O, uid_STATS }; + + auto build_and_validate_graph_helper = + [](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> std::shared_ptr { + namespace fe = cudnn_frontend; + + auto graph = std::make_shared(); + + graph->set_io_data_type(fe::DataType_t::HALF) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + auto Q = graph->tensor(fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim({b, h, s_q, d}) + .set_uid(uid_Q) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes() + .set_name("K") + .set_uid(uid_K) + .set_dim({b, h, s_kv, d}) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes() + .set_name("V") + .set_uid(uid_V) + .set_dim({b, h, s_kv, d}) + .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1})); + + auto attn_scale = is_attn_scale ?
graph->tensor(fe::graph::Tensor_attributes() + .set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_uid(uid_ATTN_SCALE) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)) + : nullptr; + + auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention").set_is_inference(is_inference); + + sdpa_options.set_causal_mask(true); + sdpa_options.set_alibi_mask(true); + + if (is_attn_scale) { + sdpa_options.set_attn_scale(attn_scale); + }; + + auto seed = use_dropout_with_rng ? graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_uid(uid_SEED) + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)) + : nullptr; + + auto offset = use_dropout_with_rng ? graph->tensor(fe::graph::Tensor_attributes() + .set_uid(uid_OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)) + : nullptr; + + if (use_dropout_with_rng) { + sdpa_options.set_dropout(dropout_probability, seed, offset); + } + + auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); + + O->set_output(true).set_dim({b, h, s_q, d}).set_uid(uid_O).set_stride({h * d, d, b * h * d, 1}); + + // Check that Stats tensor is real, which is only when its training step + if (is_inference) { + REQUIRE(stats == nullptr); + } else { + stats->set_output(true).set_uid(uid_STATS).set_data_type(fe::DataType_t::FLOAT); + } + + REQUIRE(graph->validate().is_good()); + + return graph; + }; + + auto check_support = [build_and_validate_graph_helper](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> bool { + cudnnHandle_t handle; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper( + b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + cudnnDestroy(handle); + + return true; + }; + + auto serialize = [build_and_validate_graph_helper](int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_attn_scale, + bool is_inference, + bool use_dropout_with_rng, + float dropout_probability) -> std::vector { + cudnnHandle_t handle; + + std::vector serialized_data; + + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = build_and_validate_graph_helper( + b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + REQUIRE(graph->build_operation_graph(handle).is_good()); + + REQUIRE(graph->create_execution_plans({cudnn_frontend::HeurMode_t::A}).is_good()); + + REQUIRE(graph->check_support(handle).is_good()); + + REQUIRE(graph->build_plans(handle).is_good()); + + // Insert auto-tuning logic here + + REQUIRE(graph->serialize(serialized_data).is_good()); + + cudnnDestroy(handle); + + return serialized_data; + }; + + auto deserialize = [](cudnnHandle_t handle, + std::vector const& data) -> std::shared_ptr { + auto graph = std::make_shared(); + + REQUIRE(graph->deserialize(handle, data).is_good()); + + return graph; + }; + + // Check support + REQUIRE(check_support(b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability)); + + // Serialize the graph. 
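+    // The deserialized graph is executed further below with a variant pack keyed by the UID enum values assigned to the tensors above.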
+ auto serialize_data = + serialize(b, h, s_q, s_kv, d, is_attn_scale, is_inference, use_dropout_with_rng, dropout_probability); + + cudnnHandle_t handle; + checkCudnnErr(cudnnCreate(&handle)); + + auto graph = deserialize(handle, serialize_data); + + //// Build variant pack + Surface qkvTensor(b * s_q * 3 * h * d, false); + Surface oTensor(b * s_q * h * d, false); + void* devPtrQ = qkvTensor.devPtr; + void* devPtrK = (qkvTensor.devPtr + d); + void* devPtrV = (qkvTensor.devPtr + 2 * d); + void* devPtrO = oTensor.devPtr; + + int32_t scaleSize = 1; + int32_t seed_value = 123456; + Surface dropoutSeed(scaleSize, false, seed_value); + Surface dropoutOffset(scaleSize, false, (int32_t)1); + + Surface workspace(graph->get_workspace_size(), false); + + std::cout << "Graph requires workspace " << graph->get_workspace_size() << std::endl; + + std::unordered_map variant_pack = {{uid_Q, devPtrQ}, + {uid_K, devPtrK}, + {uid_V, devPtrV}, + {uid_ATTN_SCALE, &attn_scale_cpu}, + {uid_SEED, dropoutSeed.devPtr}, + {uid_OFFSET, dropoutOffset.devPtr}, + {uid_O, devPtrO}}; + + REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good()); + + checkCudnnErr(cudnnDestroy(handle)); +} \ No newline at end of file diff --git a/samples/cpp/wgrads.cpp b/samples/cpp/wgrads.cpp index e2bb4e64..dfcec459 100644 --- a/samples/cpp/wgrads.cpp +++ b/samples/cpp/wgrads.cpp @@ -27,6 +27,9 @@ TEST_CASE("Convolution Wgrad", "[wgrad][graph][wgrad][Conv_wgrad]") { namespace fe = cudnn_frontend; + if (is_arch_supported_by_cudnn() == false) { + SKIP("Architecture is not supported by currend cudnn version"); + } fe::graph::Graph graph; graph.set_io_data_type(fe::DataType_t::HALF) .set_intermediate_data_type(fe::DataType_t::HALF) @@ -135,4 +138,4 @@ TEST_CASE("Wgrad Graph", "[wgrad][graph][scale-bias-relu-wgrad][ConvBNwgrad]") { {DW, dw_tensor.devPtr}}; REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good()); cudnnDestroy(handle); -} \ No newline at end of file +} diff --git a/samples/legacy_samples/helpers.cpp b/samples/legacy_samples/helpers.cpp index 2cfcd162..ac0abc4f 100644 --- a/samples/legacy_samples/helpers.cpp +++ b/samples/legacy_samples/helpers.cpp @@ -47,6 +47,14 @@ is_hopper_arch() { return (90 <= cc); } +bool +is_arch_supported_by_cudnn() { + if (cudnnGetVersion() < 8600 && (is_hopper_arch() || is_ada_arch())) { + return false; + } + return true; +} + bool check_device_arch_newer_than(std::string const& arch) { size_t arch_major = 6; diff --git a/samples/legacy_samples/norm_samples.h b/samples/legacy_samples/norm_samples.h index ee7c9ba8..480c3aa7 100644 --- a/samples/legacy_samples/norm_samples.h +++ b/samples/legacy_samples/norm_samples.h @@ -33,7 +33,6 @@ #include #include -#include #include /** diff --git a/samples/utils/error_util.h b/samples/utils/error_util.h index 3980fea7..c8abd199 100644 --- a/samples/utils/error_util.h +++ b/samples/utils/error_util.h @@ -23,11 +23,14 @@ #if !defined(_ERROR_UTIL_H_) #define _ERROR_UTIL_H_ +#include #include #include #include #include +#include + #define TOSTR_(s) #s #define TOSTR(s) TOSTR_(s) #if defined(__GNUC__) @@ -100,6 +103,24 @@ } \ } +namespace cudnn_frontend { +static inline void +throw_if(std::function expr, [[maybe_unused]] const char *message, [[maybe_unused]] cudnnStatus_t status) { + if (expr()) { +#ifndef NV_CUDNN_DISABLE_EXCEPTION + throw cudnn_frontend::cudnnException(message, status); +#endif + } +} +static inline void +throw_if(bool expr, [[maybe_unused]] const char *message, [[maybe_unused]] cudnnStatus_t status) { + if (expr) { 
+#ifndef NV_CUDNN_DISABLE_EXCEPTION + throw cudnn_frontend::cudnnException(message, status); +#endif + } +} +} // namespace cudnn_frontend // CUDA Utility Helper Functions static void diff --git a/samples/utils/helpers.h b/samples/utils/helpers.h index 3583f3f0..90badc14 100644 --- a/samples/utils/helpers.h +++ b/samples/utils/helpers.h @@ -63,6 +63,8 @@ bool is_hopper_arch(); bool check_device_arch_newer_than(std::string const& arch); +bool +is_arch_supported_by_cudnn(); int64_t getFwdConvDilatedFilterDim(int64_t filterDim, int64_t dilation); diff --git a/setup.py b/setup.py index 4a845eae..335e35c8 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ def build_extension(self, ext: CMakeExtension) -> None: # logic and declaration, and simpler if you include description/version in a file. setup( name="cudnn", - version="1.0.3", + version="1.1.0", author="", author_email="", description="cudnn_frontend python package", diff --git a/samples/python/test_apply_rope.py b/test/python_fe/test_apply_rope.py similarity index 100% rename from samples/python/test_apply_rope.py rename to test/python_fe/test_apply_rope.py diff --git a/samples/python/test_batchnorm.py b/test/python_fe/test_batchnorm.py similarity index 100% rename from samples/python/test_batchnorm.py rename to test/python_fe/test_batchnorm.py diff --git a/samples/python/test_conv_bias.py b/test/python_fe/test_conv_bias.py similarity index 95% rename from samples/python/test_conv_bias.py rename to test/python_fe/test_conv_bias.py index 454e5fc9..98a1c920 100644 --- a/samples/python/test_conv_bias.py +++ b/test/python_fe/test_conv_bias.py @@ -1,4 +1,5 @@ import cudnn +import pytest import torch def convert_to_cudnn_type(torch_type): @@ -17,12 +18,14 @@ def forward(self, x, w, b = None, padding = [1,1], stride = [1,1], dilation = [1 return torch.nn.functional.relu(conv_output) def test_conv_bias_relu(): + torch.manual_seed(0) + # Reference code X_gpu = torch.randn(4, 16, 56, 56, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) W_gpu = torch.randn(16, 16, 3, 3, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) B_gpu = torch.randn(1, 16, 1, 1, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) - padding = [0,1] - stride = [2,3] + padding = [1,1] + stride = [3,3] dilation = [1,1] model = CSBR().eval().to("cuda").to(torch.float16) Y_expected = model(X_gpu, W_gpu, b = B_gpu, padding = padding, stride = stride, dilation = dilation) @@ -37,7 +40,7 @@ def test_conv_bias_relu(): W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) B = graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = convert_to_cudnn_type(B_gpu.dtype)) - conv_output = graph.conv_fprop(image = X, weight = W, padding = padding, stride = stride, dilation = dilation) + conv_output = graph.conv_fprop(image = X, weight = W, pre_padding = padding, post_padding = padding, stride = stride, dilation = dilation) bias_output = graph.bias(name = "bias", input = conv_output, bias = B) @@ -55,7 +58,7 @@ def test_conv_bias_relu(): Y_actual = torch.zeros_like(Y_expected) graph.execute({X: X_gpu, W: W_gpu, B: B_gpu, Y: Y_actual}, workspace) - torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(Y_expected, Y_actual, atol=0.05, rtol=1e-2) cudnn.destroy_handle(handle) @@ -171,6 +174,7 @@ def dleaky_relu(grad: 
torch.Tensor, mask: torch.Tensor, negative_slope: float): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-4, rtol=1e-4) +@pytest.mark.skipif(cudnn.backend_version() < 8600, reason="requires cudnn 8.6.0 or higher") def test_conv_int8(): N, C, H, W = 1, 64, 32, 32 K, R, S = 4, 3, 3 @@ -215,8 +219,8 @@ def test_conv_int8(): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) if __name__ == "__main__": - # test_conv_int8() - # test_conv_relu() + test_conv_int8() + test_conv_relu() test_conv_bias_relu() - # test_conv3d_bias_leaky_relu() - # test_leaky_relu_backward() \ No newline at end of file + test_conv3d_bias_leaky_relu() + test_leaky_relu_backward() diff --git a/samples/python/test_conv_genstats.py b/test/python_fe/test_conv_genstats.py similarity index 100% rename from samples/python/test_conv_genstats.py rename to test/python_fe/test_conv_genstats.py diff --git a/samples/python/test_conv_reduction.py b/test/python_fe/test_conv_reduction.py similarity index 100% rename from samples/python/test_conv_reduction.py rename to test/python_fe/test_conv_reduction.py diff --git a/samples/python/test_instancenorm.py b/test/python_fe/test_instancenorm.py similarity index 100% rename from samples/python/test_instancenorm.py rename to test/python_fe/test_instancenorm.py diff --git a/samples/python/test_layernorm.py b/test/python_fe/test_layernorm.py similarity index 95% rename from samples/python/test_layernorm.py rename to test/python_fe/test_layernorm.py index ffe96c7f..e6d4887f 100644 --- a/samples/python/test_layernorm.py +++ b/test/python_fe/test_layernorm.py @@ -28,7 +28,7 @@ def param_extract(request): return request.param @pytest.mark.skipif(cudnn.backend_version() < 8905, reason="LN not supported below cudnn 8.9.5") -def test_in(param_extract): +def test_layernorm(param_extract): torch.manual_seed(0) embedding_dim, input_type = param_extract @@ -48,12 +48,10 @@ def test_in(param_extract): bias_gpu = 7*torch.randn(1, C, H, W, requires_grad=True, device="cuda", dtype=input_type).to(memory_format=torch.channels_last) -2 epsilon_cpu = torch.full((1, 1, 1, 1), epsilon_value, requires_grad=False, device="cpu", dtype=torch.float32) - print("Running reference") Y_expected = torch.nn.functional.layer_norm(x_gpu, [C, H, W], weight=scale_gpu.squeeze(0), bias=bias_gpu.squeeze(0), eps=epsilon_value) mean_expected = x_gpu.to(torch.float32).mean(dim=(1, 2, 3), keepdim=True) inv_var_expected = torch.rsqrt(torch.var(x_gpu.to(torch.float32), dim=(1, 2, 3), keepdim=True) + epsilon_value) - print("Building cudnn graph") graph = cudnn.pygraph(intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) @@ -84,7 +82,6 @@ def test_in(param_extract): inv_var_actual = torch.empty_like(inv_var_expected) workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) - print("Executing cudnn graph") graph.execute({ X : x_gpu.detach() @@ -96,11 +93,9 @@ def test_in(param_extract): , inv_var: inv_var_actual }, workspace) - print("Comparing with reference") torch.testing.assert_close(Y_expected, Y_actual, atol=atol, rtol=rtol) torch.testing.assert_close(mean_expected, mean_actual, atol=atol, rtol=rtol) torch.testing.assert_close(inv_var_expected, inv_var_actual, atol=atol, rtol=rtol) - print("Success!!") target = torch.randn_like(Y_expected) criterion = torch.nn.MSELoss() @@ -143,7 +138,6 @@ def test_in(param_extract): Dbias_actual = torch.empty_like(bias_gpu) workspace = torch.empty(bwd_graph.get_workspace_size(), device="cuda", 
dtype=torch.uint8) - print("Executing cudnn bwd_graph") bwd_graph.execute({ X_bwd : x_gpu.detach() @@ -156,11 +150,9 @@ def test_in(param_extract): , Dbias: Dbias_actual }, workspace) - print("Comparing with reference") torch.testing.assert_close(x_gpu.grad, DX_actual, atol=2e-4, rtol=2e-4) torch.testing.assert_close(scale_gpu.grad, DScale_actual, atol=2e-4, rtol=2e-4) torch.testing.assert_close(bias_gpu.grad, Dbias_actual, atol=2e-4, rtol=2e-4) - print("Success!!") if __name__ == "__main__": - test_in((1600, torch.bfloat16)) \ No newline at end of file + test_layernorm((1600, torch.bfloat16)) \ No newline at end of file diff --git a/samples/python/test_matmul_bias_relu.py b/test/python_fe/test_matmul_bias_relu.py similarity index 74% rename from samples/python/test_matmul_bias_relu.py rename to test/python_fe/test_matmul_bias_relu.py index 1f56cc4e..745d014a 100644 --- a/samples/python/test_matmul_bias_relu.py +++ b/test/python_fe/test_matmul_bias_relu.py @@ -21,7 +21,52 @@ def convert_to_cudnn_type(torch_type): def get_cc(): (major, minor) = torch.cuda.get_device_capability() - return major*10 + minor + return major*10 + minor + +def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): + assert expected.shape == actual.shape + + expected = expected.float().cuda().flatten() + actual = actual.float().cuda().flatten() + + n_elem = torch.numel(expected) + + mae = (expected - actual).abs().mean().item() + perr = ((expected - actual).abs().sum() / expected.abs().sum()).item() + snr = (expected**2).mean().sqrt() / ((expected - actual) ** 2).mean().sqrt() + snr_db = (10 * torch.log10(snr)).item() + + absolute_error = (expected - actual).abs() + relative_error = absolute_error / torch.where(expected.abs() < fudge, fudge, expected.abs()) + + abs_error_indices = absolute_error > atol + rel_error_indices = relative_error > rtol + n_abs_errors = torch.sum(abs_error_indices) + n_rel_errors = torch.sum(rel_error_indices) + error_indices = torch.logical_and(abs_error_indices, rel_error_indices) + n_errors = torch.sum(error_indices) + + n_nans = torch.isnan(actual).sum() + n_zeros = n_elem - torch.count_nonzero(actual) + + if n_errors != 0: + print(f"========== Comparison for {name} ==========") + print(f"Absolute Tolerance = {atol}") + print(f"Relative Tolerance = {rtol}") + print(f"Number of elements = {n_elem}") + print(f"Number of absolute errors = {n_abs_errors} ({n_abs_errors * 100 / n_elem:.2f}%)") + print(f"Number of relative errors = {n_rel_errors} ({n_rel_errors * 100 / n_elem:.2f}%)") + print(f"Number of errors (absolute and relative) = {n_errors} ({(n_errors * 100)/n_elem:.2f}%)") + print(f"Maximum absolute error = {absolute_error.max():.4f}") + print(f"Maximum relative error = {relative_error.max():.4f}") + print(f"Mean average error = {mae:.4f}") + print(f"Perr error = {perr:.4f} = 1/{(1/perr) if perr != 0 else float('inf'):.2f}") + print(f"Signal to noise ratio = {snr.item():.2f} = {snr_db:.2f}dB") + print(f"Number of Nans = {n_nans} ({n_nans * 100 / n_elem:.2f}%)") + print(f"Number of Zeros = {n_zeros} ({n_zeros * 100 / n_elem:.2f}%)") + print("===================================\n") + + return n_errors @pytest.mark.skipif(cudnn.backend_version() < 8906, reason="requires cudnn 8.9.6 or higher") @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 9, reason="requires Hopper or newer arch") @@ -82,10 +127,12 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): A_gpu = torch.randint(4, (B, M, K), requires_grad=False, device="cuda", 
dtype=A_data_type) - 1 if B_data_type != torch.int8: - B_gpu = 3 * torch.randn(B, K, N, requires_grad=False, device="cuda", dtype=B_data_type) - 1.25 + B_gpu_strided = 3 * torch.randn(B, K, N, requires_grad=False, device="cuda", dtype=B_data_type) - 1.25 else: - B_gpu = torch.randint(3, (B, K, N), requires_grad=False, device="cuda", dtype=B_data_type) - 2 - + B_gpu_strided = torch.randint(3, (B, K, N), requires_grad=False, device="cuda", dtype=B_data_type).contiguous() - 2 + + B_gpu = torch.as_strided(B_gpu_strided, (B, K, N), (N*K, 1, N)) + # Make cudnn graph graph = cudnn.pygraph() @@ -123,7 +170,7 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): graph.execute({A: A_gpu, B: B_gpu, C: C_actual}, workspace) # compare'em - torch.testing.assert_close(C_expected, C_actual) + compare_tensors(C_expected, C_actual, "output", atol=1e-4, rtol=1e-4) problem_size_options = [(1, 128, 768) , (16, 512, 1600) diff --git a/samples/python/test_mhas.py b/test/python_fe/test_mhas.py similarity index 69% rename from samples/python/test_mhas.py rename to test/python_fe/test_mhas.py index 76b4598a..5e6f7a6e 100644 --- a/samples/python/test_mhas.py +++ b/test/python_fe/test_mhas.py @@ -69,28 +69,6 @@ def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): return n_errors + n_nans -def get_alibi_slopes(n_heads, device="cuda"): - # Get the closest power of 2 to `n_heads`. - # If `n_heads` is not a power of 2, then we first calculate slopes to the closest (smaller) power of 2, - # and then add the remaining slopes. - n = 2 ** math.floor(math.log2(n_heads)) - m_0 = 2.0 ** (-8.0 / n) - m = torch.pow(m_0, torch.arange(1, 1 + n)) - - # If `n_heads` is not a power of 2, then we add the remaining slopes. - # We calculate the remaining slopes for $n * 2$ (avoiding slopes added previously). - # And pick the slopes upto `n_heads`. - if n < n_heads: - m_hat_0 = 2.0 ** (-4.0 / n) - m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (n_heads - n), 2)) - # Concatenate the slopes with the remaining slopes. - m = torch.cat([m, m_hat]) - - # Reshape the tensor to [1, num_heads, 1, 1] - m = m.view(1, -1, 1, 1).to(device=device) - return m - - def compute_ref( q, k, @@ -163,7 +141,27 @@ def compute_ref( index_row = torch.arange(s_q, dtype=torch.float32, device=device).view(-1, 1) index_col = torch.arange(s_kv, dtype=torch.float32, device=device) distance = index_col - index_row - alibi_mask = distance.to(dtype=torch.float32) * get_alibi_slopes(h_q, device=device) + + # Get the closest power of 2 to `n_heads`. + # If `n_heads` is not a power of 2, then we first calculate slopes to the closest (smaller) power of 2, + # and then add the remaining slopes. + n = 2 ** math.floor(math.log2(h_q)) + m_0 = 2.0 ** (-8.0 / n) + m = torch.pow(m_0, torch.arange(1, 1 + n)) + + # If `n_heads` is not a power of 2, then we add the remaining slopes. + # We calculate the remaining slopes for $n * 2$ (avoiding slopes added previously). + # And pick the slopes upto `n_heads`. + if n < h_q: + m_hat_0 = 2.0 ** (-4.0 / n) + m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (h_q - n), 2)) + # Concatenate the slopes with the remaining slopes. 
+ m = torch.cat([m, m_hat]) + + # Reshape the tensor to [1, num_heads, 1, 1] + m = m.view(1, -1, 1, 1).to(device=device) + + alibi_mask = distance.to(dtype=torch.float32) * m s = s + alibi_mask if padding is not None: s = s.masked_fill(s_mask, float("-inf")) @@ -202,6 +200,7 @@ def compute_ref( padding_mask_options = [False, True] causal_mask_options = [False, True] dropout_options = [False, True] +ragged_options = [False, True] is_infer_options = [False, True] all_options_forward = [ @@ -216,6 +215,7 @@ def compute_ref( padding_mask_options, causal_mask_options, dropout_options, + ragged_options, is_infer_options, ] ) @@ -233,6 +233,7 @@ def compute_ref( padding_mask_options, causal_mask_options, dropout_options, + ragged_options, ] ) ] @@ -308,15 +309,63 @@ def generate_layout(layout, head_group, shape_q, shape_k, shape_v, shape_o): return stride_q, stride_k, stride_v, stride_o, offset_q, offset_k, offset_v +def compute_exclusive_prefix_sum(tensor): + # tensor has shape (B, 1, 1, 1) + # output has shape (B+1, 1, 1, 1) + # ex) tensor = [[[[2, 4, 1, 6]]]] + # output = [[[[0, 2, 6, 7, 13]]]] + assert tensor.size(1) == tensor.size(2) == tensor.size(3) == 1 + return torch.cat((torch.zeros(1, 1, 1, 1, dtype=tensor.dtype, device=tensor.device), torch.cumsum(tensor, dim=0))) + + +def convert_ragged_to_uniform(ragged_tensor, ragged_offset): + # limitations: + # 1. tensor is non-interleaved with bhsd dim order and bshd stride order + # 2. ragged tensor is packed and in-order, therefore + # ragged offset is monotonically increasing + assert ragged_tensor.dim() == 4 + b, h, s, d = ragged_tensor.size() + b_stride, h_stride, s_stride, d_stride = ragged_tensor.stride() + assert b_stride >= s_stride >= h_stride >= d_stride + assert ragged_offset.dim() == 4 and (b + 1, 1, 1, 1) == ragged_offset.size() + + # ragged offset is given in 4D, convert to 1D locally + ragged_offset = ragged_offset.flatten() + + # convert bhsd to bshd and flatten + ragged_tensor_flat = torch.einsum("bhsd->bshd", ragged_tensor).flatten() + uniform_tensor_flat = torch.zeros_like(ragged_tensor_flat) + + # copy + for i, num_elements in enumerate(ragged_offset[1:] - ragged_offset[:-1]): + unif_a = i * s * h * d + unif_b = unif_a + num_elements + ragg_a = ragged_offset[i] + ragg_b = ragg_a + num_elements + uniform_tensor_flat[unif_a:unif_b] = ragged_tensor_flat[ragg_a:ragg_b] + + # unflatten and convert bshd to bhsd + uniform_tensor = uniform_tensor_flat.view(b, s, h, d) + uniform_tensor = torch.einsum("bshd->bhsd", uniform_tensor) + return uniform_tensor + + @pytest.fixture(params=all_options_forward) def param_extract_forward(request): return request.param -@pytest.mark.skipif(cudnn.backend_version() < 8903, reason="requires cudnn 8.9.3 or higher") -def test_sdpa(param_extract_forward): - ( - input_type, +@pytest.mark.parametrize("input_type", input_type_options) +@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("head_group", head_group_options) +@pytest.mark.parametrize("is_bias", bias_options) +@pytest.mark.parametrize("is_alibi", alibi_mask_options) +@pytest.mark.parametrize("is_padding", padding_mask_options) +@pytest.mark.parametrize("is_causal", causal_mask_options) +@pytest.mark.parametrize("is_dropout", dropout_options) +@pytest.mark.parametrize("is_ragged", ragged_options) +@pytest.mark.parametrize("is_infer", is_infer_options) +def test_sdpa(input_type, layout, head_group, is_bias, @@ -324,8 +373,10 @@ def test_sdpa(param_extract_forward): is_padding, is_causal, is_dropout, - is_infer, - ) = 
param_extract_forward + is_ragged, + is_infer): + if cudnn.backend_version() < 8903: + pytest.skip("SDPA fprop requires cudnn 8.9.3 or higher") if head_group != "multi_head" and cudnn.backend_version() < 8907: pytest.skip("GQA and MQA is only supported 8.9.7 onwards.") @@ -339,16 +390,28 @@ def test_sdpa(param_extract_forward): if is_dropout and cudnn.backend_version() < 8906: pytest.skip("Dropout reference is only supported on 8.9.6 onwards.") + if is_ragged and cudnn.backend_version() < 90000: + pytest.skip("Ragged tensor is only supported 9.0.0 onwards") + + if is_ragged and torch.cuda.get_device_capability()[0] < 9: + pytest.skip("Ragged tensor is only supported hopper") + + if is_ragged and layout != "non_interleaved": + pytest.skip("Ragged tensor is only tested with non-interleaved bshd layout") + + if is_ragged and not is_padding: + pytest.skip("Ragged tensor is only tested with packed variable length tensors") + # batch size b = 2 # query sequence length - s_q = random.choice([256, 512, 1024, 2048]) + s_q = random.choice([8, 16, 24, 32, 256, 512, 1024, 2048]) # key+value sequence length s_kv = random.choice([8, 16, 24, 32, 256, 512, 1024, 2048]) if layout == "non_interleaved" else s_q # query+key embedding dimension per head d_qk = random.choice([32, 56, 64, 128]) # value embedding dimension per head - d_v = random.choice([64, 96, 128]) if layout == "non_interleaved" else d_qk + d_v = random.choice([64, 96, 128]) if (layout == "non_interleaved" and not is_ragged) else d_qk # number of heads h_q = 6 if head_group == "multi_head": @@ -366,12 +429,23 @@ def test_sdpa(param_extract_forward): if d_qk != d_v and cudnn.backend_version() < 8906: pytest.skip("d_qk != d_v is only supported on 8.9.6 onwards.") - if is_dropout and (s_kv % 64 != 0) and cudnn.backend_version() < 90000: - pytest.skip("Dropout mask dump with not-multiple-of-64 seq_kv is not supported.") + if cudnn.backend_version() < 90000: + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and (is_padding or is_dropout): + pytest.skip("s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0") - if ((d_qk % 64 != 0) or (s_kv % 64 != 0)) and cudnn.backend_version() < 8906: + if cudnn.backend_version() < 8906: pytest.skip("d not a multiple of 64, not-multiple-of-64 seq_kv is not supported below 8.9.6") - + + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + # TODO file bug + if d_qk != d_v and is_ragged: + pytest.skip("d_qk != d_v is not supported with ragged offset") + print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") attn_scale = 0.125 @@ -409,6 +483,11 @@ def test_sdpa(param_extract_forward): rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None + q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None + k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None + v_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_v * d_v).int() if is_ragged else None + o_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_v).int() if is_ragged else None + o_gpu = torch.empty(b * h_q * s_q * d_v, dtype=input_type, device="cuda").as_strided(shape_o, stride_o) stats_gpu = torch.empty(b, h_q, s_q, 1, 
dtype=torch.float32, device="cuda") if not is_infer else None @@ -418,6 +497,7 @@ def test_sdpa(param_extract_forward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -434,6 +514,16 @@ def test_sdpa(param_extract_forward): rng_dump = graph.tensor_like(rng_dump_gpu) if is_dropout else None + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o, stats = graph.sdpa( name="sdpa", q=q, @@ -452,6 +542,9 @@ def test_sdpa(param_extract_forward): ) o.set_output(True).set_dim(shape_o).set_stride(stride_o) + if is_ragged: + o.set_ragged_offset(o_ragged_offset) + if is_infer == False: stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) @@ -468,6 +561,10 @@ def test_sdpa(param_extract_forward): bias: bias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, o: o_gpu, stats: stats_gpu, rng_dump: rng_dump_gpu, @@ -481,10 +578,16 @@ def test_sdpa(param_extract_forward): graph.execute(variant_pack, workspace) torch.cuda.synchronize() + # compare with torch autograd reference q_ref = q_gpu.detach().float() k_ref = k_gpu.detach().float() v_ref = v_gpu.detach().float() + if is_ragged: + q_ref = convert_ragged_to_uniform(q_ref, q_ragged_offset_gpu.detach()) + k_ref = convert_ragged_to_uniform(k_ref, k_ragged_offset_gpu.detach()) + v_ref = convert_ragged_to_uniform(v_ref, v_ragged_offset_gpu.detach()) + if is_bias: bias_ref = bias_gpu.detach().float() @@ -513,6 +616,9 @@ def test_sdpa(param_extract_forward): else: o_ref = ret + if is_ragged: + o_gpu = convert_ragged_to_uniform(o_gpu, o_ragged_offset_gpu.detach()) + if is_padding: # zero out padded region of the output for comparison for i, m in enumerate(seq_len_q_ref): @@ -527,15 +633,16 @@ def test_sdpa(param_extract_forward): assert compare_tensors(stats_ref, stats_gpu, "stats") == 0 -@pytest.fixture(params=all_options_backward) -def param_extract_backward(request): - return request.param - - -@pytest.mark.skipif(cudnn.backend_version() < 8903, reason="requires cudnn 8.9.3 or higher") -def test_sdpa_backward(param_extract_backward): - ( - input_type, +@pytest.mark.parametrize("input_type", input_type_options) +@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("head_group", head_group_options) +@pytest.mark.parametrize("is_bias", bias_options) +@pytest.mark.parametrize("is_alibi", alibi_mask_options) +@pytest.mark.parametrize("is_padding", padding_mask_options) +@pytest.mark.parametrize("is_causal", causal_mask_options) +@pytest.mark.parametrize("is_dropout", dropout_options) +@pytest.mark.parametrize("is_ragged", ragged_options) +def test_sdpa_backward(input_type, layout, head_group, is_bias, @@ -543,7 +650,9 @@ def test_sdpa_backward(param_extract_backward): is_padding, is_causal, is_dropout, - ) = param_extract_backward + is_ragged): + if cudnn.backend_version() < 8903: + pytest.skip("SDPA bprop requires cudnn 8.9.3 or 
higher") if head_group != "multi_head" and cudnn.backend_version() < 8907: pytest.skip("GQA and MQA is only supported 8.9.7 onwards.") @@ -557,6 +666,9 @@ def test_sdpa_backward(param_extract_backward): if is_bias and is_padding: pytest.skip("dBias is not supported with padding mask") + if is_alibi and not is_causal: + pytest.skip("ALiBi mask is only supported with causal mask") + if is_alibi and cudnn.backend_version() < 8904: pytest.skip("ALiBi mask is only supported 8.9.4 onwards.") @@ -566,19 +678,34 @@ def test_sdpa_backward(param_extract_backward): if is_dropout and cudnn.backend_version() < 8906: pytest.skip("RNG dump is only supported on 8.9.6 onwards.") + if is_ragged and cudnn.backend_version() < 90000: + pytest.skip("Ragged tensor is only supported 9.0.0 onwards") + + if is_ragged and torch.cuda.get_device_capability()[0] < 9: + pytest.skip("Ragged tensor is only supported hopper") + + if is_ragged and layout != "non_interleaved": + pytest.skip("Ragged tensor is only tested with non-interleaved bshd layout") + + if is_ragged and head_group != "multi_head": + pytest.skip("Ragged offset is only supported with multi_head") + + if is_ragged and not is_padding: + pytest.skip("Ragged tensor is only tested with packed variable length tensors") + # test both dP workspace optimization by lowering dP workspace limit to 8MB os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = str(8 * 1024 * 1024) # batch size b = 2 # query sequence length - s_q = random.choice([256, 512, 1024]) + s_q = random.choice([8, 16, 24, 32, 256, 512, 1024]) # key+value sequence length - s_kv = random.choice([32, 256, 512, 1024]) if layout == "non_interleaved" else s_q + s_kv = random.choice([8, 16, 24, 32, 256, 512, 1024]) if layout == "non_interleaved" else s_q # query+key embedding dimension per head d_qk = random.choice([32, 56, 64, 128]) # value embedding dimension per head - d_v = random.choice([64, 96, 128]) if layout == "non_interleaved" else d_qk + d_v = random.choice([64, 96, 128]) if (layout == "non_interleaved" and not is_ragged) else d_qk # number of heads h_q = 6 if head_group == "multi_head": @@ -596,13 +723,27 @@ def test_sdpa_backward(param_extract_backward): if d_qk != d_v and cudnn.backend_version() < 8906: pytest.skip("d_qk != d_v is only supported on 8.9.6 onwards.") - if (s_kv % 64 != 0) and layout == "non_interleaved": - pytest.skip("cudnn backend does not support non-interlaved layout with non-64-aligned seq_kv.") - - if ((d_qk % 64 != 0) or (s_kv % 64 != 0)) and cudnn.backend_version() < 8906: - pytest.skip("d not a multiple of 64, not-multiple-of-64 seq_kv is not supported below 8.9.6") + if (cudnn.backend_version() < 90000): + if (s_q < 64): + pytest.skip("s_q less than 64 is not supported before cudnn 9.0.0") + + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and (is_padding or is_dropout): + pytest.skip("s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0") + + if ((s_q % 64 != 0) or (s_kv % 64 != 0)) and is_bias: + pytest.skip("cudnn backend does not support bias with non-64-aligned seq_q or seq_kv.") + + if (s_kv % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("not-multiple-of-64 seq_kv is not supported below 8.9.6") - print(f"{str(param_extract_backward)} {s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") + if (d_qk % 64 != 0) and cudnn.backend_version() < 8906: + pytest.skip("d not a multiple of 64 is not supported below 8.9.6") + + # TODO file bug + if d_qk != d_v and is_ragged: + pytest.skip("d_qk != d_v is not supported with ragged 
offset") + + print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}") attn_scale = 0.125 dropout_prob = 0.1 if is_dropout else 0.0 @@ -647,6 +788,11 @@ def test_sdpa_backward(param_extract_backward): rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None + q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None + k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None + v_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_v * d_v).int() if is_ragged else None + o_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_v).int() if is_ragged else None + o_gpu = torch.empty(b * h_q * s_q * d_v, dtype=input_type, device="cuda").as_strided(shape_o, stride_o) stats_gpu = torch.empty(b, h_q, s_q, 1, dtype=torch.float32, device="cuda") @@ -656,6 +802,7 @@ def test_sdpa_backward(param_extract_backward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -672,6 +819,16 @@ def test_sdpa_backward(param_extract_backward): rng_dump = graph.tensor_like(rng_dump_gpu) if is_dropout else None + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o, stats = graph.sdpa( name="sdpa", q=q, @@ -690,6 +847,9 @@ def test_sdpa_backward(param_extract_backward): ) o.set_output(True).set_dim(shape_o).set_stride(stride_o) + if is_ragged: + o.set_ragged_offset(o_ragged_offset) + stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) graph.validate() @@ -705,6 +865,10 @@ def test_sdpa_backward(param_extract_backward): bias: bias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, o: o_gpu, stats: stats_gpu, rng_dump: rng_dump_gpu, @@ -730,6 +894,7 @@ def test_sdpa_backward(param_extract_backward): intermediate_data_type=cudnn.data_type.FLOAT, compute_data_type=cudnn.data_type.FLOAT, ) + q = graph.tensor_like(q_gpu) k = graph.tensor_like(k_gpu) v = graph.tensor_like(v_gpu) @@ -748,6 +913,18 @@ def test_sdpa_backward(param_extract_backward): offset = graph.tensor_like(offset_gpu) dropout_tuple = (dropout_prob, seed, offset) + q_ragged_offset = graph.tensor_like(q_ragged_offset_gpu) if is_ragged else None + k_ragged_offset = graph.tensor_like(k_ragged_offset_gpu) if is_ragged else None + v_ragged_offset = graph.tensor_like(v_ragged_offset_gpu) if is_ragged else None + o_ragged_offset = graph.tensor_like(o_ragged_offset_gpu) if is_ragged else None + + if is_ragged: + q.set_ragged_offset(q_ragged_offset) + k.set_ragged_offset(k_ragged_offset) + v.set_ragged_offset(v_ragged_offset) + o.set_ragged_offset(o_ragged_offset) + dO.set_ragged_offset(o_ragged_offset) + dQ, dK, dV = graph.sdpa_backward( name="sdpa_backward", q=q, @@ -770,6 +947,10 @@ def test_sdpa_backward(param_extract_backward): 
dQ.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride()) dK.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride()) dV.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride()) + if is_ragged: + dQ.set_ragged_offset(q_ragged_offset) + dK.set_ragged_offset(k_ragged_offset) + dV.set_ragged_offset(v_ragged_offset) graph.validate() graph.build_operation_graph() @@ -791,6 +972,10 @@ def test_sdpa_backward(param_extract_backward): dBias: dBias_gpu, seq_len_q: seq_len_q_gpu, seq_len_kv: seq_len_kv_gpu, + q_ragged_offset: q_ragged_offset_gpu, + k_ragged_offset: k_ragged_offset_gpu, + v_ragged_offset: v_ragged_offset_gpu, + o_ragged_offset: o_ragged_offset_gpu, } if is_dropout: @@ -810,6 +995,12 @@ def test_sdpa_backward(param_extract_backward): v_ref.requires_grad = True dO_ref = dO_gpu.detach().float() + if is_ragged: + q_ref = convert_ragged_to_uniform(q_ref, q_ragged_offset_gpu.detach()) + k_ref = convert_ragged_to_uniform(k_ref, k_ragged_offset_gpu.detach()) + v_ref = convert_ragged_to_uniform(v_ref, v_ragged_offset_gpu.detach()) + dO_ref = convert_ragged_to_uniform(dO_ref, o_ragged_offset_gpu.detach()) + if is_bias: bias_ref = bias_gpu.detach().float() bias_ref.requires_grad = True @@ -848,6 +1039,11 @@ def test_sdpa_backward(param_extract_backward): if is_bias: dBias_ref = opt_refs.pop(0) + if is_ragged: + dQ_gpu = convert_ragged_to_uniform(dQ_gpu, q_ragged_offset_gpu.detach()) + dK_gpu = convert_ragged_to_uniform(dK_gpu, k_ragged_offset_gpu.detach()) + dV_gpu = convert_ragged_to_uniform(dV_gpu, v_ragged_offset_gpu.detach()) + if is_padding: # zero out padded region of the output for comparison for i, (m, n) in enumerate(zip(seq_len_q_ref, seq_len_kv_ref)): @@ -870,22 +1066,24 @@ def test_sdpa_backward(param_extract_backward): if __name__ == "__main__": """ - option_forward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_infer) - option_backward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout) - test_sdpa((torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False)) - test_sdpa_backward((torch.float16, "bs3hd", "multi_head", False, False, False, False, False)) + option_forward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged, is_infer) + option_backward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged) + test_sdpa(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False, False) + test_sdpa_backward(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False) """ print("==========running forward tests==========") for option in all_options_forward: try: - test_sdpa(option) + print(f"Running {option}") + test_sdpa(*option) except pytest.skip.Exception as e: - print(f"Skipped {option}: {e}") + print(f"Skipped {option}\n{e}") print("==========running backward tests==========") for option in all_options_backward: try: - test_sdpa_backward(option) + print(f"Running {option}") + test_sdpa_backward(*option) except pytest.skip.Exception as e: - print(f"Skipped {option}: {e}") + print(f"Skipped {option}\n{e}") diff --git a/samples/python/test_rmsnorm.py b/test/python_fe/test_rmsnorm.py similarity index 100% rename from samples/python/test_rmsnorm.py rename to test/python_fe/test_rmsnorm.py diff --git a/samples/python/test_wgrads.py b/test/python_fe/test_wgrads.py similarity index 100% rename from samples/python/test_wgrads.py 
rename to test/python_fe/test_wgrads.py
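
The new ragged-tensor SDPA tests above derive cuDNN ragged offsets from the per-batch sequence lengths: an exclusive prefix sum over the batch dimension, scaled by the number of heads times the head dimension of the tensor in question. A minimal sketch of that derivation, using small illustrative sizes rather than the values the tests draw at random:

```
# Minimal sketch (illustrative sizes, not the test's values): build cuDNN ragged
# offsets from per-batch sequence lengths, mirroring compute_exclusive_prefix_sum.
import torch

b, h, d = 2, 3, 4                                # batch, heads, head dim
seq_len = torch.tensor([5, 3]).view(b, 1, 1, 1)  # valid tokens per batch, shape (B, 1, 1, 1)

# exclusive prefix sum over the batch dimension -> shape (B + 1, 1, 1, 1)
prefix = torch.cat(
    (torch.zeros(1, 1, 1, 1, dtype=seq_len.dtype), torch.cumsum(seq_len, dim=0))
)

# ragged offsets count elements, so scale by heads * head_dim,
# as the tests do per tensor (e.g. * h_q * d_qk for q, * h_q * d_v for o)
q_ragged_offset = (prefix * h * d).int()
print(q_ragged_offset.flatten())  # tensor([ 0, 60, 96], dtype=torch.int32)
```

The resulting offsets mark where each batch's packed tokens begin in the q/k/v/o buffers; these are the tensors the tests attach with `set_ragged_offset(...)` and pass in the variant pack.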
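
`compute_ref` now computes the ALiBi head slopes inline instead of calling the removed `get_alibi_slopes` helper. For the six query heads these tests use, the computation reduces to the following standalone check (not part of the test suite); the printed values are the four power-of-two slopes followed by the two extra slopes added for the non-power-of-two head count:

```
# Standalone check of the inlined ALiBi slope computation for h_q = 6 heads
# (6 is the head count used in these tests).
import math
import torch

n_heads = 6
n = 2 ** math.floor(math.log2(n_heads))  # closest power of two <= n_heads -> 4
m = torch.pow(2.0 ** (-8.0 / n), torch.arange(1, 1 + n))
if n < n_heads:
    # remaining slopes for the non-power-of-two head count, as in compute_ref
    m_hat = torch.pow(2.0 ** (-4.0 / n), torch.arange(1, 1 + 2 * (n_heads - n), 2))
    m = torch.cat([m, m_hat])
print(m)  # tensor([0.2500, 0.0625, 0.0156, 0.0039, 0.5000, 0.1250])
```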
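
The SDPA tests also drop the tuple-valued `param_extract_*` fixtures in favor of stacked `pytest.mark.parametrize` decorators, with version- and architecture-dependent constraints expressed as `pytest.skip` calls inside the test body, so each option combination gets its own test id and its own skip reason. Reduced to a toy example with hypothetical option names:

```
# Toy illustration (hypothetical option names) of the parametrization style used
# above: stacked pytest.mark.parametrize decorators instead of a tuple-valued
# fixture, with feature constraints handled by pytest.skip in the body.
import pytest

size_options = [64, 128]
layout_options = ["bshd", "bs3hd"]

@pytest.mark.parametrize("size", size_options)
@pytest.mark.parametrize("layout", layout_options)
def test_example(size, layout):
    if layout == "bs3hd" and size < 128:
        pytest.skip("bs3hd is only exercised with size >= 128 in this toy example")
    assert size in size_options
```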