From 6cf4ee772cb459b35f2302c31b89bd400ffe9d93 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 16 May 2024 14:59:06 +0200
Subject: [PATCH 01/43] use std::shared_mutex

---
 include/rmm/mr/device/statistics_resource_adaptor.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index d072d5886..53e4119d1 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -45,11 +45,10 @@ namespace rmm::mr {
 template <typename Upstream>
 class statistics_resource_adaptor final : public device_memory_resource {
  public:
-  // can be a std::shared_mutex once C++17 is adopted
   using read_lock_t =
-    std::shared_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize read access
+    std::shared_lock<std::shared_mutex>;  ///< Type of lock used to synchronize read access
   using write_lock_t =
-    std::unique_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize write access
+    std::unique_lock<std::shared_mutex>;  ///< Type of lock used to synchronize write access
   /**
    * @brief Utility struct for counting the current, peak, and total value of a number
    */

From d13ee6b213145330cbbba6088bc7355c4bfb80e0 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 16 May 2024 15:00:10 +0200
Subject: [PATCH 02/43] clean up

---
 python/rmm/rmm/_lib/memory_resource.pyx | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 100d18b56..0ee141fe7 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -177,8 +177,7 @@ cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \
 
 cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
         namespace "rmm::mr" nogil:
-    cdef cppclass statistics_resource_adaptor[Upstream](
-            device_memory_resource):
+    cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource):
         struct counter:
             counter()
 
@@ -186,8 +185,7 @@ cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
             int64_t peak
             int64_t total
 
-        statistics_resource_adaptor(
-            Upstream* upstream_mr) except +
+        statistics_resource_adaptor(Upstream* upstream_mr) except +
 
         counter get_bytes_counter() except +
         counter get_allocations_counter() except +
@@ -812,12 +810,11 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         Returns:
             dict: Dictionary containing allocation counts and bytes.
         """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
 
-        counts = (<statistics_resource_adaptor[device_memory_resource]*>(
-            self.c_obj.get()))[0].get_allocations_counter()
-        byte_counts = (<statistics_resource_adaptor[device_memory_resource]*>(
-            self.c_obj.get()))[0].get_bytes_counter()
-
+        counts = deref(mr).get_allocations_counter()
+        byte_counts = deref(mr).get_bytes_counter()
         return {
             "current_bytes": byte_counts.value,
             "current_count": counts.value,

From 25ff814fe97b1622f07e40d4abacf67a53192586 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 16 May 2024 17:42:47 +0200
Subject: [PATCH 03/43] impl. push_counters() and pop_counters()

---
 .../mr/device/statistics_resource_adaptor.hpp | 63 ++++++++++++++++---
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index 53e4119d1..9d99db5aa 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -21,6 +21,7 @@
 #include <cstddef>
 #include <mutex>
 #include <shared_mutex>
+#include <stack>
 
 namespace rmm::mr {
 /**
@@ -82,6 +83,24 @@ class statistics_resource_adaptor final : public device_memory_resource {
       value -= val;
       return *this;
     }
+
+    /**
+     * @brief Add `val` to the current value and update the peak value if necessary
+     *
+     * @note When updating the peak value, we assume that `val` is the inner counter of
+     * `this` on the counter stack so its peak value becomes `this->value + val.peak`.
+     *
+     * @param val Value to add
+     * @return Reference to this object
+     */
+    counter& operator+=(const counter& val)
+    {
+      // We count the peak from value
+      peak = std::max(value + val.peak, peak);
+      value += val.value;
+      total += val.total;
+      return *this;
+    }
   };
 
   /**
@@ -95,6 +114,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
   statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream}
   {
     RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
+    // Initially, we push a single counter pair on the stack
+    push_counters();
   }
 
   statistics_resource_adaptor()                                              = delete;
@@ -130,7 +151,7 @@ class statistics_resource_adaptor final : public device_memory_resource {
   {
     read_lock_t lock(mtx_);
 
-    return bytes_;
+    return counter_stack_.top().first;
   }
 
   /**
@@ -144,7 +165,29 @@ class statistics_resource_adaptor final : public device_memory_resource {
   {
     read_lock_t lock(mtx_);
 
-    return allocations_;
+    return counter_stack_.top().second;
+  }
+
+  std::pair<counter, counter> push_counters()
+  {
+    write_lock_t lock(mtx_);
+    // auto [bytes, allocations] = counter_stack_.top();
+    // bytes.
+    auto ret = counter_stack_.top();
+    counter_stack_.push(std::make_pair(counter{}, counter{}));
+    return ret;
+  }
+
+  std::pair<counter, counter> pop_counters()
+  {
+    write_lock_t lock(mtx_);
+    if (counter_stack_.size() < 2) { throw std::out_of_range("cannot pop the last counter pair"); }
+    auto ret = counter_stack_.top();
+    counter_stack_.pop();
+    // The new top inherits the statistics
+    counter_stack_.top().first += ret.first;
+    counter_stack_.top().second += ret.second;
+    return ret;
   }
 
  private:
@@ -170,8 +213,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
       write_lock_t lock(mtx_);
 
       // Increment the allocation_count_ while we have the lock
-      bytes_ += bytes;
-      allocations_ += 1;
+      counter_stack_.top().first += bytes;
+      counter_stack_.top().second += 1;
     }
 
     return ptr;
@@ -192,8 +235,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
       write_lock_t lock(mtx_);
 
       // Decrement the current allocated counts.
-      bytes_ -= bytes;
-      allocations_ -= 1;
+      counter_stack_.top().first -= bytes;
+      counter_stack_.top().second -= 1;
     }
   }
 
@@ -212,10 +255,10 @@ class statistics_resource_adaptor final : public device_memory_resource {
     return get_upstream_resource() == cast->get_upstream_resource();
   }
 
-  counter bytes_;                        // peak, current and total allocated bytes
-  counter allocations_;                  // peak, current and total allocation count
-  std::shared_timed_mutex mutable mtx_;  // mutex for thread safe access to allocations_
-  Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
+  // Stack of counter pairs <bytes, allocations>
+  std::stack<std::pair<counter, counter>> counter_stack_;
+  std::shared_mutex mutable mtx_;  // mutex for thread safe access to counter_stack_
+  Upstream* upstream_;             // the upstream resource used for satisfying allocation requests
 };
 
 /**

From b4533934ff36420c7be0611f0d361a8575017a70 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 16 May 2024 17:44:06 +0200
Subject: [PATCH 04/43] python bindings

---
 python/rmm/rmm/_lib/memory_resource.pyx | 40 +++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 0ee141fe7..60e692757 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -189,6 +189,8 @@ cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
 
         counter get_bytes_counter() except +
         counter get_allocations_counter() except +
+        pair[counter, counter] pop_counters() except +
+        pair[counter, counter] push_counters() except +
 
 cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \
         namespace "rmm::mr" nogil:
@@ -791,6 +793,9 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         allocations/deallocations performed by an upstream memory resource.
         Includes the ability to query these statistics at any time.
 
+        The resource maintains a stack of counters, use `.push_counters()`
+        and `.pop_counters()` to record statistics at different nested levels.
+
         Parameters
         ----------
         upstream : DeviceMemoryResource
@@ -824,6 +829,41 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
             "total_count": counts.total,
         }
 
+    def pop_counters(self) -> dict:
+        """
+        Pop a counter pair (bytes and allocations) from the stack
+        """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).pop_counters()
+        return {
+            "current_bytes": bytes_and_allocs.first.value,
+            "current_count": bytes_and_allocs.second.value,
+            "peak_bytes": bytes_and_allocs.first.peak,
+            "peak_count": bytes_and_allocs.second.peak,
+            "total_bytes": bytes_and_allocs.first.total,
+            "total_count": bytes_and_allocs.second.total,
+        }
+
+    def push_counters(self) -> dict:
+        """
+        Push a new counter pair (bytes and allocations) on the stack
+        """
+
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).push_counters()
+        return {
+            "current_bytes": bytes_and_allocs.first.value,
+            "current_count": bytes_and_allocs.second.value,
+            "peak_bytes": bytes_and_allocs.first.peak,
+            "peak_count": bytes_and_allocs.second.peak,
+            "total_bytes": bytes_and_allocs.first.total,
+            "total_count": bytes_and_allocs.second.total,
+        }
+
 cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
 
     def __cinit__(

From 1f7daa1d8c8a67106369f13f9bc1887bbd3a7b9f Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 16 May 2024 17:44:17 +0200
Subject: [PATCH 05/43] python tests

---
 python/rmm/rmm/tests/test_rmm.py | 70 ++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index c37fe0298..76fb255d1 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -708,7 +708,77 @@ def test_statistics_resource_adaptor(stats_mr):
         "total_bytes": 12096,
         "total_count": 12,
     }
+
+    # Test the counter stack
+    # push returns the stats from the top before the push
+    assert stats_mr.push_counters() == {  # stats from stack level 0
+        "current_bytes": 0,
+        "current_count": 0,
+        "peak_bytes": 10080,
+        "peak_count": 10,
+        "total_bytes": 12096,
+        "total_count": 12,
+    }
+    b1 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.push_counters() == {  # stats from stack level 1
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 16,
+        "total_count": 1,
+    }
+    del b1
+    gc.collect()
+    # pop returns the stats from the top before the pop. Note, the bytes and
+    # count become negative since we are only deleted `b1` above.
+    assert stats_mr.pop_counters() == {  # stats from stack level 2
+        "current_bytes": -16,
+        "current_count": -1,
+        "peak_bytes": 0,
+        "peak_count": 0,
+        "total_bytes": 0,
+        "total_count": 0,
+    }
+    b1 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.push_counters() == {  # stats from stack level 1
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 32,
+        "total_count": 2,
+    }
+    b2 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.pop_counters() == {  # stats from stack level 2
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 16,
+        "total_count": 1,
+    }
+    assert stats_mr.pop_counters() == {  # stats from stack level 1
+        "current_bytes": 32,
+        "current_count": 2,
+        "peak_bytes": 32,
+        "peak_count": 2,
+        "total_bytes": 48,
+        "total_count": 3,
+    }
+    del b1
+    del b2
     gc.collect()
+    assert stats_mr.allocation_counts == {  # stats from stack level 0
+        "current_bytes": 0,
+        "current_count": 0,
+        "peak_bytes": 10080,
+        "peak_count": 10,
+        "total_bytes": 12144,
+        "total_count": 15,
+    }
+    with pytest.raises(IndexError, match="cannot pop the last counter pair"):
+        stats_mr.pop_counters()
 
 
 def test_tracking_resource_adaptor():

From d6fd147cfbdf340d4089c96952f891ce688de36b Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 17 May 2024 08:24:42 +0200
Subject: [PATCH 06/43] doc

---
 .../mr/device/statistics_resource_adaptor.hpp | 19 +++++++++++++++++--
 python/rmm/rmm/_lib/memory_resource.pyx       |  4 ++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index 9d99db5aa..b135ce0ae 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -37,8 +37,12 @@ namespace rmm::mr {
  * resource in order to satisfy allocation requests, but any existing
  * allocations will be untracked. Tracking statistics stores the current, peak
  * and total memory allocations for both the number of bytes and number of calls
- * to the memory resource. `statistics_resource_adaptor` is intended as a debug
- * adaptor and shouldn't be used in performance-sensitive code.
+ * to the memory resource.
+ * A stack of counters is maintained, use `.push_counters()` and `.pop_counters()`
+ * to track statistics at different nesting levels.
+ *
+ * `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be
+ * used in performance-sensitive code.
  *
  * @tparam Upstream Type of the upstream resource used for
  * allocation/deallocation.
@@ -168,6 +172,12 @@ class statistics_resource_adaptor final : public device_memory_resource {
     return counter_stack_.top().second;
   }
 
+  /**
+   * @brief Push a pair of zero counters on the stack, which becomes the new
+   * counters returned by `get_bytes_counter()` and `get_allocations_counter()`
+   *
+   * @return pair of counters <bytes, allocations> from the stack _before_ the push
+   */
   std::pair<counter, counter> push_counters()
   {
     write_lock_t lock(mtx_);
@@ -178,6 +188,11 @@ class statistics_resource_adaptor final : public device_memory_resource {
     return ret;
   }
 
+  /**
+   * @brief Pop a pair of counters from the stack
+   *
+   * @return pair of counters <bytes, allocations> from the stack _before_ the pop
+   */
   std::pair<counter, counter> pop_counters()
   {
     write_lock_t lock(mtx_);
diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 60e692757..f9c7ef8b6 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -793,8 +793,8 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         allocations/deallocations performed by an upstream memory resource.
         Includes the ability to query these statistics at any time.
 
-        The resource maintains a stack of counters, use `.push_counters()`
-        and `.pop_counters()` to record statistics at different nested levels.
+        A stack of counters is maintained, use `.push_counters()` and
+        `.pop_counters()` to track statistics at different nesting levels.
 
         Parameters
         ----------

From fc49fe923b23fe124b97dd4258ce7ff5155abf86 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 17 May 2024 10:03:25 +0200
Subject: [PATCH 07/43] test_statistics

---
 python/rmm/rmm/statistics.py            |  68 ++++++++++
 python/rmm/rmm/tests/conftest.py        |  21 ++-
 python/rmm/rmm/tests/test_rmm.py        | 132 -------------------
 python/rmm/rmm/tests/test_statistics.py | 167 ++++++++++++++++++++++++
 4 files changed, 249 insertions(+), 139 deletions(-)
 create mode 100644 python/rmm/rmm/statistics.py
 create mode 100644 python/rmm/rmm/tests/test_statistics.py

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
new file mode 100644
index 000000000..7defaafb5
--- /dev/null
+++ b/python/rmm/rmm/statistics.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from contextlib import contextmanager
+
+import rmm.mr
+
+
+def enable_statistics() -> None:
+    """Enable allocation statistics
+
+    This function is idempotent, if statistics has been enabled for the
+    current RMM resource stack, this is a no-op.
+
+    Warning
+    -------
+    This modifies the current RMM memory resource. StatisticsResourceAdaptor
+    is pushed onto the current RMM memory resource stack and must remain the
+    topmost resource throughout the statistics gathering.
+    """
+
+    mr = rmm.mr.get_current_device_resource()
+    if not isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        rmm.mr.set_current_device_resource(
+            rmm.mr.StatisticsResourceAdaptor(mr)
+        )
+
+
+@contextmanager
+def statistics():
+    """Context to enable allocation statistics temporarily.
+
+    Warning
+    -------
+    This modifies the current RMM memory resource. StatisticsResourceAdaptor
+    is pushed onto the current RMM memory resource stack when entering the
+    context and popped again when exiting. If statistics has been enabled for
+    the current RMM resource stack already, this is a no-op.
+
+    Raises
+    ------
+    ValueError
+        If the RMM memory source stack was changed while in the context.
+    """
+
+    # Save the current memory resource for later cleanup
+    prior_mr = rmm.mr.get_current_device_resource()
+    enable_statistics()
+    try:
+        current_mr = rmm.mr.get_current_device_resource()
+        yield
+    finally:
+        if current_mr is not rmm.mr.get_current_device_resource():
+            raise ValueError(
+                "RMM memory source stack was changed while in the context"
+            )
+        rmm.mr.set_current_device_resource(prior_mr)
diff --git a/python/rmm/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py
index 5fad81c79..e7e74eebc 100644
--- a/python/rmm/rmm/tests/conftest.py
+++ b/python/rmm/rmm/tests/conftest.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import pytest
 
 import rmm
@@ -12,10 +26,3 @@ def rmm_auto_reinitialize():
     # test
 
     rmm.reinitialize()
-
-
-@pytest.fixture
-def stats_mr():
-    mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
-    return mr
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index 76fb255d1..85764d285 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -649,138 +649,6 @@ def test_limiting_resource_adaptor(mr):
         rmm.DeviceBuffer(size=1)
 
 
-def test_statistics_resource_adaptor(stats_mr):
-
-    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
-
-    for i in range(9, 0, -2):
-        del buffers[i]
-
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 5040,
-        "current_count": 5,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 10080,
-        "total_count": 10,
-    }
-
-    # Push a new Tracking adaptor
-    mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
-    rmm.mr.set_current_device_resource(mr2)
-
-    for _ in range(2):
-        buffers.append(rmm.DeviceBuffer(size=1000))
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 2016,
-        "current_count": 2,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 7056,
-        "current_count": 7,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-
-    del buffers
-    gc.collect()
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-
-    # Test the counter stack
-    # push returns the stats from the top before the push
-    assert stats_mr.push_counters() == {  # stats from stack level 0
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-    b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == {  # stats from stack level 1
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 16,
-        "total_count": 1,
-    }
-    del b1
-    gc.collect()
-    # pop returns the stats from the top before the pop. Note, the bytes and
-    # count become negative since we are only deleted `b1` above.
-    assert stats_mr.pop_counters() == {  # stats from stack level 2
-        "current_bytes": -16,
-        "current_count": -1,
-        "peak_bytes": 0,
-        "peak_count": 0,
-        "total_bytes": 0,
-        "total_count": 0,
-    }
-    b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == {  # stats from stack level 1
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 32,
-        "total_count": 2,
-    }
-    b2 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.pop_counters() == {  # stats from stack level 2
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 16,
-        "total_count": 1,
-    }
-    assert stats_mr.pop_counters() == {  # stats from stack level 1
-        "current_bytes": 32,
-        "current_count": 2,
-        "peak_bytes": 32,
-        "peak_count": 2,
-        "total_bytes": 48,
-        "total_count": 3,
-    }
-    del b1
-    del b2
-    gc.collect()
-    assert stats_mr.allocation_counts == {  # stats from stack level 0
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12144,
-        "total_count": 15,
-    }
-    with pytest.raises(IndexError, match="cannot pop the last counter pair"):
-        stats_mr.pop_counters()
-
-
 def test_tracking_resource_adaptor():
     cuda_mr = rmm.mr.CudaMemoryResource()
 
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
new file mode 100644
index 000000000..b10d5b606
--- /dev/null
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import rmm.mr
+from rmm.statistics import statistics
+
+
+@pytest.fixture
+def stats_mr():
+    with statistics():
+        yield rmm.mr.get_current_device_resource()
+
+
+def test_context():
+    prior_mr = rmm.mr.get_current_device_resource()
+    with statistics():
+        assert isinstance(
+            rmm.mr.get_current_device_resource(),
+            rmm.mr.StatisticsResourceAdaptor,
+        )
+    assert rmm.mr.get_current_device_resource() is prior_mr
+
+
+def test_multiple_mr(stats_mr):
+    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
+
+    for i in range(9, 0, -2):
+        del buffers[i]
+
+    assert stats_mr.allocation_counts == {
+        "current_bytes": 5040,
+        "current_count": 5,
+        "peak_bytes": 10080,
+        "peak_count": 10,
+        "total_bytes": 10080,
+        "total_count": 10,
+    }
+
+    # Push a new Statistics adaptor
+    mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
+    rmm.mr.set_current_device_resource(mr2)
+
+    for _ in range(2):
+        buffers.append(rmm.DeviceBuffer(size=1000))
+
+    assert mr2.allocation_counts == {
+        "current_bytes": 2016,
+        "current_count": 2,
+        "peak_bytes": 2016,
+        "peak_count": 2,
+        "total_bytes": 2016,
+        "total_count": 2,
+    }
+    assert stats_mr.allocation_counts == {
+        "current_bytes": 7056,
+        "current_count": 7,
+        "peak_bytes": 10080,
+        "peak_count": 10,
+        "total_bytes": 12096,
+        "total_count": 12,
+    }
+
+    del buffers
+
+    assert mr2.allocation_counts == {
+        "current_bytes": 0,
+        "current_count": 0,
+        "peak_bytes": 2016,
+        "peak_count": 2,
+        "total_bytes": 2016,
+        "total_count": 2,
+    }
+    assert stats_mr.allocation_counts == {
+        "current_bytes": 0,
+        "current_count": 0,
+        "peak_bytes": 10080,
+        "peak_count": 10,
+        "total_bytes": 12096,
+        "total_count": 12,
+    }
+    rmm.mr.set_current_device_resource(stats_mr)
+
+
+def test_counter_stack(stats_mr):
+    buffers = [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    # push returns the stats from the top before the push
+    assert stats_mr.push_counters() == {  # stats from stack level 0
+        "current_bytes": 160,
+        "current_count": 10,
+        "peak_bytes": 160,
+        "peak_count": 10,
+        "total_bytes": 160,
+        "total_count": 10,
+    }
+    b1 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.push_counters() == {  # stats from stack level 1
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 16,
+        "total_count": 1,
+    }
+    del b1
+    # pop returns the stats from the top before the pop.
+    # Note, the bytes and counts can be negative
+    assert stats_mr.pop_counters() == {  # stats from stack level 2
+        "current_bytes": -16,
+        "current_count": -1,
+        "peak_bytes": 0,
+        "peak_count": 0,
+        "total_bytes": 0,
+        "total_count": 0,
+    }
+    b1 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.push_counters() == {  # stats from stack level 1
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 32,
+        "total_count": 2,
+    }
+    b2 = rmm.DeviceBuffer(size=10)
+    assert stats_mr.pop_counters() == {  # stats from stack level 2
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 16,
+        "total_count": 1,
+    }
+    assert stats_mr.pop_counters() == {  # stats from stack level 1
+        "current_bytes": 32,
+        "current_count": 2,
+        "peak_bytes": 32,
+        "peak_count": 2,
+        "total_bytes": 48,
+        "total_count": 3,
+    }
+    del b1
+    del b2
+    assert stats_mr.allocation_counts == {  # stats from stack level 0
+        "current_bytes": 160,
+        "current_count": 10,
+        "peak_bytes": 192,
+        "peak_count": 12,
+        "total_bytes": 208,
+        "total_count": 13,
+    }
+    del buffers
+    with pytest.raises(IndexError, match="cannot pop the last counter pair"):
+        stats_mr.pop_counters()

From b9d57dba6534e056458fe87d4e930ca71bf0f366 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 17 May 2024 10:35:22 +0200
Subject: [PATCH 08/43] current allocation statistics

---
 python/rmm/rmm/statistics.py            | 55 +++++++++++++++++++++++++
 python/rmm/rmm/tests/test_statistics.py | 54 +++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 7defaafb5..8b4eb1aa8 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from contextlib import contextmanager
+from typing import Dict, Optional
 
 import rmm.mr
 
@@ -66,3 +67,57 @@ def statistics():
                 "RMM memory source stack was changed while in the context"
             )
         rmm.mr.set_current_device_resource(prior_mr)
+
+
+def get_statistics() -> Optional[Dict[str, int]]:
+    """Get the current allocation statistics
+
+    Return
+    ------
+    If enabled, returns the current tracked statistics.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.allocation_counts
+    return None
+
+
+def push_statistics() -> Optional[Dict[str, int]]:
+    """Push new counters on the current allocation statistics stack
+
+    This returns the current tracked statistics and push a new set
+    of zero counters on the stack of statistics.
+
+    If statistics are disabled (the current memory resource is not an
+    instance of `StatisticsResourceAdaptor`), this function is a no-op.
+
+    Return
+    ------
+    If enabled, returns the current tracked statistics _before_ the push.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.push_counters()
+    return None
+
+
+def pop_statistics() -> Optional[Dict[str, int]]:
+    """Pop the counters of the current allocation statistics stack
+
+    This returns the counters of current tracked statistics and pops
+    them from the stack.
+
+    If statistics are disabled (the current memory resource is not an
+    instance of `StatisticsResourceAdaptor`), this function is a no-op.
+
+    Return
+    ------
+    If enabled, returns the popped counters.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.pop_counters()
+    return None
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index b10d5b606..5c36fd743 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -15,7 +15,12 @@
 import pytest
 
 import rmm.mr
-from rmm.statistics import statistics
+from rmm.statistics import (
+    get_statistics,
+    pop_statistics,
+    push_statistics,
+    statistics,
+)
 
 
 @pytest.fixture
@@ -116,7 +121,7 @@ def test_counter_stack(stats_mr):
         "total_count": 1,
     }
     del b1
-    # pop returns the stats from the top before the pop.
+    # pop returns the popped stats
     # Note, the bytes and counts can be negative
     assert stats_mr.pop_counters() == {  # stats from stack level 2
         "current_bytes": -16,
@@ -165,3 +170,48 @@ def test_counter_stack(stats_mr):
     del buffers
     with pytest.raises(IndexError, match="cannot pop the last counter pair"):
         stats_mr.pop_counters()
+
+
+def test_current_statistics(stats_mr):
+    b1 = rmm.DeviceBuffer(size=10)
+    assert get_statistics() == {
+        "current_bytes": 16,
+        "current_count": 1,
+        "peak_bytes": 16,
+        "peak_count": 1,
+        "total_bytes": 16,
+        "total_count": 1,
+    }
+    b2 = rmm.DeviceBuffer(size=20)
+    assert push_statistics() == {
+        "current_bytes": 48,
+        "current_count": 2,
+        "peak_bytes": 48,
+        "peak_count": 2,
+        "total_bytes": 48,
+        "total_count": 2,
+    }
+    del b1
+    assert pop_statistics() == {
+        "current_bytes": -16,
+        "current_count": -1,
+        "peak_bytes": 0,
+        "peak_count": 0,
+        "total_bytes": 0,
+        "total_count": 0,
+    }
+    del b2
+    assert get_statistics() == {
+        "current_bytes": 0,
+        "current_count": 0,
+        "peak_bytes": 48,
+        "peak_count": 2,
+        "total_bytes": 48,
+        "total_count": 2,
+    }
+
+
+def test_statistics_disabled():
+    assert get_statistics() is None
+    assert push_statistics() is None
+    assert get_statistics() is None

From 0e2f19c258a0214e4ce117236f8fd231180fffa0 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 17 May 2024 10:43:16 +0200
Subject: [PATCH 09/43] clean up

---
 .../mr/device/statistics_resource_adaptor.hpp |  1 -
 python/rmm/rmm/tests/test_statistics.py       | 79 +++++++++----------
 2 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index b135ce0ae..1b4252cef 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -99,7 +99,6 @@ class statistics_resource_adaptor final : public device_memory_resource {
      */
     counter& operator+=(const counter& val)
     {
-      // We count the peak from value
       peak = std::max(value + val.peak, peak);
       value += val.value;
       total += val.total;
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 5c36fd743..f8451dbe0 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -57,46 +57,45 @@ def test_multiple_mr(stats_mr):
     # Push a new Tracking adaptor
     mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
     rmm.mr.set_current_device_resource(mr2)
-
-    for _ in range(2):
-        buffers.append(rmm.DeviceBuffer(size=1000))
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 2016,
-        "current_count": 2,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 7056,
-        "current_count": 7,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-
-    del buffers
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-    rmm.mr.set_current_device_resource(stats_mr)
+    try:
+        for _ in range(2):
+            buffers.append(rmm.DeviceBuffer(size=1000))
+
+        assert mr2.allocation_counts == {
+            "current_bytes": 2016,
+            "current_count": 2,
+            "peak_bytes": 2016,
+            "peak_count": 2,
+            "total_bytes": 2016,
+            "total_count": 2,
+        }
+        assert stats_mr.allocation_counts == {
+            "current_bytes": 7056,
+            "current_count": 7,
+            "peak_bytes": 10080,
+            "peak_count": 10,
+            "total_bytes": 12096,
+            "total_count": 12,
+        }
+        del buffers
+        assert mr2.allocation_counts == {
+            "current_bytes": 0,
+            "current_count": 0,
+            "peak_bytes": 2016,
+            "peak_count": 2,
+            "total_bytes": 2016,
+            "total_count": 2,
+        }
+        assert stats_mr.allocation_counts == {
+            "current_bytes": 0,
+            "current_count": 0,
+            "peak_bytes": 10080,
+            "peak_count": 10,
+            "total_bytes": 12096,
+            "total_count": 12,
+        }
+    finally:
+        rmm.mr.set_current_device_resource(stats_mr)
 
 
 def test_counter_stack(stats_mr):

From 7f7f9409dce3e323a9fb9862495504507c6c5571 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 17 May 2024 11:25:03 +0200
Subject: [PATCH 10/43] Context to enable allocation statistics

---
 python/rmm/rmm/statistics.py            | 76 ++++++++++++++-----------
 python/rmm/rmm/tests/test_statistics.py | 37 +++++++++++-
 2 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 8b4eb1aa8..2cab02c9d 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -38,37 +38,6 @@ def enable_statistics() -> None:
         )
 
 
-@contextmanager
-def statistics():
-    """Context to enable allocation statistics temporarily.
-
-    Warning
-    -------
-    This modifies the current RMM memory resource. StatisticsResourceAdaptor
-    is pushed onto the current RMM memory resource stack when entering the
-    context and popped again when exiting. If statistics has been enabled for
-    the current RMM resource stack already, this is a no-op.
-
-    Raises
-    ------
-    ValueError
-        If the RMM memory source stack was changed while in the context.
-    """
-
-    # Save the current memory resource for later cleanup
-    prior_mr = rmm.mr.get_current_device_resource()
-    enable_statistics()
-    try:
-        current_mr = rmm.mr.get_current_device_resource()
-        yield
-    finally:
-        if current_mr is not rmm.mr.get_current_device_resource():
-            raise ValueError(
-                "RMM memory source stack was changed while in the context"
-            )
-        rmm.mr.set_current_device_resource(prior_mr)
-
-
 def get_statistics() -> Optional[Dict[str, int]]:
     """Get the current allocation statistics
 
@@ -90,7 +59,7 @@ def push_statistics() -> Optional[Dict[str, int]]:
     of zero counters on the stack of statistics.
 
     If statistics are disabled (the current memory resource is not an
-    instance of `StatisticsResourceAdaptor`), this function is a no-op.
+    instance of StatisticsResourceAdaptor), this function is a no-op.
 
     Return
     ------
@@ -110,7 +79,7 @@ def pop_statistics() -> Optional[Dict[str, int]]:
     them from the stack.
 
     If statistics are disabled (the current memory resource is not an
-    instance of `StatisticsResourceAdaptor`), this function is a no-op.
+    instance of StatisticsResourceAdaptor), this function is a no-op.
 
     Return
     ------
@@ -121,3 +90,44 @@ def pop_statistics() -> Optional[Dict[str, int]]:
     if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
         return mr.pop_counters()
     return None
+
+
+@contextmanager
+def statistics():
+    """Context to enable allocation statistics.
+
+    If statistics have been enabled already (the current memory resource is an
+    instance of StatisticsResourceAdaptor), new counters are pushed on the
+    current allocation statistics stack when entering the context and popped
+    again when exiting using `push_statistics()` and `pop_statistics()`.
+
+    If statistics have not been enabled, StatisticsResourceAdaptor is set as
+    the current RMM memory resource when entering the context and removed
+    again when exiting.
+
+    Raises
+    ------
+    ValueError
+        If the current RMM memory resource was changed while in the context.
+    """
+
+    if push_statistics() is None:
+        # Save the current non-statistics memory resource for later cleanup
+        prior_non_stats_mr = rmm.mr.get_current_device_resource()
+        enable_statistics()
+    else:
+        prior_non_stats_mr = None
+
+    try:
+        current_mr = rmm.mr.get_current_device_resource()
+        yield
+    finally:
+        if current_mr is not rmm.mr.get_current_device_resource():
+            raise ValueError(
+                "RMM memory source stack was changed "
+                "while in the statistics context"
+            )
+        if prior_non_stats_mr is None:
+            pop_statistics()
+        else:
+            rmm.mr.set_current_device_resource(prior_non_stats_mr)
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index f8451dbe0..e4ab76e1d 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -30,13 +30,46 @@ def stats_mr():
 
 
 def test_context():
-    prior_mr = rmm.mr.get_current_device_resource()
+    mr0 = rmm.mr.get_current_device_resource()
+    assert get_statistics() is None
     with statistics():
+        mr1 = rmm.mr.get_current_device_resource()
         assert isinstance(
             rmm.mr.get_current_device_resource(),
             rmm.mr.StatisticsResourceAdaptor,
         )
-    assert rmm.mr.get_current_device_resource() is prior_mr
+        b1 = rmm.DeviceBuffer(size=20)
+        assert get_statistics() == {
+            "current_bytes": 32,
+            "current_count": 1,
+            "peak_bytes": 32,
+            "peak_count": 1,
+            "total_bytes": 32,
+            "total_count": 1,
+        }
+        with statistics():
+            mr2 = rmm.mr.get_current_device_resource()
+            assert mr1 is mr2
+            b2 = rmm.DeviceBuffer(size=10)
+            assert get_statistics() == {
+                "current_bytes": 16,
+                "current_count": 1,
+                "peak_bytes": 16,
+                "peak_count": 1,
+                "total_bytes": 16,
+                "total_count": 1,
+            }
+        assert get_statistics() == {
+            "current_bytes": 48,
+            "current_count": 2,
+            "peak_bytes": 48,
+            "peak_count": 2,
+            "total_bytes": 48,
+            "total_count": 2,
+        }
+        del b1
+        del b2
+    assert rmm.mr.get_current_device_resource() is mr0
 
 
 def test_multiple_mr(stats_mr):

From 68fcd081eefdbfbde3bf23e399b5acef203671b8 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 23 May 2024 08:41:33 +0200
Subject: [PATCH 11/43] Apply suggestions from code review

Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
---
 python/rmm/rmm/_lib/memory_resource.pyx | 2 +-
 python/rmm/rmm/statistics.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index f9c7ef8b6..e5e267144 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -793,7 +793,7 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         allocations/deallocations performed by an upstream memory resource.
         Includes the ability to query these statistics at any time.
 
-        A stack of counters is maintained, use `.push_counters()` and
+        A stack of counters is maintained. Use `.push_counters()` and
         `.pop_counters()` to track statistics at different nesting levels.
 
         Parameters
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 2cab02c9d..ba5860256 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -55,7 +55,7 @@ def get_statistics() -> Optional[Dict[str, int]]:
 def push_statistics() -> Optional[Dict[str, int]]:
     """Push new counters on the current allocation statistics stack
 
-    This returns the current tracked statistics and push a new set
+    This returns the current tracked statistics and pushes a new set
     of zero counters on the stack of statistics.
 
     If statistics are disabled (the current memory resource is not an

From 0254b73d53a79f47270f36697418b36dcd9e6707 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 23 May 2024 08:41:55 +0200
Subject: [PATCH 12/43] doc

---
 include/rmm/mr/device/statistics_resource_adaptor.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index 1b4252cef..713304263 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -175,13 +175,12 @@ class statistics_resource_adaptor final : public device_memory_resource {
    * @brief Push a pair of zero counters on the stack, which becomes the new
    * counters returned by `get_bytes_counter()` and `get_allocations_counter()`
    *
-   * @return pair of counters <bytes, allocations> from the stack _before_ the push
+   * @return top pair of counters <bytes, allocations> from the stack _before_
+   * the push
    */
   std::pair<counter, counter> push_counters()
   {
     write_lock_t lock(mtx_);
-    // auto [bytes, allocations] = counter_stack_.top();
-    // bytes.
     auto ret = counter_stack_.top();
     counter_stack_.push(std::make_pair(counter{}, counter{}));
     return ret;
@@ -190,7 +189,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
   /**
    * @brief Pop a pair of counters from the stack
    *
-   * @return pair of counters <bytes, allocations> from the stack _before_ the pop
+   * @return top pair of counters <bytes, allocations> from the stack _before_
+   * the pop
    */
   std::pair<counter, counter> pop_counters()
   {

From e6dd682fa262864e93e8f11ca76247790c644d66 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 23 May 2024 09:43:17 +0200
Subject: [PATCH 13/43] add_counters_from_tracked_sub_block

---
 .../mr/device/statistics_resource_adaptor.hpp | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index 713304263..7819d688d 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -38,8 +38,12 @@ namespace rmm::mr {
  * allocations will be untracked. Tracking statistics stores the current, peak
  * and total memory allocations for both the number of bytes and number of calls
  * to the memory resource.
- * A stack of counters is maintained, use `.push_counters()` and `.pop_counters()`
- * to track statistics at different nesting levels.
+ *
+ * This resource supports nested statistics, which makes it possible to track statistics
+ * of a code block. Use `.push_counters()` to start tracking statistics on a code block
+ * and use `.pop_counters()` to stop the tracking. The nested statistics are cascading
+ * such that the statistics tracked by a code block include the statistics tracked in
+ * all its tracked sub code blocks.
  *
  * `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be
  * used in performance-sensitive code.
@@ -91,18 +95,19 @@ class statistics_resource_adaptor final : public device_memory_resource {
     /**
      * @brief Add `val` to the current value and update the peak value if necessary
      *
-     * @note When updating the peak value, we assume that `val` is the inner counter of
-     * `this` on the counter stack so its peak value becomes `this->value + val.peak`.
+     * When updating the peak value, we assume that `val` is tracking a code block inside the
+     * code block tracked by `this`. Because nested statistics are cascading, we have to convert
+     * `val.peak` to the peak it would have been if it was part of the statistics tracked by `this`.
+     * We do this by adding the current value that was active when `val` started tracking such that
+     * we get `std::max(value + val.peak, peak)`.
      *
      * @param val Value to add
-     * @return Reference to this object
      */
-    counter& operator+=(const counter& val)
+    void add_counters_from_tracked_sub_block(const counter& val)
     {
       peak = std::max(value + val.peak, peak);
       value += val.value;
       total += val.total;
-      return *this;
     }
   };
 
@@ -198,9 +203,9 @@ class statistics_resource_adaptor final : public device_memory_resource {
     if (counter_stack_.size() < 2) { throw std::out_of_range("cannot pop the last counter pair"); }
     auto ret = counter_stack_.top();
     counter_stack_.pop();
-    // The new top inherits the statistics
-    counter_stack_.top().first += ret.first;
-    counter_stack_.top().second += ret.second;
+    // Update the new top pair of counters
+    counter_stack_.top().first.add_counters_from_tracked_sub_block(ret.first);
+    counter_stack_.top().second.add_counters_from_tracked_sub_block(ret.second);
     return ret;
   }
 

From bf49dab1b9114880e85b410843b4c96f5cfb18eb Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 23 May 2024 12:25:34 +0200
Subject: [PATCH 14/43] c++ tests

---
 tests/mr/device/statistics_mr_tests.cpp | 44 ++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp
index 8fd12f49b..6c5700f0b 100644
--- a/tests/mr/device/statistics_mr_tests.cpp
+++ b/tests/mr/device/statistics_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,5 +234,47 @@ TEST(StatisticsTest, NegativeInnerTracking)
   EXPECT_EQ(inner_mr.get_allocations_counter().total, 5);
 }
 
+TEST(StatisticsTest, Nested)
+{
+  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  void* a0 = mr.allocate(ten_MiB);
+  EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB);
+  EXPECT_EQ(mr.get_allocations_counter().value, 1);
+  {
+    auto [bytes, allocs] = mr.push_counters();
+    EXPECT_EQ(bytes.value, ten_MiB);
+    EXPECT_EQ(allocs.value, 1);
+  }
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a1 = mr.allocate(ten_MiB);
+  mr.push_counters();
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a2 = mr.allocate(ten_MiB);
+  mr.deallocate(a2, ten_MiB);
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_bytes_counter().peak, ten_MiB);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().peak, 1);
+  {
+    auto [bytes, allocs] = mr.pop_counters();
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB);
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 1);
+  }
+  mr.deallocate(a0, ten_MiB);
+  {
+    auto [bytes, allocs] = mr.pop_counters();
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB * 2);
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 2);
+  }
+  mr.deallocate(a1, ten_MiB);
+  EXPECT_THROW(mr.pop_counters(), std::out_of_range);
+}
+
 }  // namespace
 }  // namespace rmm::test

From 04b39cb6e3831a58bb60da6823c0dab42d66ad5e Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 10:27:54 +0200
Subject: [PATCH 15/43] use dataclass Statistics

---
 python/rmm/rmm/_lib/memory_resource.pyx |  55 ++---
 python/rmm/rmm/statistics.py            |  37 ++-
 python/rmm/rmm/tests/test_statistics.py | 308 ++++++++++++------------
 3 files changed, 217 insertions(+), 183 deletions(-)

diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index e5e267144..f7b18d3d1 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -34,6 +34,7 @@ from cuda.cudart import cudaError_t
 from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
 from rmm._cuda.stream cimport Stream
 from rmm._cuda.stream import DEFAULT_STREAM
+from rmm.statistics import Statistics
 from rmm._lib.cuda_stream_view cimport cuda_stream_view
 from rmm._lib.per_device_resource cimport (
     cuda_device_id,
@@ -804,7 +805,7 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         pass
 
     @property
-    def allocation_counts(self) -> dict:
+    def allocation_counts(self) -> Statistics:
         """
         Gets the current, peak, and total allocated bytes and number of
         allocations.
@@ -820,16 +821,16 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
 
         counts = deref(mr).get_allocations_counter()
         byte_counts = deref(mr).get_bytes_counter()
-        return {
-            "current_bytes": byte_counts.value,
-            "current_count": counts.value,
-            "peak_bytes": byte_counts.peak,
-            "peak_count": counts.peak,
-            "total_bytes": byte_counts.total,
-            "total_count": counts.total,
-        }
+        return Statistics(
+            current_bytes=byte_counts.value,
+            current_count=counts.value,
+            peak_bytes=byte_counts.peak,
+            peak_count=counts.peak,
+            total_bytes=byte_counts.total,
+            total_count=counts.total,
+        )
 
-    def pop_counters(self) -> dict:
+    def pop_counters(self) -> Statistics:
         """
         Pop a counter pair (bytes and allocations) from the stack
         """
@@ -837,16 +838,16 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
             <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
 
         bytes_and_allocs = deref(mr).pop_counters()
-        return {
-            "current_bytes": bytes_and_allocs.first.value,
-            "current_count": bytes_and_allocs.second.value,
-            "peak_bytes": bytes_and_allocs.first.peak,
-            "peak_count": bytes_and_allocs.second.peak,
-            "total_bytes": bytes_and_allocs.first.total,
-            "total_count": bytes_and_allocs.second.total,
-        }
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
 
-    def push_counters(self) -> dict:
+    def push_counters(self) -> Statistics:
         """
         Push a new counter pair (bytes and allocations) on the stack
         """
@@ -855,14 +856,14 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
             <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
 
         bytes_and_allocs = deref(mr).push_counters()
-        return {
-            "current_bytes": bytes_and_allocs.first.value,
-            "current_count": bytes_and_allocs.second.value,
-            "peak_bytes": bytes_and_allocs.first.peak,
-            "peak_count": bytes_and_allocs.second.peak,
-            "total_bytes": bytes_and_allocs.first.total,
-            "total_count": bytes_and_allocs.second.total,
-        }
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
 
 cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
 
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index ba5860256..3bfad94c5 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -13,11 +13,40 @@
 # limitations under the License.
 
 from contextlib import contextmanager
-from typing import Dict, Optional
+from dataclasses import dataclass
+from typing import Optional
 
 import rmm.mr
 
 
+@dataclass
+class Statistics:
+    """Statistics returned by `{get,push,pop}_statistics()`
+
+    Attributes
+    ----------
+    current_bytes
+        Current number of bytes allocated
+    current_count
+        Current number of allocations
+    peak_bytes
+        Peak number of bytes allocated
+    peak_count
+        Peak number of allocations
+    total_bytes
+        Total number of bytes allocated
+    total_count
+        Total number of allocations
+    """
+
+    current_bytes: int
+    current_count: int
+    peak_bytes: int
+    peak_count: int
+    total_bytes: int
+    total_count: int
+
+
 def enable_statistics() -> None:
     """Enable allocation statistics
 
@@ -38,7 +67,7 @@ def enable_statistics() -> None:
         )
 
 
-def get_statistics() -> Optional[Dict[str, int]]:
+def get_statistics() -> Optional[Statistics]:
     """Get the current allocation statistics
 
     Return
@@ -52,7 +81,7 @@ def get_statistics() -> Optional[Dict[str, int]]:
     return None
 
 
-def push_statistics() -> Optional[Dict[str, int]]:
+def push_statistics() -> Optional[Statistics]:
     """Push new counters on the current allocation statistics stack
 
     This returns the current tracked statistics and pushes a new set
@@ -72,7 +101,7 @@ def push_statistics() -> Optional[Dict[str, int]]:
     return None
 
 
-def pop_statistics() -> Optional[Dict[str, int]]:
+def pop_statistics() -> Optional[Statistics]:
     """Pop the counters of the current allocation statistics stack
 
     This returns the counters of current tracked statistics and pops
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index e4ab76e1d..af6f2ece9 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -16,6 +16,7 @@
 
 import rmm.mr
 from rmm.statistics import (
+    Statistics,
     get_statistics,
     pop_statistics,
     push_statistics,
@@ -39,34 +40,34 @@ def test_context():
             rmm.mr.StatisticsResourceAdaptor,
         )
         b1 = rmm.DeviceBuffer(size=20)
-        assert get_statistics() == {
-            "current_bytes": 32,
-            "current_count": 1,
-            "peak_bytes": 32,
-            "peak_count": 1,
-            "total_bytes": 32,
-            "total_count": 1,
-        }
+        assert get_statistics() == Statistics(
+            current_bytes=32,
+            current_count=1,
+            peak_bytes=32,
+            peak_count=1,
+            total_bytes=32,
+            total_count=1,
+        )
         with statistics():
             mr2 = rmm.mr.get_current_device_resource()
             assert mr1 is mr2
             b2 = rmm.DeviceBuffer(size=10)
-            assert get_statistics() == {
-                "current_bytes": 16,
-                "current_count": 1,
-                "peak_bytes": 16,
-                "peak_count": 1,
-                "total_bytes": 16,
-                "total_count": 1,
-            }
-        assert get_statistics() == {
-            "current_bytes": 48,
-            "current_count": 2,
-            "peak_bytes": 48,
-            "peak_count": 2,
-            "total_bytes": 48,
-            "total_count": 2,
-        }
+            assert get_statistics() == Statistics(
+                current_bytes=16,
+                current_count=1,
+                peak_bytes=16,
+                peak_count=1,
+                total_bytes=16,
+                total_count=1,
+            )
+        assert get_statistics() == Statistics(
+            current_bytes=48,
+            current_count=2,
+            peak_bytes=48,
+            peak_count=2,
+            total_bytes=48,
+            total_count=2,
+        )
         del b1
         del b2
     assert rmm.mr.get_current_device_resource() is mr0
@@ -78,14 +79,14 @@ def test_multiple_mr(stats_mr):
     for i in range(9, 0, -2):
         del buffers[i]
 
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 5040,
-        "current_count": 5,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 10080,
-        "total_count": 10,
-    }
+    assert stats_mr.allocation_counts == Statistics(
+        current_bytes=5040,
+        current_count=5,
+        peak_bytes=10080,
+        peak_count=10,
+        total_bytes=10080,
+        total_count=10,
+    )
 
     # Push a new Tracking adaptor
     mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
@@ -94,39 +95,39 @@ def test_multiple_mr(stats_mr):
         for _ in range(2):
             buffers.append(rmm.DeviceBuffer(size=1000))
 
-        assert mr2.allocation_counts == {
-            "current_bytes": 2016,
-            "current_count": 2,
-            "peak_bytes": 2016,
-            "peak_count": 2,
-            "total_bytes": 2016,
-            "total_count": 2,
-        }
-        assert stats_mr.allocation_counts == {
-            "current_bytes": 7056,
-            "current_count": 7,
-            "peak_bytes": 10080,
-            "peak_count": 10,
-            "total_bytes": 12096,
-            "total_count": 12,
-        }
+        assert mr2.allocation_counts == Statistics(
+            current_bytes=2016,
+            current_count=2,
+            peak_bytes=2016,
+            peak_count=2,
+            total_bytes=2016,
+            total_count=2,
+        )
+        assert stats_mr.allocation_counts == Statistics(
+            current_bytes=7056,
+            current_count=7,
+            peak_bytes=10080,
+            peak_count=10,
+            total_bytes=12096,
+            total_count=12,
+        )
         del buffers
-        assert mr2.allocation_counts == {
-            "current_bytes": 0,
-            "current_count": 0,
-            "peak_bytes": 2016,
-            "peak_count": 2,
-            "total_bytes": 2016,
-            "total_count": 2,
-        }
-        assert stats_mr.allocation_counts == {
-            "current_bytes": 0,
-            "current_count": 0,
-            "peak_bytes": 10080,
-            "peak_count": 10,
-            "total_bytes": 12096,
-            "total_count": 12,
-        }
+        assert mr2.allocation_counts == Statistics(
+            current_bytes=0,
+            current_count=0,
+            peak_bytes=2016,
+            peak_count=2,
+            total_bytes=2016,
+            total_count=2,
+        )
+        assert stats_mr.allocation_counts == Statistics(
+            current_bytes=0,
+            current_count=0,
+            peak_bytes=10080,
+            peak_count=10,
+            total_bytes=12096,
+            total_count=12,
+        )
     finally:
         rmm.mr.set_current_device_resource(stats_mr)
 
@@ -135,70 +136,73 @@ def test_counter_stack(stats_mr):
     buffers = [rmm.DeviceBuffer(size=10) for _ in range(10)]
 
     # push returns the stats from the top before the push
-    assert stats_mr.push_counters() == {  # stats from stack level 0
-        "current_bytes": 160,
-        "current_count": 10,
-        "peak_bytes": 160,
-        "peak_count": 10,
-        "total_bytes": 160,
-        "total_count": 10,
-    }
+    assert stats_mr.push_counters() == Statistics(  # stats from stack level 0
+        current_bytes=160,
+        current_count=10,
+        peak_bytes=160,
+        peak_count=10,
+        total_bytes=160,
+        total_count=10,
+    )
     b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == {  # stats from stack level 1
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 16,
-        "total_count": 1,
-    }
+    assert stats_mr.push_counters() == Statistics(  # stats from stack level 1
+        current_bytes=16,
+        current_count=1,
+        peak_bytes=16,
+        peak_count=1,
+        total_bytes=16,
+        total_count=1,
+    )
     del b1
     # pop returns the popped stats
     # Note, the bytes and counts can be negative
-    assert stats_mr.pop_counters() == {  # stats from stack level 2
-        "current_bytes": -16,
-        "current_count": -1,
-        "peak_bytes": 0,
-        "peak_count": 0,
-        "total_bytes": 0,
-        "total_count": 0,
-    }
+    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 2
+        current_bytes=-16,
+        current_count=-1,
+        peak_bytes=0,
+        peak_count=0,
+        total_bytes=0,
+        total_count=0,
+    )
     b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == {  # stats from stack level 1
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 32,
-        "total_count": 2,
-    }
+    assert stats_mr.push_counters() == Statistics(  # stats from stack level 1
+        current_bytes=16,
+        current_count=1,
+        peak_bytes=16,
+        peak_count=1,
+        total_bytes=32,
+        total_count=2,
+    )
     b2 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.pop_counters() == {  # stats from stack level 2
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 16,
-        "total_count": 1,
-    }
-    assert stats_mr.pop_counters() == {  # stats from stack level 1
-        "current_bytes": 32,
-        "current_count": 2,
-        "peak_bytes": 32,
-        "peak_count": 2,
-        "total_bytes": 48,
-        "total_count": 3,
-    }
+    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 2
+        current_bytes=16,
+        current_count=1,
+        peak_bytes=16,
+        peak_count=1,
+        total_bytes=16,
+        total_count=1,
+    )
+    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 1
+        current_bytes=32,
+        current_count=2,
+        peak_bytes=32,
+        peak_count=2,
+        total_bytes=48,
+        total_count=3,
+    )
     del b1
     del b2
-    assert stats_mr.allocation_counts == {  # stats from stack level 0
-        "current_bytes": 160,
-        "current_count": 10,
-        "peak_bytes": 192,
-        "peak_count": 12,
-        "total_bytes": 208,
-        "total_count": 13,
-    }
+    assert (
+        stats_mr.allocation_counts
+        == Statistics(  # stats from stack level 0
+            current_bytes=160,
+            current_count=10,
+            peak_bytes=192,
+            peak_count=12,
+            total_bytes=208,
+            total_count=13,
+        )
+    )
     del buffers
     with pytest.raises(IndexError, match="cannot pop the last counter pair"):
         stats_mr.pop_counters()
@@ -206,41 +210,41 @@ def test_counter_stack(stats_mr):
 
 def test_current_statistics(stats_mr):
     b1 = rmm.DeviceBuffer(size=10)
-    assert get_statistics() == {
-        "current_bytes": 16,
-        "current_count": 1,
-        "peak_bytes": 16,
-        "peak_count": 1,
-        "total_bytes": 16,
-        "total_count": 1,
-    }
+    assert get_statistics() == Statistics(
+        current_bytes=16,
+        current_count=1,
+        peak_bytes=16,
+        peak_count=1,
+        total_bytes=16,
+        total_count=1,
+    )
     b2 = rmm.DeviceBuffer(size=20)
-    assert push_statistics() == {
-        "current_bytes": 48,
-        "current_count": 2,
-        "peak_bytes": 48,
-        "peak_count": 2,
-        "total_bytes": 48,
-        "total_count": 2,
-    }
+    assert push_statistics() == Statistics(
+        current_bytes=48,
+        current_count=2,
+        peak_bytes=48,
+        peak_count=2,
+        total_bytes=48,
+        total_count=2,
+    )
     del b1
-    assert pop_statistics() == {
-        "current_bytes": -16,
-        "current_count": -1,
-        "peak_bytes": 0,
-        "peak_count": 0,
-        "total_bytes": 0,
-        "total_count": 0,
-    }
+    assert pop_statistics() == Statistics(
+        current_bytes=-16,
+        current_count=-1,
+        peak_bytes=0,
+        peak_count=0,
+        total_bytes=0,
+        total_count=0,
+    )
     del b2
-    assert get_statistics() == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 48,
-        "peak_count": 2,
-        "total_bytes": 48,
-        "total_count": 2,
-    }
+    assert get_statistics() == Statistics(
+        current_bytes=0,
+        current_count=0,
+        peak_bytes=48,
+        peak_count=2,
+        total_bytes=48,
+        total_count=2,
+    )
 
 
 def test_statistics_disabled():

From 1dbd022b61e23eaed3ee4b5f84e967a037965ade Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 14:51:45 +0200
Subject: [PATCH 16/43] memory profiler

---
 python/rmm/rmm/statistics.py            | 117 +++++++++++++++++++++++-
 python/rmm/rmm/tests/test_statistics.py |  50 ++++++++++
 2 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 3bfad94c5..731de4d9a 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Optional
+from functools import wraps
+from typing import Dict, Literal, Optional
 
 import rmm.mr
 
@@ -160,3 +162,116 @@ def statistics():
             pop_statistics()
         else:
             rmm.mr.set_current_device_resource(prior_non_stats_mr)
+
+
+class ProfilerRecords:
+    """Records of the memory statistics recorded by a profiler"""
+
+    @dataclass
+    class Data:
+        """Memory statistics of a single function"""
+
+        num_calls: int = 0
+        memory_total: int = 0
+        memory_peak: int = 0
+
+        def add(self, memory_total: int, memory_peak: int):
+            self.num_calls += 1
+            self.memory_total += memory_total
+            self.memory_peak = max(self.memory_peak, memory_peak)
+
+    def __init__(self) -> None:
+        self._records: Dict[str, ProfilerRecords.Data] = defaultdict(
+            ProfilerRecords.Data
+        )
+
+    def add(self, name: str, data: Statistics) -> None:
+        """Add memory statistics of the function named `name`
+
+        Parameters
+        ----------
+        name
+            Name of the function
+        data
+            Memory statistics of `name`
+        """
+        self._records[name].add(
+            memory_total=data.current_bytes, memory_peak=data.peak_bytes
+        )
+
+    @property
+    def records(self) -> Dict[str, Data]:
+        """Dictionary mapping function names to their memory statistics"""
+        return dict(self._records)
+
+    def pretty_print(
+        self,
+        ordered_by: Literal[
+            "num_calls", "memory_peak", "memory_total"
+        ] = "memory_peak",
+    ) -> str:
+        """Pretty format the recorded memory statistics
+
+        Parameters
+        ----------
+        ordered_by
+            Sort the statistics by this attribute.
+
+        Returns
+        -------
+        The pretty formatted string of the memory statistics
+        """
+
+        # Sort by `ordered_by`
+        records = sorted(
+            ((name, data) for name, data in self.records.items()),
+            key=lambda x: getattr(x[1], ordered_by),
+            reverse=True,
+        )
+        ret = "Memory Profiling\n"
+        ret += "================\n\n"
+        if len(records) == 0:
+            return ret + "No data, maybe profiling wasn't enabled?"
+        ret += f"Ordered by: {ordered_by}\n\n"
+        ret += "ncalls     memory_peak    memory_total  filename\n"
+        for name, data in records:
+            ret += f"{data.num_calls:6,d} {data.memory_peak:15,d} "
+            ret += f"{data.memory_total:15,d}  {name}\n"
+        return ret[:-1]  # Remove the final newline
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.records})"
+
+    def __str__(self) -> str:
+        return self.pretty_print()
+
+
+def profiler(profiler_records: ProfilerRecords):
+    """Decorator to memory profile function
+
+    If statistics are enabled (the current memory resource is not an
+    instance of StatisticsResourceAdaptor), this decorator records the
+    memory statistics of the decorated function.
+
+    If statistics are disabled, this decorator is a no-op.
+
+    Parameters
+    ----------
+    profiler_records
+        The profiler records that the memory statistics are written to.
+    """
+
+    def f(func: callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                push_statistics()
+                ret = func(*args, **kwargs)
+            finally:
+                if (stats := pop_statistics()) is not None:
+                    profiler_records.add(name=func.__qualname__, data=stats)
+                return ret
+
+        return wrapper
+
+    return f
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index af6f2ece9..7a5299945 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -16,9 +16,11 @@
 
 import rmm.mr
 from rmm.statistics import (
+    ProfilerRecords,
     Statistics,
     get_statistics,
     pop_statistics,
+    profiler,
     push_statistics,
     statistics,
 )
@@ -251,3 +253,51 @@ def test_statistics_disabled():
     assert get_statistics() is None
     assert push_statistics() is None
     assert get_statistics() is None
+
+
+def test_function_statistics(stats_mr):
+    profiler_records = ProfilerRecords()
+    assert len(profiler_records.records) == 0
+    assert "No data" in profiler_records.pretty_print()
+
+    @profiler(profiler_records)
+    def f1():
+        b1 = rmm.DeviceBuffer(size=10)
+        b2 = rmm.DeviceBuffer(size=10)
+        del b1
+        return b2
+
+    b1 = f1()
+    b2 = f1()
+
+    @profiler(profiler_records)
+    def f2():
+        b1 = rmm.DeviceBuffer(size=10)
+
+        @profiler(profiler_records)
+        def g2(b1):
+            b2 = rmm.DeviceBuffer(size=10)
+            del b1
+            return b2
+
+        return g2(b1)
+
+    f2()
+    del b1
+    del b2
+
+    @profiler(profiler_records)
+    def f3():
+        return [rmm.DeviceBuffer(size=100) for _ in range(100)]
+
+    f3()
+
+    assert profiler_records.records[f1.__qualname__] == ProfilerRecords.Data(
+        num_calls=2, memory_total=32, memory_peak=32
+    )
+    assert profiler_records.records[f2.__qualname__] == ProfilerRecords.Data(
+        num_calls=1, memory_total=16, memory_peak=32
+    )
+    assert profiler_records.records[f3.__qualname__] == ProfilerRecords.Data(
+        num_calls=1, memory_total=11200, memory_peak=11200
+    )

From badbb56df2941dd896e71ed9373cb18e597decb9 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 14:53:53 +0200
Subject: [PATCH 17/43] clean up

---
 python/rmm/rmm/tests/test_statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 7a5299945..3eb89d0e0 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -255,7 +255,7 @@ def test_statistics_disabled():
     assert get_statistics() is None
 
 
-def test_function_statistics(stats_mr):
+def test_profiler(stats_mr):
     profiler_records = ProfilerRecords()
     assert len(profiler_records.records) == 0
     assert "No data" in profiler_records.pretty_print()

From 6f35d230a4f4bc309d44e903e620f85f12de3310 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 15:31:38 +0200
Subject: [PATCH 18/43] fix typo

---
 python/rmm/rmm/statistics.py            | 2 +-
 python/rmm/rmm/tests/test_statistics.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 731de4d9a..b95d087dd 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -196,7 +196,7 @@ def add(self, name: str, data: Statistics) -> None:
             Memory statistics of `name`
         """
         self._records[name].add(
-            memory_total=data.current_bytes, memory_peak=data.peak_bytes
+            memory_total=data.total_bytes, memory_peak=data.peak_bytes
         )
 
     @property
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 3eb89d0e0..2066e1d79 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -293,10 +293,10 @@ def f3():
     f3()
 
     assert profiler_records.records[f1.__qualname__] == ProfilerRecords.Data(
-        num_calls=2, memory_total=32, memory_peak=32
+        num_calls=2, memory_total=64, memory_peak=32
     )
     assert profiler_records.records[f2.__qualname__] == ProfilerRecords.Data(
-        num_calls=1, memory_total=16, memory_peak=32
+        num_calls=1, memory_total=32, memory_peak=32
     )
     assert profiler_records.records[f3.__qualname__] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200

From d8ee6337dde91fbb0a9ac1e4894cb443465306f0 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 16:40:16 +0200
Subject: [PATCH 19/43] descriptive name

---
 python/rmm/rmm/statistics.py            | 35 ++++++++++++++++++++-----
 python/rmm/rmm/tests/test_statistics.py | 13 ++++++---
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index b95d087dd..0ef3ad0d9 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -169,7 +170,7 @@ class ProfilerRecords:
 
     @dataclass
     class Data:
-        """Memory statistics of a single function"""
+        """Single record of memory statistics"""
 
         num_calls: int = 0
         memory_total: int = 0
@@ -186,12 +187,12 @@ def __init__(self) -> None:
         )
 
     def add(self, name: str, data: Statistics) -> None:
-        """Add memory statistics of the function named `name`
+        """Add memory statistics to the record named `name`
 
         Parameters
         ----------
         name
-            Name of the function
+            Name of the record
         data
             Memory statistics of `name`
         """
@@ -201,7 +202,7 @@ def add(self, name: str, data: Statistics) -> None:
 
     @property
     def records(self) -> Dict[str, Data]:
-        """Dictionary mapping function names to their memory statistics"""
+        """Dictionary mapping record names to their memory statistics"""
         return dict(self._records)
 
     def pretty_print(
@@ -233,7 +234,8 @@ def pretty_print(
         if len(records) == 0:
             return ret + "No data, maybe profiling wasn't enabled?"
         ret += f"Ordered by: {ordered_by}\n\n"
-        ret += "ncalls     memory_peak    memory_total  filename\n"
+        ret += "ncalls     memory_peak    memory_total  "
+        ret += "filename:lineno(function)\n"
         for name, data in records:
             ret += f"{data.num_calls:6,d} {data.memory_peak:15,d} "
             ret += f"{data.memory_total:15,d}  {name}\n"
@@ -246,6 +248,25 @@ def __str__(self) -> str:
         return self.pretty_print()
 
 
+def get_descriptive_name_of_object(obj: object) -> str:
+    """Get name of object, including filename, line number, and object name
+
+    Parameters
+    ----------
+    obj
+        Object in question
+
+    Returns
+    -------
+    Descriptive name of the object
+    """
+
+    obj = inspect.unwrap(obj)
+    _, linenumber = inspect.getsourcelines(obj)
+    filepath = inspect.getfile(obj)
+    return f"{filepath}:{linenumber}({obj.__qualname__})"
+
+
 def profiler(profiler_records: ProfilerRecords):
     """Decorator to memory profile function
 
@@ -262,6 +283,8 @@ def profiler(profiler_records: ProfilerRecords):
     """
 
     def f(func: callable):
+        name = get_descriptive_name_of_object(func)
+
         @wraps(func)
         def wrapper(*args, **kwargs):
             try:
@@ -269,7 +292,7 @@ def wrapper(*args, **kwargs):
                 ret = func(*args, **kwargs)
             finally:
                 if (stats := pop_statistics()) is not None:
-                    profiler_records.add(name=func.__qualname__, data=stats)
+                    profiler_records.add(name=name, data=stats)
                 return ret
 
         return wrapper
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 2066e1d79..fbc5527c5 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -18,6 +18,7 @@
 from rmm.statistics import (
     ProfilerRecords,
     Statistics,
+    get_descriptive_name_of_object,
     get_statistics,
     pop_statistics,
     profiler,
@@ -282,9 +283,11 @@ def g2(b1):
 
         return g2(b1)
 
+    f2()
     f2()
     del b1
     del b2
+    f2()
 
     @profiler(profiler_records)
     def f3():
@@ -292,12 +295,14 @@ def f3():
 
     f3()
 
-    assert profiler_records.records[f1.__qualname__] == ProfilerRecords.Data(
+    print(profiler_records)
+    records = profiler_records.records
+    assert records[get_descriptive_name_of_object(f1)] == ProfilerRecords.Data(
         num_calls=2, memory_total=64, memory_peak=32
     )
-    assert profiler_records.records[f2.__qualname__] == ProfilerRecords.Data(
-        num_calls=1, memory_total=32, memory_peak=32
+    assert records[get_descriptive_name_of_object(f2)] == ProfilerRecords.Data(
+        num_calls=3, memory_total=96, memory_peak=32
     )
-    assert profiler_records.records[f3.__qualname__] == ProfilerRecords.Data(
+    assert records[get_descriptive_name_of_object(f3)] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200
     )

From 2df77d159a78c2e8ec9c5e80bcf7174997718f12 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 27 May 2024 17:03:58 +0200
Subject: [PATCH 20/43] default_profiler_records

---
 python/rmm/rmm/statistics.py            | 19 ++++++++++++++-----
 python/rmm/rmm/tests/test_statistics.py | 12 +++++++++++-
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 0ef3ad0d9..11da45564 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import inspect
+import threading
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -182,6 +183,7 @@ def add(self, memory_total: int, memory_peak: int):
             self.memory_peak = max(self.memory_peak, memory_peak)
 
     def __init__(self) -> None:
+        self._lock = threading.Lock()
         self._records: Dict[str, ProfilerRecords.Data] = defaultdict(
             ProfilerRecords.Data
         )
@@ -189,6 +191,8 @@ def __init__(self) -> None:
     def add(self, name: str, data: Statistics) -> None:
         """Add memory statistics to the record named `name`
 
+        This method is thread-safe.
+
         Parameters
         ----------
         name
@@ -196,9 +200,10 @@ def add(self, name: str, data: Statistics) -> None:
         data
             Memory statistics of `name`
         """
-        self._records[name].add(
-            memory_total=data.total_bytes, memory_peak=data.peak_bytes
-        )
+        with self._lock:
+            self._records[name].add(
+                memory_total=data.total_bytes, memory_peak=data.peak_bytes
+            )
 
     @property
     def records(self) -> Dict[str, Data]:
@@ -267,7 +272,10 @@ def get_descriptive_name_of_object(obj: object) -> str:
     return f"{filepath}:{linenumber}({obj.__qualname__})"
 
 
-def profiler(profiler_records: ProfilerRecords):
+default_profiler_records = ProfilerRecords()
+
+
+def profiler(profiler_records: ProfilerRecords = default_profiler_records):
     """Decorator to memory profile function
 
     If statistics are enabled (the current memory resource is not an
@@ -279,7 +287,8 @@ def profiler(profiler_records: ProfilerRecords):
     Parameters
     ----------
     profiler_records
-        The profiler records that the memory statistics are written to.
+        The profiler records that the memory statistics are written to. If
+        not set, the default profiler records are used.
     """
 
     def f(func: callable):
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index fbc5527c5..571d3eebf 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -18,6 +18,7 @@
 from rmm.statistics import (
     ProfilerRecords,
     Statistics,
+    default_profiler_records,
     get_descriptive_name_of_object,
     get_statistics,
     pop_statistics,
@@ -295,7 +296,12 @@ def f3():
 
     f3()
 
-    print(profiler_records)
+    @profiler()  # use the default profiler records
+    def f4():
+        return [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    f4()
+
     records = profiler_records.records
     assert records[get_descriptive_name_of_object(f1)] == ProfilerRecords.Data(
         num_calls=2, memory_total=64, memory_peak=32
@@ -306,3 +312,7 @@ def f3():
     assert records[get_descriptive_name_of_object(f3)] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200
     )
+    records = default_profiler_records.records
+    assert records[get_descriptive_name_of_object(f4)] == ProfilerRecords.Data(
+        num_calls=1, memory_total=160, memory_peak=160
+    )

From 89827adcade2a726ae087779adcefa42c10eeca4 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 08:22:53 +0200
Subject: [PATCH 21/43] tracking_resource_adaptor: use std::shared_mutex

Closes #1332
---
 include/rmm/mr/device/tracking_resource_adaptor.hpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp
index c49674849..d01d08b9c 100644
--- a/include/rmm/mr/device/tracking_resource_adaptor.hpp
+++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp
@@ -53,11 +53,10 @@ namespace rmm::mr {
 template <typename Upstream>
 class tracking_resource_adaptor final : public device_memory_resource {
  public:
-  // can be a std::shared_mutex once C++17 is adopted
   using read_lock_t =
-    std::shared_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize read access
+    std::shared_lock<std::shared_mutex>;  ///< Type of lock used to synchronize read access
   using write_lock_t =
-    std::unique_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize write access
+    std::unique_lock<std::shared_mutex>;  ///< Type of lock used to synchronize write access
   /**
    * @brief Information stored about an allocation. Includes the size
    * and a stack trace if the `tracking_resource_adaptor` was initialized
@@ -271,7 +270,7 @@ class tracking_resource_adaptor final : public device_memory_resource {
   bool capture_stacks_;                           // whether or not to capture call stacks
   std::map<void*, allocation_info> allocations_;  // map of active allocations
   std::atomic<std::size_t> allocated_bytes_;      // number of bytes currently allocated
-  std::shared_timed_mutex mutable mtx_;           // mutex for thread safe access to allocations_
+  std::shared_mutex mutable mtx_;                 // mutex for thread safe access to allocations_
   Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
 };
 

From 24157b57e6053c516f2a94b789b1ecb208f8d00b Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 08:35:00 +0200
Subject: [PATCH 22/43] fix pytorch test

---
 python/rmm/rmm/tests/conftest.py         |  8 ++++++++
 python/rmm/rmm/tests/test_rmm_pytorch.py | 26 ++++++++++++++++++------
 python/rmm/rmm/tests/test_statistics.py  |  6 ------
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/python/rmm/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py
index e7e74eebc..b6debd9a2 100644
--- a/python/rmm/rmm/tests/conftest.py
+++ b/python/rmm/rmm/tests/conftest.py
@@ -15,6 +15,7 @@
 import pytest
 
 import rmm
+import rmm.statistics
 
 
 @pytest.fixture(scope="function", autouse=True)
@@ -26,3 +27,10 @@ def rmm_auto_reinitialize():
     # test
 
     rmm.reinitialize()
+
+
+@pytest.fixture
+def stats_mr():
+    """Fixture that makes a StatisticsResourceAdaptor available to the test"""
+    with rmm.statistics.statistics():
+        yield rmm.mr.get_current_device_resource()
diff --git a/python/rmm/rmm/tests/test_rmm_pytorch.py b/python/rmm/rmm/tests/test_rmm_pytorch.py
index 065507b61..2c9a4af23 100644
--- a/python/rmm/rmm/tests/test_rmm_pytorch.py
+++ b/python/rmm/rmm/tests/test_rmm_pytorch.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import gc
 
 import pytest
@@ -17,21 +31,21 @@ def torch_allocator():
 
 
 def test_rmm_torch_allocator(torch_allocator, stats_mr):
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
     x = torch.tensor([1, 2]).cuda()
-    assert stats_mr.allocation_counts["current_bytes"] > 0
+    assert stats_mr.allocation_counts.current_bytes > 0
     del x
     gc.collect()
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
 
 
 def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr):
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
     s = torch.cuda.Stream()
     with torch.cuda.stream(s):
         x = torch.tensor([1, 2]).cuda()
     torch.cuda.current_stream().wait_stream(s)
-    assert stats_mr.allocation_counts["current_bytes"] > 0
+    assert stats_mr.allocation_counts.current_bytes > 0
     del x
     gc.collect()
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 571d3eebf..98923b864 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -28,12 +28,6 @@
 )
 
 
-@pytest.fixture
-def stats_mr():
-    with statistics():
-        yield rmm.mr.get_current_device_resource()
-
-
 def test_context():
     mr0 = rmm.mr.get_current_device_resource()
     assert get_statistics() is None

From 8df2d0a0f6403023818783523d70041b5718643f Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 08:40:38 +0200
Subject: [PATCH 23/43] pretty_print: added memory units

---
 python/rmm/rmm/statistics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 11da45564..b6efadb69 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -238,7 +238,8 @@ def pretty_print(
         ret += "================\n\n"
         if len(records) == 0:
             return ret + "No data, maybe profiling wasn't enabled?"
-        ret += f"Ordered by: {ordered_by}\n\n"
+        ret += f"Ordered by:   {ordered_by}\n"
+        ret += "Memory units: bytes\n\n"
         ret += "ncalls     memory_peak    memory_total  "
         ret += "filename:lineno(function)\n"
         for name, data in records:

From a82a7b6b6e8e4bfd7eb1229ee8e71a1eb5593ea8 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 08:48:27 +0200
Subject: [PATCH 24/43] doc

---
 python/rmm/rmm/statistics.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index b6efadb69..4f4d63a48 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -171,7 +171,17 @@ class ProfilerRecords:
 
     @dataclass
     class Data:
-        """Single record of memory statistics"""
+        """Memory statistics of a single code block
+
+        Attributes
+        ----------
+        num_calls
+            Number of times this code block was evoked.
+        memory_total
+            Total number of bytes allocated
+        memory_peak
+            Peak number of bytes allocated
+        """
 
         num_calls: int = 0
         memory_total: int = 0

From 2b4b7d37dd840713e8e9e8d7aa7ac2fb9ada18cd Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 09:23:04 +0200
Subject: [PATCH 25/43] profiler: accept name argument

---
 python/rmm/rmm/statistics.py            | 19 +++++++++++++------
 python/rmm/rmm/tests/test_statistics.py | 11 +++++++----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 4f4d63a48..59e9dc9bb 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -265,7 +265,7 @@ def __str__(self) -> str:
 
 
 def get_descriptive_name_of_object(obj: object) -> str:
-    """Get name of object, which include filename, sourceline, and object name
+    """Get name of object, which include filename, line number, and object name
 
     Parameters
     ----------
@@ -286,7 +286,11 @@ def get_descriptive_name_of_object(obj: object) -> str:
 default_profiler_records = ProfilerRecords()
 
 
-def profiler(profiler_records: ProfilerRecords = default_profiler_records):
+def profiler(
+    *,
+    records: ProfilerRecords = default_profiler_records,
+    name: str = "",
+):
     """Decorator to memory profile function
 
     If statistics are enabled (the current memory resource is not an
@@ -297,22 +301,25 @@ def profiler(profiler_records: ProfilerRecords = default_profiler_records):
 
     Parameters
     ----------
-    profiler_records
+    records
         The profiler records that the memory statistics are written to. If
         not set, a default profiler records are used.
+    name
+        The name of the memory profile. If None, a descriptive name is used.
+        Typically includes the filename and line number.
     """
 
     def f(func: callable):
-        name = get_descriptive_name_of_object(func)
+        _name = name or get_descriptive_name_of_object(func)
 
         @wraps(func)
         def wrapper(*args, **kwargs):
+            push_statistics()
             try:
-                push_statistics()
                 ret = func(*args, **kwargs)
             finally:
                 if (stats := pop_statistics()) is not None:
-                    profiler_records.add(name=name, data=stats)
+                    records.add(name=_name, data=stats)
                 return ret
 
         return wrapper
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 98923b864..814da73f4 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -256,7 +256,7 @@ def test_profiler(stats_mr):
     assert len(profiler_records.records) == 0
     assert "No data" in profiler_records.pretty_print()
 
-    @profiler(profiler_records)
+    @profiler(records=profiler_records)
     def f1():
         b1 = rmm.DeviceBuffer(size=10)
         b2 = rmm.DeviceBuffer(size=10)
@@ -266,11 +266,11 @@ def f1():
     b1 = f1()
     b2 = f1()
 
-    @profiler(profiler_records)
+    @profiler(records=profiler_records)
     def f2():
         b1 = rmm.DeviceBuffer(size=10)
 
-        @profiler(profiler_records)
+        @profiler(records=profiler_records, name="g2")
         def g2(b1):
             b2 = rmm.DeviceBuffer(size=10)
             del b1
@@ -284,7 +284,7 @@ def g2(b1):
     del b2
     f2()
 
-    @profiler(profiler_records)
+    @profiler(records=profiler_records)
     def f3():
         return [rmm.DeviceBuffer(size=100) for _ in range(100)]
 
@@ -303,6 +303,9 @@ def f4():
     assert records[get_descriptive_name_of_object(f2)] == ProfilerRecords.Data(
         num_calls=3, memory_total=96, memory_peak=32
     )
+    assert records["g2"] == ProfilerRecords.Data(
+        num_calls=3, memory_total=48, memory_peak=16
+    )
     assert records[get_descriptive_name_of_object(f3)] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200
     )

From 0067c0537044ef7946d11064578b0ae2fd8fd6a6 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 09:48:21 +0200
Subject: [PATCH 26/43] profiler: now also a context manager

---
 python/rmm/rmm/statistics.py            | 53 ++++++++++++++++---------
 python/rmm/rmm/tests/test_statistics.py | 26 +++++++++---
 2 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 59e9dc9bb..1612e6e59 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -291,13 +291,13 @@ def profiler(
     records: ProfilerRecords = default_profiler_records,
     name: str = "",
 ):
-    """Decorator to memory profile function
+    """Decorator and context to profile function or code block
 
     If statistics are enabled (the current memory resource is not an
     instance of StatisticsResourceAdaptor), this decorator records the
-    memory statistics of the decorated function.
+    memory statistics of the decorated function or code block.
 
-    If statistics are disabled, this decorator is a no-op.
+    If statistics are disabled, this decorator/context is a no-op.
 
     Parameters
     ----------
@@ -305,23 +305,40 @@ def profiler(
         The profiler records that the memory statistics are written to. If
         not set, a default profiler records are used.
     name
-        The name of the memory profile. If None, a descriptive name is used.
-        Typically includes the filename and line number.
+        The name of the memory profile, which is mandatory when the profiler
+        is used as a context manager. If used as a decorator, an empty name
+        is allowed. In this case, the name is the filename, line number, and
+        function name.
     """
 
-    def f(func: callable):
-        _name = name or get_descriptive_name_of_object(func)
-
-        @wraps(func)
-        def wrapper(*args, **kwargs):
+    class ProfilerContext:
+        def __call__(self, func: callable) -> callable:
+            _name = name or get_descriptive_name_of_object(func)
+
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                push_statistics()
+                try:
+                    ret = func(*args, **kwargs)
+                finally:
+                    if (stats := pop_statistics()) is not None:
+                        records.add(name=_name, data=stats)
+                    return ret
+
+            return wrapper
+
+        def __enter__(self):
+            if not name:
+                raise ValueError(
+                    "when profiler is used as a context mamanger, "
+                    "a name must be provided"
+                )
             push_statistics()
-            try:
-                ret = func(*args, **kwargs)
-            finally:
-                if (stats := pop_statistics()) is not None:
-                    records.add(name=_name, data=stats)
-                return ret
+            return self
 
-        return wrapper
+        def __exit__(self, *exc):
+            if (stats := pop_statistics()) is not None:
+                records.add(name=name, data=stats)
+            return False
 
-    return f
+    return ProfilerContext()
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 814da73f4..c735634de 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -290,12 +290,6 @@ def f3():
 
     f3()
 
-    @profiler()  # use the default profiler records
-    def f4():
-        return [rmm.DeviceBuffer(size=10) for _ in range(10)]
-
-    f4()
-
     records = profiler_records.records
     assert records[get_descriptive_name_of_object(f1)] == ProfilerRecords.Data(
         num_calls=2, memory_total=64, memory_peak=32
@@ -309,7 +303,27 @@ def f4():
     assert records[get_descriptive_name_of_object(f3)] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200
     )
+
+    @profiler()  # use the default profiler records
+    def f4():
+        return [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    f4()
+
+    with profiler(name="b1 and b2"):  # use the profiler as a context manager
+        b1 = rmm.DeviceBuffer(size=100)
+        b2 = rmm.DeviceBuffer(size=100)
+        with profiler(name="del b1 and b2"):
+            del b1
+            del b2
+
     records = default_profiler_records.records
     assert records[get_descriptive_name_of_object(f4)] == ProfilerRecords.Data(
         num_calls=1, memory_total=160, memory_peak=160
     )
+    assert records["b1 and b2"] == ProfilerRecords.Data(
+        num_calls=1, memory_total=224, memory_peak=224
+    )
+    assert records["del b1 and b2"] == ProfilerRecords.Data(
+        num_calls=1, memory_total=0, memory_peak=0
+    )

From ab97d2adefc4d2cd3d58a99de71b19fb2db5160e Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 13:28:09 +0200
Subject: [PATCH 27/43] cleanup

---
 python/rmm/rmm/statistics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 1612e6e59..c5fc4c650 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -319,11 +319,10 @@ def __call__(self, func: callable) -> callable:
             def wrapper(*args, **kwargs):
                 push_statistics()
                 try:
-                    ret = func(*args, **kwargs)
+                    return func(*args, **kwargs)
                 finally:
                     if (stats := pop_statistics()) is not None:
                         records.add(name=_name, data=stats)
-                    return ret
 
             return wrapper
 

From 189ca3031f2b612fbda5dfdd9073c3c53947d7e0 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 16:23:16 +0200
Subject: [PATCH 28/43] pretty_print: output format

---
 python/rmm/rmm/statistics.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index c5fc4c650..878906f87 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -248,9 +248,15 @@ def pretty_print(
         ret += "================\n\n"
         if len(records) == 0:
             return ret + "No data, maybe profiling wasn't enabled?"
-        ret += f"Ordered by:   {ordered_by}\n"
-        ret += "Memory units: bytes\n\n"
-        ret += "ncalls     memory_peak    memory_total  "
+        ret += (
+            "Legends:\n"
+            "  ncalls       - number of time the function or code block "
+            "was called\n"
+            "  memory_peak  - peak memory allocated in bytes\n"
+            "  memory_total - total memory allocated in bytes\n"
+        )
+        ret += f"\nOrdered by: {ordered_by}\n"
+        ret += "\nncalls     memory_peak    memory_total  "
         ret += "filename:lineno(function)\n"
         for name, data in records:
             ret += f"{data.num_calls:6,d} {data.memory_peak:15,d} "

From 6debd8391f65567189a26702748930e91ef7ab76 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 28 May 2024 16:35:43 +0200
Subject: [PATCH 29/43] fix doc build

---
 python/rmm/docs/python_api.rst | 9 +++++++++
 python/rmm/rmm/statistics.py   | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/python/rmm/docs/python_api.rst b/python/rmm/docs/python_api.rst
index b229d8214..a62304d21 100644
--- a/python/rmm/docs/python_api.rst
+++ b/python/rmm/docs/python_api.rst
@@ -37,3 +37,12 @@ Memory Allocators
    :members:
    :undoc-members:
    :show-inheritance:
+
+Memory Statistics
+-----------------
+
+.. automodule:: rmm.statistics
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 878906f87..12c30433e 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -57,8 +57,8 @@ def enable_statistics() -> None:
     This function is idempotent, if statistics has been enabled for the
     current RMM resource stack, this is a no-op.
 
-    Warning
-    -------
+    Warnings
+    --------
     This modifies the current RMM memory resource. StatisticsResourceAdaptor
     is pushed onto the current RMM memory resource stack and must remain the
     the top must resource throughout the statistics gathering.

From 796e159c1b9544213aba301128c01370b1269491 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 08:27:57 +0200
Subject: [PATCH 30/43] Apply suggestions from code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 .../rmm/mr/device/statistics_resource_adaptor.hpp |  4 ++--
 python/rmm/rmm/_lib/memory_resource.pyx           |  4 ++--
 python/rmm/rmm/statistics.py                      | 15 +++++++--------
 python/rmm/rmm/tests/test_statistics.py           | 15 +++++++--------
 4 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index 7819d688d..c8906afd2 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -42,8 +42,8 @@ namespace rmm::mr {
  * This resource supports nested statistics, which makes it possible to track statistics
  * of a code block. Use `.push_counters()` to start tracking statistics on a code block
  * and use `.pop_counters()` to stop the tracking. The nested statistics are cascading
- * such that the statistics tracked by a code block includes the statistics tracked in
- * all its tracked sub code block.
+ * such that the statistics tracked by a code block include the statistics tracked in
+ * all its tracked sub code blocks.
  *
  * `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be
  * used in performance-sensitive code.
diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index f7b18d3d1..0db05eeb3 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -794,8 +794,8 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         allocations/deallocations performed by an upstream memory resource.
         Includes the ability to query these statistics at any time.
 
-        A stack of counters is maintained. Use `.push_counters()` and
-        `.pop_counters()` to track statistics at different nesting levels.
+        A stack of counters is maintained. Use :meth:`push_counters` and
+        :meth:`pop_counters` to track statistics at different nesting levels.
 
         Parameters
         ----------
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 12c30433e..8be50a11b 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -25,7 +25,7 @@
 
 @dataclass
 class Statistics:
-    """Statistics returned by `{get,push,pop}_statistics()`
+    """Statistics returned by ``{get,push,pop}_statistics()``
 
     Attributes
     ----------
@@ -54,14 +54,14 @@ class Statistics:
 def enable_statistics() -> None:
     """Enable allocation statistics
 
-    This function is idempotent, if statistics has been enabled for the
+    This function is idempotent. If statistics have been enabled for the
     current RMM resource stack, this is a no-op.
 
     Warnings
     --------
     This modifies the current RMM memory resource. StatisticsResourceAdaptor
     is pushed onto the current RMM memory resource stack and must remain the
-    the top must resource throughout the statistics gathering.
+    the topmost resource throughout the statistics gathering.
     """
 
     mr = rmm.mr.get_current_device_resource()
@@ -144,12 +144,11 @@ def statistics():
         If the current RMM memory source was changed while in the context.
     """
 
+    prior_non_stats_mr = None
     if push_statistics() is None:
         # Save the current non-statistics memory resource for later cleanup
         prior_non_stats_mr = rmm.mr.get_current_device_resource()
         enable_statistics()
-    else:
-        prior_non_stats_mr = None
 
     try:
         current_mr = rmm.mr.get_current_device_resource()
@@ -176,7 +175,7 @@ class Data:
         Attributes
         ----------
         num_calls
-            Number of times this code block was evoked.
+            Number of times this code block was invoked.
         memory_total
             Total number of bytes allocated
         memory_peak
@@ -270,7 +269,7 @@ def __str__(self) -> str:
         return self.pretty_print()
 
 
-def get_descriptive_name_of_object(obj: object) -> str:
+def _get_descriptive_name_of_object(obj: object) -> str:
     """Get name of object, which include filename, line number, and object name
 
     Parameters
@@ -335,7 +334,7 @@ def wrapper(*args, **kwargs):
         def __enter__(self):
             if not name:
                 raise ValueError(
-                    "when profiler is used as a context mamanger, "
+                    "When profiler is used as a context manager, "
                     "a name must be provided"
                 )
             push_statistics()
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index c735634de..6fb847f15 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -77,14 +77,13 @@ def test_multiple_mr(stats_mr):
     for i in range(9, 0, -2):
         del buffers[i]
 
-    assert stats_mr.allocation_counts == Statistics(
-        current_bytes=5040,
-        current_count=5,
-        peak_bytes=10080,
-        peak_count=10,
-        total_bytes=10080,
-        total_count=10,
-    )
+    stats = stats_mr.allocation_counts
+    assert stats.current_bytes == 5040
+    assert stats.current_count == 5
+    assert stats.peak_bytes == 10080
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 10080
+    assert stats.total_count == 10
 
     # Push a new Tracking adaptor
     mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)

From d2d64a16280be8a4de317b30b2937413e5846b22 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 08:36:28 +0200
Subject: [PATCH 31/43] style clean up

---
 python/rmm/rmm/statistics.py            |   2 +-
 python/rmm/rmm/tests/test_statistics.py | 320 ++++++++++++------------
 2 files changed, 162 insertions(+), 160 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 8be50a11b..fd7636a3b 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -318,7 +318,7 @@ def profiler(
 
     class ProfilerContext:
         def __call__(self, func: callable) -> callable:
-            _name = name or get_descriptive_name_of_object(func)
+            _name = name or _get_descriptive_name_of_object(func)
 
             @wraps(func)
             def wrapper(*args, **kwargs):
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 6fb847f15..35337ea0d 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -17,9 +17,8 @@
 import rmm.mr
 from rmm.statistics import (
     ProfilerRecords,
-    Statistics,
+    _get_descriptive_name_of_object,
     default_profiler_records,
-    get_descriptive_name_of_object,
     get_statistics,
     pop_statistics,
     profiler,
@@ -38,34 +37,34 @@ def test_context():
             rmm.mr.StatisticsResourceAdaptor,
         )
         b1 = rmm.DeviceBuffer(size=20)
-        assert get_statistics() == Statistics(
-            current_bytes=32,
-            current_count=1,
-            peak_bytes=32,
-            peak_count=1,
-            total_bytes=32,
-            total_count=1,
-        )
+        stats = get_statistics()
+        assert stats.current_bytes == 32
+        assert stats.current_count == 1
+        assert stats.peak_bytes == 32
+        assert stats.peak_count == 1
+        assert stats.total_bytes == 32
+        assert stats.total_count == 1
+
         with statistics():
             mr2 = rmm.mr.get_current_device_resource()
             assert mr1 is mr2
             b2 = rmm.DeviceBuffer(size=10)
-            assert get_statistics() == Statistics(
-                current_bytes=16,
-                current_count=1,
-                peak_bytes=16,
-                peak_count=1,
-                total_bytes=16,
-                total_count=1,
-            )
-        assert get_statistics() == Statistics(
-            current_bytes=48,
-            current_count=2,
-            peak_bytes=48,
-            peak_count=2,
-            total_bytes=48,
-            total_count=2,
-        )
+            stats = get_statistics()
+            assert stats.current_bytes == 16
+            assert stats.current_count == 1
+            assert stats.peak_bytes == 16
+            assert stats.peak_count == 1
+            assert stats.total_bytes == 16
+            assert stats.total_count == 1
+
+        stats = get_statistics()
+        assert stats.current_bytes == 48
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 48
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 48
+        assert stats.total_count == 2
+
         del b1
         del b2
     assert rmm.mr.get_current_device_resource() is mr0
@@ -92,39 +91,39 @@ def test_multiple_mr(stats_mr):
         for _ in range(2):
             buffers.append(rmm.DeviceBuffer(size=1000))
 
-        assert mr2.allocation_counts == Statistics(
-            current_bytes=2016,
-            current_count=2,
-            peak_bytes=2016,
-            peak_count=2,
-            total_bytes=2016,
-            total_count=2,
-        )
-        assert stats_mr.allocation_counts == Statistics(
-            current_bytes=7056,
-            current_count=7,
-            peak_bytes=10080,
-            peak_count=10,
-            total_bytes=12096,
-            total_count=12,
-        )
+        stats = mr2.allocation_counts
+        assert stats.current_bytes == 2016
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts
+        assert stats.current_bytes == 7056
+        assert stats.current_count == 7
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
         del buffers
-        assert mr2.allocation_counts == Statistics(
-            current_bytes=0,
-            current_count=0,
-            peak_bytes=2016,
-            peak_count=2,
-            total_bytes=2016,
-            total_count=2,
-        )
-        assert stats_mr.allocation_counts == Statistics(
-            current_bytes=0,
-            current_count=0,
-            peak_bytes=10080,
-            peak_count=10,
-            total_bytes=12096,
-            total_count=12,
-        )
+        stats = mr2.allocation_counts
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
     finally:
         rmm.mr.set_current_device_resource(stats_mr)
 
@@ -133,73 +132,75 @@ def test_counter_stack(stats_mr):
     buffers = [rmm.DeviceBuffer(size=10) for _ in range(10)]
 
     # push returns the stats from the top before the push
-    assert stats_mr.push_counters() == Statistics(  # stats from stack level 0
-        current_bytes=160,
-        current_count=10,
-        peak_bytes=160,
-        peak_count=10,
-        total_bytes=160,
-        total_count=10,
-    )
+    stats = stats_mr.push_counters()  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 160
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 160
+    assert stats.total_count == 10
+
     b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == Statistics(  # stats from stack level 1
-        current_bytes=16,
-        current_count=1,
-        peak_bytes=16,
-        peak_count=1,
-        total_bytes=16,
-        total_count=1,
-    )
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
     del b1
+
     # pop returns the popped stats
     # Note, the bytes and counts can be negative
-    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 2
-        current_bytes=-16,
-        current_count=-1,
-        peak_bytes=0,
-        peak_count=0,
-        total_bytes=0,
-        total_count=0,
-    )
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
     b1 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.push_counters() == Statistics(  # stats from stack level 1
-        current_bytes=16,
-        current_count=1,
-        peak_bytes=16,
-        peak_count=1,
-        total_bytes=32,
-        total_count=2,
-    )
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 32
+    assert stats.total_count == 2
+
     b2 = rmm.DeviceBuffer(size=10)
-    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 2
-        current_bytes=16,
-        current_count=1,
-        peak_bytes=16,
-        peak_count=1,
-        total_bytes=16,
-        total_count=1,
-    )
-    assert stats_mr.pop_counters() == Statistics(  # stats from stack level 1
-        current_bytes=32,
-        current_count=2,
-        peak_bytes=32,
-        peak_count=2,
-        total_bytes=48,
-        total_count=3,
-    )
+
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    stats = stats_mr.pop_counters()  # stats from stack level 1
+    assert stats.current_bytes == 32
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 32
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 3
+
     del b1
     del b2
-    assert (
-        stats_mr.allocation_counts
-        == Statistics(  # stats from stack level 0
-            current_bytes=160,
-            current_count=10,
-            peak_bytes=192,
-            peak_count=12,
-            total_bytes=208,
-            total_count=13,
-        )
-    )
+
+    stats = stats_mr.allocation_counts  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 192
+    assert stats.peak_count == 12
+    assert stats.total_bytes == 208
+    assert stats.total_count == 13
+
     del buffers
     with pytest.raises(IndexError, match="cannot pop the last counter pair"):
         stats_mr.pop_counters()
@@ -207,41 +208,40 @@ def test_counter_stack(stats_mr):
 
 def test_current_statistics(stats_mr):
     b1 = rmm.DeviceBuffer(size=10)
-    assert get_statistics() == Statistics(
-        current_bytes=16,
-        current_count=1,
-        peak_bytes=16,
-        peak_count=1,
-        total_bytes=16,
-        total_count=1,
-    )
+    stats = get_statistics()
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
     b2 = rmm.DeviceBuffer(size=20)
-    assert push_statistics() == Statistics(
-        current_bytes=48,
-        current_count=2,
-        peak_bytes=48,
-        peak_count=2,
-        total_bytes=48,
-        total_count=2,
-    )
+    stats = push_statistics()
+    assert stats.current_bytes == 48
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
+
     del b1
-    assert pop_statistics() == Statistics(
-        current_bytes=-16,
-        current_count=-1,
-        peak_bytes=0,
-        peak_count=0,
-        total_bytes=0,
-        total_count=0,
-    )
+    stats = pop_statistics()
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
     del b2
-    assert get_statistics() == Statistics(
-        current_bytes=0,
-        current_count=0,
-        peak_bytes=48,
-        peak_count=2,
-        total_bytes=48,
-        total_count=2,
-    )
+    stats = get_statistics()
+    assert stats.current_bytes == 0
+    assert stats.current_count == 0
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
 
 
 def test_statistics_disabled():
@@ -290,16 +290,18 @@ def f3():
     f3()
 
     records = profiler_records.records
-    assert records[get_descriptive_name_of_object(f1)] == ProfilerRecords.Data(
-        num_calls=2, memory_total=64, memory_peak=32
-    )
-    assert records[get_descriptive_name_of_object(f2)] == ProfilerRecords.Data(
-        num_calls=3, memory_total=96, memory_peak=32
-    )
+    assert records[
+        _get_descriptive_name_of_object(f1)
+    ] == ProfilerRecords.Data(num_calls=2, memory_total=64, memory_peak=32)
+    assert records[
+        _get_descriptive_name_of_object(f2)
+    ] == ProfilerRecords.Data(num_calls=3, memory_total=96, memory_peak=32)
     assert records["g2"] == ProfilerRecords.Data(
         num_calls=3, memory_total=48, memory_peak=16
     )
-    assert records[get_descriptive_name_of_object(f3)] == ProfilerRecords.Data(
+    assert records[
+        _get_descriptive_name_of_object(f3)
+    ] == ProfilerRecords.Data(
         num_calls=1, memory_total=11200, memory_peak=11200
     )
 
@@ -317,9 +319,9 @@ def f4():
             del b2
 
     records = default_profiler_records.records
-    assert records[get_descriptive_name_of_object(f4)] == ProfilerRecords.Data(
-        num_calls=1, memory_total=160, memory_peak=160
-    )
+    assert records[
+        _get_descriptive_name_of_object(f4)
+    ] == ProfilerRecords.Data(num_calls=1, memory_total=160, memory_peak=160)
     assert records["b1 and b2"] == ProfilerRecords.Data(
         num_calls=1, memory_total=224, memory_peak=224
     )

From 394d39f36de112ce73c04bf10a4f50f452e00114 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 08:52:59 +0200
Subject: [PATCH 32/43] doc

---
 python/rmm/rmm/_lib/memory_resource.pyx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 0db05eeb3..61a4505d4 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -833,6 +833,10 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
     def pop_counters(self) -> Statistics:
         """
         Pop a counter pair (bytes and allocations) from the stack
+
+        Returns
+        -------
+        The popped statistics
         """
         cdef statistics_resource_adaptor[device_memory_resource]* mr = \
             <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
@@ -850,6 +854,10 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
     def push_counters(self) -> Statistics:
         """
         Push a new counter pair (bytes and allocations) on the stack
+
+        Returns
+        -------
+        The statistics _before_ the push
         """
 
         cdef statistics_resource_adaptor[device_memory_resource]* mr = \

From 499c173381b132411798cb8aadb559ee58addf9e Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 09:00:51 +0200
Subject: [PATCH 33/43] rename Data => MemoryRecord

---
 python/rmm/rmm/statistics.py            |  8 ++++----
 python/rmm/rmm/tests/test_statistics.py | 20 +++++++++++++-------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index fd7636a3b..0fcf8818e 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -169,7 +169,7 @@ class ProfilerRecords:
     """Records of the memory statistics recorded by a profiler"""
 
     @dataclass
-    class Data:
+    class MemoryRecord:
         """Memory statistics of a single code block
 
         Attributes
@@ -193,8 +193,8 @@ def add(self, memory_total: int, memory_peak: int):
 
     def __init__(self) -> None:
         self._lock = threading.Lock()
-        self._records: Dict[str, ProfilerRecords.Data] = defaultdict(
-            ProfilerRecords.Data
+        self._records: Dict[str, ProfilerRecords.MemoryRecord] = defaultdict(
+            ProfilerRecords.MemoryRecord
         )
 
     def add(self, name: str, data: Statistics) -> None:
@@ -215,7 +215,7 @@ def add(self, name: str, data: Statistics) -> None:
             )
 
     @property
-    def records(self) -> Dict[str, Data]:
+    def records(self) -> Dict[str, MemoryRecord]:
         """Dictionary mapping record names to their memory statistics"""
         return dict(self._records)
 
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 35337ea0d..1e9c6894a 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -292,16 +292,20 @@ def f3():
     records = profiler_records.records
     assert records[
         _get_descriptive_name_of_object(f1)
-    ] == ProfilerRecords.Data(num_calls=2, memory_total=64, memory_peak=32)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=2, memory_total=64, memory_peak=32
+    )
     assert records[
         _get_descriptive_name_of_object(f2)
-    ] == ProfilerRecords.Data(num_calls=3, memory_total=96, memory_peak=32)
-    assert records["g2"] == ProfilerRecords.Data(
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=3, memory_total=96, memory_peak=32
+    )
+    assert records["g2"] == ProfilerRecords.MemoryRecord(
         num_calls=3, memory_total=48, memory_peak=16
     )
     assert records[
         _get_descriptive_name_of_object(f3)
-    ] == ProfilerRecords.Data(
+    ] == ProfilerRecords.MemoryRecord(
         num_calls=1, memory_total=11200, memory_peak=11200
     )
 
@@ -321,10 +325,12 @@ def f4():
     records = default_profiler_records.records
     assert records[
         _get_descriptive_name_of_object(f4)
-    ] == ProfilerRecords.Data(num_calls=1, memory_total=160, memory_peak=160)
-    assert records["b1 and b2"] == ProfilerRecords.Data(
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=160, memory_peak=160
+    )
+    assert records["b1 and b2"] == ProfilerRecords.MemoryRecord(
         num_calls=1, memory_total=224, memory_peak=224
     )
-    assert records["del b1 and b2"] == ProfilerRecords.Data(
+    assert records["del b1 and b2"] == ProfilerRecords.MemoryRecord(
         num_calls=1, memory_total=0, memory_peak=0
     )

From 463172d50b69268259c3c03ea692a26544d7b8a5 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 09:02:52 +0200
Subject: [PATCH 34/43] rename pretty_print => report

---
 python/rmm/rmm/statistics.py            | 4 ++--
 python/rmm/rmm/tests/test_statistics.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 0fcf8818e..7596f57f2 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -219,7 +219,7 @@ def records(self) -> Dict[str, MemoryRecord]:
         """Dictionary mapping record names to their memory statistics"""
         return dict(self._records)
 
-    def pretty_print(
+    def report(
         self,
         ordered_by: Literal[
             "num_calls", "memory_peak", "memory_total"
@@ -266,7 +266,7 @@ def __repr__(self) -> str:
         return f"{self.__class__.__name__}({self.records})"
 
     def __str__(self) -> str:
-        return self.pretty_print()
+        return self.report()
 
 
 def _get_descriptive_name_of_object(obj: object) -> str:
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
index 1e9c6894a..7ba09a92f 100644
--- a/python/rmm/rmm/tests/test_statistics.py
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -253,7 +253,7 @@ def test_statistics_disabled():
 def test_profiler(stats_mr):
     profiler_records = ProfilerRecords()
     assert len(profiler_records.records) == 0
-    assert "No data" in profiler_records.pretty_print()
+    assert "No data" in profiler_records.report()
 
     @profiler(records=profiler_records)
     def f1():

From c11b1c5bae54bb88af4b7e8b1cd7a2c1e7c645c8 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 09:15:10 +0200
Subject: [PATCH 35/43] ruff check --fix --select D400

---
 python/rmm/rmm/statistics.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 7596f57f2..73b7c96ff 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -25,7 +25,7 @@
 
 @dataclass
 class Statistics:
-    """Statistics returned by ``{get,push,pop}_statistics()``
+    """Statistics returned by ``{get,push,pop}_statistics()``.
 
     Attributes
     ----------
@@ -52,7 +52,7 @@ class Statistics:
 
 
 def enable_statistics() -> None:
-    """Enable allocation statistics
+    """Enable allocation statistics.
 
     This function is idempotent. If statistics have been enabled for the
     current RMM resource stack, this is a no-op.
@@ -72,7 +72,7 @@ def enable_statistics() -> None:
 
 
 def get_statistics() -> Optional[Statistics]:
-    """Get the current allocation statistics
+    """Get the current allocation statistics.
 
     Return
     ------
@@ -86,7 +86,7 @@ def get_statistics() -> Optional[Statistics]:
 
 
 def push_statistics() -> Optional[Statistics]:
-    """Push new counters on the current allocation statistics stack
+    """Push new counters on the current allocation statistics stack.
 
     This returns the current tracked statistics and pushes a new set
     of zero counters on the stack of statistics.
@@ -106,7 +106,7 @@ def push_statistics() -> Optional[Statistics]:
 
 
 def pop_statistics() -> Optional[Statistics]:
-    """Pop the counters of the current allocation statistics stack
+    """Pop the counters of the current allocation statistics stack.
 
     This returns the counters of current tracked statistics and pops
     them from the stack.
@@ -166,20 +166,20 @@ def statistics():
 
 
 class ProfilerRecords:
-    """Records of the memory statistics recorded by a profiler"""
+    """Records of the memory statistics recorded by a profiler."""
 
     @dataclass
     class MemoryRecord:
-        """Memory statistics of a single code block
+        """Memory statistics of a single code block.
 
         Attributes
         ----------
         num_calls
             Number of times this code block was invoked.
         memory_total
-            Total number of bytes allocated
+            Total number of bytes allocated.
         memory_peak
-            Peak number of bytes allocated
+            Peak number of bytes allocated.
         """
 
         num_calls: int = 0
@@ -198,16 +198,16 @@ def __init__(self) -> None:
         )
 
     def add(self, name: str, data: Statistics) -> None:
-        """Add memory statistics to the record named `name`
+        """Add memory statistics to the record named `name`.
 
         This method is thread-safe.
 
         Parameters
         ----------
         name
-            Name of the record
+            Name of the record.
         data
-            Memory statistics of `name`
+            Memory statistics of `name`.
         """
         with self._lock:
             self._records[name].add(
@@ -216,7 +216,7 @@ def add(self, name: str, data: Statistics) -> None:
 
     @property
     def records(self) -> Dict[str, MemoryRecord]:
-        """Dictionary mapping record names to their memory statistics"""
+        """Dictionary mapping record names to their memory statistics."""
         return dict(self._records)
 
     def report(
@@ -225,7 +225,7 @@ def report(
             "num_calls", "memory_peak", "memory_total"
         ] = "memory_peak",
     ) -> str:
-        """Pretty format the recorded memory statistics
+        """Pretty format the recorded memory statistics.
 
         Parameters
         ----------
@@ -270,7 +270,7 @@ def __str__(self) -> str:
 
 
 def _get_descriptive_name_of_object(obj: object) -> str:
-    """Get name of object, which include filename, line number, and object name
+    """Get descriptive name of object.
 
     Parameters
     ----------
@@ -279,7 +279,7 @@ def _get_descriptive_name_of_object(obj: object) -> str:
 
     Return
     ------
-    Descriptive name of the object
+    A string including filename, line number, and object name.
     """
 
     obj = inspect.unwrap(obj)
@@ -296,7 +296,7 @@ def profiler(
     records: ProfilerRecords = default_profiler_records,
     name: str = "",
 ):
-    """Decorator and context to profile function or code block
+    """Decorator and context to profile function or code block.
 
     If statistics are enabled (the current memory resource is not an
     instance of StatisticsResourceAdaptor), this decorator records the

From 3d929d671ea66016ee2c4c3f60bbcd5f9f16d63d Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 10:06:52 +0200
Subject: [PATCH 36/43] report: style

---
 python/rmm/rmm/statistics.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 73b7c96ff..4a0af7467 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -249,10 +249,12 @@ def report(
             return ret + "No data, maybe profiling wasn't enabled?"
         ret += (
             "Legends:\n"
-            "  ncalls       - number of time the function or code block "
+            "  ncalls       - number of times the function or code block "
             "was called\n"
-            "  memory_peak  - peak memory allocated in bytes\n"
-            "  memory_total - total memory allocated in bytes\n"
+            "  memory_peak  - peak memory allocated in function or code "
+            "block (in bytes)\n"
+            "  memory_total - total memory allocated in function or code "
+            "block (in bytes)\n"
         )
         ret += f"\nOrdered by: {ordered_by}\n"
         ret += "\nncalls     memory_peak    memory_total  "

From 62a387079a8c2e6b370b028a2e37fdaa1a626e12 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 May 2024 15:25:34 +0200
Subject: [PATCH 37/43] doc

---
 python/rmm/docs/guide.md     | 78 ++++++++++++++++++++++++++++++++++++
 python/rmm/rmm/statistics.py |  2 +-
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 968be8586..6eb946343 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -187,3 +187,81 @@ allocator.
 
 >>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
 ```
+
+
+
+## Memory statistics and profiling
+
+RMM has a tool for tracking memory statistics and memory profiling. It can be enabled in two ways:
+  - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
+  - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
+
+Common for both are that they modifies the current RMM memory resource. `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking.
+
+```python
+>>> import rmm.statistics
+
+>>> # We start with the default cuda memory resource
+>>> rmm.mr.get_current_device_resource()
+<rmm._lib.memory_resource.CudaMemoryResource at 0x7f7e6c0a1ce0>
+
+>>> # When using statistics, we get a StatisticsResourceAdaptor within the context
+>>> with rmm.statistics.statistics():
+...     rmm.mr.get_current_device_resource()
+<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f7e6c524900>
+
+>>> # We can also enable statistics globally
+>>> rmm.statistics.enable_statistics()
+>>> print(rmm.mr.get_current_device_resource())
+<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
+```
+
+When statistic is enabled, we can get statistics of all allocations done through the current RMM memory resource.
+```python
+>>> buf = rmm.DeviceBuffer(size=10)
+>>> rmm.statistics.get_statistics()
+Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1)
+```
+
+Maybe more useful, we can profile code blocks when memory statistics is enabled. One way to do this is using `profiler` as a function decorator.
+```python
+>>> @rmm.statistics.profiler()
+... def f(size):
+...   rmm.DeviceBuffer(size=size)
+>>> f(1000)
+
+>>> # By default, the profiler writes to rmm.statistics.default_profiler_records
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
+```
+
+We can also profile a code block by using `profiler` as a context manager.
+```python
+>>> with rmm.statistics.profiler(name="my code block"):
+...     rmm.DeviceBuffer(size=20)
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
+     1              32              32  my code block
+```
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 4a0af7467..cb02d8949 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -61,7 +61,7 @@ def enable_statistics() -> None:
     --------
     This modifies the current RMM memory resource. StatisticsResourceAdaptor
     is pushed onto the current RMM memory resource stack and must remain the
-    the topmost resource throughout the statistics gathering.
+    topmost resource throughout the statistics gathering.
     """
 
     mr = rmm.mr.get_current_device_resource()

From 8d71415d7508b8c8d19298255a9f5851a213c214 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 May 2024 16:17:31 +0200
Subject: [PATCH 38/43] spelling

Co-authored-by: Lawrence Mitchell <wence@gmx.li>
---
 python/rmm/docs/guide.md     | 4 ++--
 python/rmm/rmm/statistics.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 6eb946343..8c1bc0383 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -196,7 +196,7 @@ RMM has a tool for tracking memory statistics and memory profiling. It can be en
   - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
   - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
 
-Common for both are that they modifies the current RMM memory resource. `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking.
+Common to both usages is that they modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking.
 
 ```python
 >>> import rmm.statistics
@@ -216,7 +216,7 @@ Common for both are that they modifies the current RMM memory resource. `Statist
 <rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
 ```
 
-When statistic is enabled, we can get statistics of all allocations done through the current RMM memory resource.
+When statistics are enabled, we can get statistics of all allocations performed by the current RMM memory resource.
 ```python
 >>> buf = rmm.DeviceBuffer(size=10)
 >>> rmm.statistics.get_statistics()
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index cb02d8949..53e6154d1 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -129,12 +129,12 @@ def pop_statistics() -> Optional[Statistics]:
 def statistics():
     """Context to enable allocation statistics.
 
-    If statistics has been enabled already (the current memory resource is an
+    If statistics have been enabled already (the current memory resource is an
     instance of StatisticsResourceAdaptor), new counters are pushed on the
     current allocation statistics stack when entering the context and popped
     again when exiting using `push_statistics()` and `push_statistics()`.
 
-    If statistics has not been enabled, StatisticsResourceAdaptor is set as
+    If statistics have not been enabled, a new StatisticsResourceAdaptor is set as
     the current RMM memory resource when entering the context and removed
     again when exiting.
 
@@ -300,7 +300,7 @@ def profiler(
 ):
     """Decorator and context to profile function or code block.
 
-    If statistics are enabled (the current memory resource is not an
+    If statistics are enabled (the current memory resource is an
     instance of StatisticsResourceAdaptor), this decorator records the
     memory statistics of the decorated function or code block.
 
@@ -312,7 +312,7 @@ def profiler(
         The profiler records that the memory statistics are written to. If
         not set, a default profiler records are used.
     name
-        The name of the memory profile, which is mandatory when the profiler
+        The name of the memory profile, mandatory when the profiler
         is used as a context manager. If used as a decorator, an empty name
         is allowed. In this case, the name is the filename, line number, and
         function name.

From a230794d96b8b10c9383dfdb7658bbc5079755db Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 May 2024 16:29:29 +0200
Subject: [PATCH 39/43] style

---
 python/rmm/rmm/statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 53e6154d1..279e45dc6 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -134,8 +134,8 @@ def statistics():
     current allocation statistics stack when entering the context and popped
     again when exiting using `push_statistics()` and `push_statistics()`.
 
-    If statistics have not been enabled, a new StatisticsResourceAdaptor is set as
-    the current RMM memory resource when entering the context and removed
+    If statistics have not been enabled, a new StatisticsResourceAdaptor is set
+    as the current RMM memory resource when entering the context and removed
     again when exiting.
 
     Raises

From 17d9fd99009f3bdeadb523f6a440c1c29ff4c610 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 May 2024 16:44:35 +0200
Subject: [PATCH 40/43] doc

---
 python/rmm/docs/guide.md | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 8c1bc0383..d476df483 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -196,9 +196,9 @@ RMM has a tool for tracking memory statistics and memory profiling. It can be en
   - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
   - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
 
-Common to both usages is that they modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking.
-
+Common to both usages is that they modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking:
 ```python
+>>> import rmm
 >>> import rmm.statistics
 
 >>> # We start with the default cuda memory resource
@@ -216,14 +216,17 @@ Common to both usages is that they modify the currently active RMM memory resour
 <rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
 ```
 
-When statistics are enabled, we can get statistics of all allocations performed by the current RMM memory resource.
+When statistics are enabled, we can get statistics of all allocations performed by the current RMM memory resource:
 ```python
 >>> buf = rmm.DeviceBuffer(size=10)
 >>> rmm.statistics.get_statistics()
 Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1)
 ```
 
-Maybe more useful, we can profile code blocks when memory statistics is enabled. One way to do this is using `profiler` as a function decorator.
+### Memory Profiler
+It is also possible to profile a specific block of code. It requires that memory statistics has been enabled e.g. by calling `rmm.statistics.enable_statistics()`.
+
+To profile a function, we can use the `profiler` as a function decorator like:
 ```python
 >>> @rmm.statistics.profiler()
 ... def f(size):
@@ -246,7 +249,7 @@ ncalls     memory_peak    memory_total  filename:lineno(function)
      1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
 ```
 
-We can also profile a code block by using `profiler` as a context manager.
+We can also profile a code block by using `profiler` as a context manager:
 ```python
 >>> with rmm.statistics.profiler(name="my code block"):
 ...     rmm.DeviceBuffer(size=20)
@@ -265,3 +268,27 @@ ncalls     memory_peak    memory_total  filename:lineno(function)
      1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
      1              32              32  my code block
 ```
+
+The `profiler` support nesting:
+```python
+>>> with rmm.statistics.profiler(name="outer"):
+...     buf1 = rmm.DeviceBuffer(size=10)
+...     with rmm.statistics.profiler(name="inner"):
+...         buf2 = rmm.DeviceBuffer(size=10)
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-4-865fbe04e29f>:1(f)
+     1              32              32  my code block
+     1              32              32  outer
+     1              16              16  inner
+```

From 23eb0751a9fb205bc16ac0446bdde75ff3d22627 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 4 Jun 2024 17:15:52 +0200
Subject: [PATCH 41/43] doc

Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
---
 python/rmm/docs/guide.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index d476df483..8f0b7c8c6 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -192,11 +192,11 @@ allocator.
 
 ## Memory statistics and profiling
 
-RMM has a tool for tracking memory statistics and memory profiling. It can be enabled in two ways:
+RMM can profile memory usage and track memory statistics by using either of the following:
   - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
   - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
 
-Common to both usages is that they modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking:
+Both methods modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking:
 ```python
 >>> import rmm
 >>> import rmm.statistics
@@ -216,7 +216,7 @@ Common to both usages is that they modify the currently active RMM memory resour
 <rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
 ```
 
-When statistics are enabled, we can get statistics of all allocations performed by the current RMM memory resource:
+With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource:
 ```python
 >>> buf = rmm.DeviceBuffer(size=10)
 >>> rmm.statistics.get_statistics()
@@ -224,9 +224,7 @@ Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total
 ```
 
 ### Memory Profiler
-It is also possible to profile a specific block of code. It requires that memory statistics has been enabled e.g. by calling `rmm.statistics.enable_statistics()`.
-
-To profile a function, we can use the `profiler` as a function decorator like:
+To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator:
 ```python
 >>> @rmm.statistics.profiler()
 ... def f(size):
@@ -249,7 +247,7 @@ ncalls     memory_peak    memory_total  filename:lineno(function)
      1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
 ```
 
-We can also profile a code block by using `profiler` as a context manager:
+To profile a code block, use `profiler` as a context manager:
 ```python
 >>> with rmm.statistics.profiler(name="my code block"):
 ...     rmm.DeviceBuffer(size=20)
@@ -269,7 +267,7 @@ ncalls     memory_peak    memory_total  filename:lineno(function)
      1              32              32  my code block
 ```
 
-The `profiler` support nesting:
+The `profiler` supports nesting:
 ```python
 >>> with rmm.statistics.profiler(name="outer"):
 ...     buf1 = rmm.DeviceBuffer(size=10)

From 9e9241460d591d373d67643b63018a29a99132b2 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 4 Jun 2024 17:19:48 +0200
Subject: [PATCH 42/43] doc

---
 python/rmm/docs/guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 8f0b7c8c6..40e53a449 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -196,7 +196,7 @@ RMM can profile memory usage and track memory statistics by using either of the
   - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
   - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
 
-Both methods modify the currently active RMM memory resource. A `StatisticsResourceAdaptor` is pushed onto the current RMM memory resource stack and must remain the topmost resource throughout the statistics tracking:
+Common to both usages is that they modify the currently active RMM memory resource. The current device resource is wrapped with a `StatisticsResourceAdaptor` which must remain the topmost resource throughout the statistics tracking:
 ```python
 >>> import rmm
 >>> import rmm.statistics

From 8b52c834d71bcd47bdd27c9a32a606da20a17c2f Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 6 Jun 2024 08:44:55 +0200
Subject: [PATCH 43/43] Update python/rmm/docs/guide.md

Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
---
 python/rmm/docs/guide.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 40e53a449..bfba0800b 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -188,8 +188,6 @@ allocator.
 >>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
 ```
 
-
-
 ## Memory statistics and profiling
 
 RMM can profile memory usage and track memory statistics by using either of the following: