diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp
index d072d5886..c8906afd2 100644
--- a/include/rmm/mr/device/statistics_resource_adaptor.hpp
+++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp
@@ -21,6 +21,7 @@
 #include <cstddef>
 #include <mutex>
 #include <shared_mutex>
+#include <stack>
 
 namespace rmm::mr {
 /**
@@ -36,8 +37,16 @@ namespace rmm::mr {
  * resource in order to satisfy allocation requests, but any existing
  * allocations will be untracked. Tracking statistics stores the current, peak
  * and total memory allocations for both the number of bytes and number of calls
- * to the memory resource. `statistics_resource_adaptor` is intended as a debug
- * adaptor and shouldn't be used in performance-sensitive code.
+ * to the memory resource.
+ *
+ * This resource supports nested statistics, which makes it possible to track statistics
+ * of a code block. Use `.push_counters()` to start tracking statistics on a code block
+ * and use `.pop_counters()` to stop the tracking. The nested statistics are cascading
+ * such that the statistics tracked by a code block include the statistics tracked in
+ * all its tracked sub code blocks.
+ *
+ * `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be
+ * used in performance-sensitive code.
  *
  * @tparam Upstream Type of the upstream resource used for
  * allocation/deallocation.
@@ -45,11 +54,10 @@ namespace rmm::mr {
 template <typename Upstream>
 class statistics_resource_adaptor final : public device_memory_resource {
  public:
-  // can be a std::shared_mutex once C++17 is adopted
   using read_lock_t =
-    std::shared_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize read access
+    std::shared_lock<std::shared_mutex>;  ///< Type of lock used to synchronize read access
   using write_lock_t =
-    std::unique_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize write access
+    std::unique_lock<std::shared_mutex>;  ///< Type of lock used to synchronize write access
   /**
    * @brief Utility struct for counting the current, peak, and total value of a number
    */
@@ -83,6 +91,24 @@ class statistics_resource_adaptor final : public device_memory_resource {
       value -= val;
       return *this;
     }
+
+    /**
+     * @brief Fold the counters of a tracked sub code block into this counter
+     *
+     * `val` is assumed to have tracked a code block nested inside the code block tracked by
+     * `this`. Because nested statistics are cascading, `val.peak` (relative to `val`'s own zero
+     * start) must be rebased onto `this`: while `val` was tracking, `value` stayed frozen at
+     * the level that was active when `val` started, so the rebased peak is `value + val.peak`
+     * and the new peak is `std::max(value + val.peak, peak)`. `value` and `total` simply add up.
+     *
+     * @param val Counter of the tracked sub code block
+     */
+    void add_counters_from_tracked_sub_block(const counter& val)
+    {
+      peak = std::max(value + val.peak, peak);  // must read `value` before it is updated below
+      value += val.value;
+      total += val.total;
+    }
   };
 
   /**
@@ -96,6 +122,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
   statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream}
   {
     RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
+    // Seed the stack directly: push_counters() reads top(), which is UB on an empty stack
+    counter_stack_.push(std::make_pair(counter{}, counter{}));
   }
 
   statistics_resource_adaptor()                                              = delete;
@@ -131,7 +159,7 @@ class statistics_resource_adaptor final : public device_memory_resource {
   {
     read_lock_t lock(mtx_);
 
-    return bytes_;
+    return counter_stack_.top().first;
   }
 
   /**
@@ -145,7 +173,40 @@ class statistics_resource_adaptor final : public device_memory_resource {
   {
     read_lock_t lock(mtx_);
 
-    return allocations_;
+    return counter_stack_.top().second;
+  }
+
+  /**
+   * @brief Push a pair of zero counters on the stack; the pushed pair is what
+   * `get_bytes_counter()` and `get_allocations_counter()` report from now on
+   *
+   * @return the pair of counters <bytes, allocations> that was on top of the
+   * stack _before_ the push
+   */
+  std::pair<counter, counter> push_counters()
+  {
+    write_lock_t lock(mtx_);
+    auto previous_top = counter_stack_.top();
+    counter_stack_.emplace(counter{}, counter{});
+    return previous_top;
+  }
+
+  /**
+   * @brief Pop the top pair of counters and fold it into the new top pair
+   *
+   * @throws std::out_of_range if only one pair of counters is left on the stack
+   * @return the popped pair of counters <bytes, allocations>
+   */
+  std::pair<counter, counter> pop_counters()
+  {
+    write_lock_t lock(mtx_);
+    if (counter_stack_.size() < 2) { throw std::out_of_range("cannot pop the last counter pair"); }
+    auto popped = counter_stack_.top();
+    counter_stack_.pop();
+    auto& enclosing = counter_stack_.top();  // counters of the enclosing code block
+    enclosing.first.add_counters_from_tracked_sub_block(popped.first);
+    enclosing.second.add_counters_from_tracked_sub_block(popped.second);
+    return popped;
+  }
 
  private:
@@ -171,8 +232,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
       write_lock_t lock(mtx_);
 
       // Increment the allocation_count_ while we have the lock
-      bytes_ += bytes;
-      allocations_ += 1;
+      counter_stack_.top().first += bytes;
+      counter_stack_.top().second += 1;
     }
 
     return ptr;
@@ -193,8 +254,8 @@ class statistics_resource_adaptor final : public device_memory_resource {
       write_lock_t lock(mtx_);
 
       // Decrement the current allocated counts.
-      bytes_ -= bytes;
-      allocations_ -= 1;
+      counter_stack_.top().first -= bytes;
+      counter_stack_.top().second -= 1;
     }
   }
 
@@ -213,10 +274,10 @@ class statistics_resource_adaptor final : public device_memory_resource {
     return get_upstream_resource() == cast->get_upstream_resource();
   }
 
-  counter bytes_;                        // peak, current and total allocated bytes
-  counter allocations_;                  // peak, current and total allocation count
-  std::shared_timed_mutex mutable mtx_;  // mutex for thread safe access to allocations_
-  Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
+  // Stack of counter pairs <bytes, allocations>
+  std::stack<std::pair<counter, counter>> counter_stack_;
+  std::shared_mutex mutable mtx_;  // mutex for thread safe access to counter_stack_
+  Upstream* upstream_;             // the upstream resource used for satisfying allocation requests
 };
 
 /**
diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp
index c49674849..d01d08b9c 100644
--- a/include/rmm/mr/device/tracking_resource_adaptor.hpp
+++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp
@@ -53,11 +53,10 @@ namespace rmm::mr {
 template <typename Upstream>
 class tracking_resource_adaptor final : public device_memory_resource {
  public:
-  // can be a std::shared_mutex once C++17 is adopted
   using read_lock_t =
-    std::shared_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize read access
+    std::shared_lock<std::shared_mutex>;  ///< Type of lock used to synchronize read access
   using write_lock_t =
-    std::unique_lock<std::shared_timed_mutex>;  ///< Type of lock used to synchronize write access
+    std::unique_lock<std::shared_mutex>;  ///< Type of lock used to synchronize write access
   /**
    * @brief Information stored about an allocation. Includes the size
    * and a stack trace if the `tracking_resource_adaptor` was initialized
@@ -271,7 +270,7 @@ class tracking_resource_adaptor final : public device_memory_resource {
   bool capture_stacks_;                           // whether or not to capture call stacks
   std::map<void*, allocation_info> allocations_;  // map of active allocations
   std::atomic<std::size_t> allocated_bytes_;      // number of bytes currently allocated
-  std::shared_timed_mutex mutable mtx_;           // mutex for thread safe access to allocations_
+  std::shared_mutex mutable mtx_;                 // mutex for thread safe access to allocations_
   Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
 };
 
diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md
index 968be8586..bfba0800b 100644
--- a/python/rmm/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -187,3 +187,104 @@ allocator.
 
 >>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
 ```
+
+## Memory statistics and profiling
+
+RMM can profile memory usage and track memory statistics by using either of the following:
+  - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block.
+  - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally.
+
+Common to both usages is that they modify the currently active RMM memory resource. The current device resource is wrapped with a `StatisticsResourceAdaptor` which must remain the topmost resource throughout the statistics tracking:
+```python
+>>> import rmm
+>>> import rmm.statistics
+
+>>> # We start with the default cuda memory resource
+>>> rmm.mr.get_current_device_resource()
+<rmm._lib.memory_resource.CudaMemoryResource at 0x7f7e6c0a1ce0>
+
+>>> # When using statistics, we get a StatisticsResourceAdaptor with the context
+>>> with rmm.statistics.statistics():
+...     rmm.mr.get_current_device_resource()
+<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f7e6c524900>
+
+>>> # We can also enable statistics globally
+>>> rmm.statistics.enable_statistics()
+>>> print(rmm.mr.get_current_device_resource())
+<rmm._lib.memory_resource.StatisticsResourceAdaptor at 0x7f662c2bb3c0>
+```
+
+With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource:
+```python
+>>> buf = rmm.DeviceBuffer(size=10)
+>>> rmm.statistics.get_statistics()
+Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1)
+```
+
+### Memory Profiler
+To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator:
+```python
+>>> @rmm.statistics.profiler()
+... def f(size):
+...   rmm.DeviceBuffer(size=size)
+>>> f(1000)
+
+>>> # By default, the profiler writes to rmm.statistics.default_profiler_records
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
+```
+
+To profile a code block, use `profiler` as a context manager:
+```python
+>>> with rmm.statistics.profiler(name="my code block"):
+...     rmm.DeviceBuffer(size=20)
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
+     1              32              32  my code block
+```
+
+The `profiler` supports nesting:
+```python
+>>> with rmm.statistics.profiler(name="outer"):
+...     buf1 = rmm.DeviceBuffer(size=10)
+...     with rmm.statistics.profiler(name="inner"):
+...         buf2 = rmm.DeviceBuffer(size=10)
+>>> print(rmm.statistics.default_profiler_records.report())
+Memory Profiling
+================
+
+Legends:
+  ncalls       - number of times the function or code block was called
+  memory_peak  - peak memory allocated in function or code block (in bytes)
+  memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls     memory_peak    memory_total  filename:lineno(function)
+     1           1,008           1,008  <ipython-input-11-5fc63161ac29>:1(f)
+     1              32              32  my code block
+     1              32              32  outer
+     1              16              16  inner
+```
diff --git a/python/rmm/docs/python_api.rst b/python/rmm/docs/python_api.rst
index b229d8214..a62304d21 100644
--- a/python/rmm/docs/python_api.rst
+++ b/python/rmm/docs/python_api.rst
@@ -37,3 +37,12 @@ Memory Allocators
    :members:
    :undoc-members:
    :show-inheritance:
+
+Memory Statistics
+-----------------
+
+.. automodule:: rmm.statistics
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 992203c27..e181bff4c 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -32,9 +32,7 @@ from libcpp.string cimport string
 from cuda.cudart import cudaError_t
 
 from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
-
 from rmm._cuda.stream cimport Stream
-
 from rmm._cuda.stream import DEFAULT_STREAM
 
 from rmm._lib.cuda_stream_view cimport cuda_stream_view
@@ -46,6 +44,7 @@ from rmm._lib.per_device_resource cimport (
     cuda_device_id,
     set_per_device_resource as cpp_set_per_device_resource,
 )
+from rmm.statistics import Statistics
 
 # Transparent handle of a C++ exception
 ctypedef pair[int, string] CppExcept
@@ -182,8 +181,7 @@ cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \
 
 cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
         namespace "rmm::mr" nogil:
-    cdef cppclass statistics_resource_adaptor[Upstream](
-            device_memory_resource):
+    cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource):
         struct counter:
             counter()
 
@@ -191,11 +189,12 @@ cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
             int64_t peak
             int64_t total
 
-        statistics_resource_adaptor(
-            Upstream* upstream_mr) except +
+        statistics_resource_adaptor(Upstream* upstream_mr) except +
 
         counter get_bytes_counter() except +
         counter get_allocations_counter() except +
+        pair[counter, counter] pop_counters() except +
+        pair[counter, counter] push_counters() except +
 
 cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \
         namespace "rmm::mr" nogil:
@@ -798,6 +797,9 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         allocations/deallocations performed by an upstream memory resource.
         Includes the ability to query these statistics at any time.
 
+        A stack of counters is maintained. Use :meth:`push_counters` and
+        :meth:`pop_counters` to track statistics at different nesting levels.
+
         Parameters
         ----------
         upstream : DeviceMemoryResource
@@ -806,7 +808,7 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         pass
 
     @property
-    def allocation_counts(self) -> dict:
+    def allocation_counts(self) -> Statistics:
         """
         Gets the current, peak, and total allocated bytes and number of
         allocations.
@@ -817,20 +819,62 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
         Returns:
             dict: Dictionary containing allocation counts and bytes.
         """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
+
+        counts = deref(mr).get_allocations_counter()
+        byte_counts = deref(mr).get_bytes_counter()
+        return Statistics(
+            current_bytes=byte_counts.value,
+            current_count=counts.value,
+            peak_bytes=byte_counts.peak,
+            peak_count=counts.peak,
+            total_bytes=byte_counts.total,
+            total_count=counts.total,
+        )
+
+    def pop_counters(self) -> Statistics:
+        """
+        Pop the top counter pair; raises IndexError if it is the only pair left
 
-        counts = (<statistics_resource_adaptor[device_memory_resource]*>(
-            self.c_obj.get()))[0].get_allocations_counter()
-        byte_counts = (<statistics_resource_adaptor[device_memory_resource]*>(
-            self.c_obj.get()))[0].get_bytes_counter()
-
-        return {
-            "current_bytes": byte_counts.value,
-            "current_count": counts.value,
-            "peak_bytes": byte_counts.peak,
-            "peak_count": counts.peak,
-            "total_bytes": byte_counts.total,
-            "total_count": counts.total,
-        }
+        Returns
+        -------
+        The popped statistics (bytes and allocations counters)
+        """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).pop_counters()
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
+
+    def push_counters(self) -> Statistics:
+        """
+        Push a new zeroed counter pair (bytes and allocations) on the stack
+
+        Returns
+        -------
+        The statistics that were on top of the stack _before_ the push
+        """
+
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*> self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).push_counters()
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
 
 cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
 
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
new file mode 100644
index 000000000..279e45dc6
--- /dev/null
+++ b/python/rmm/rmm/statistics.py
@@ -0,0 +1,350 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import threading
+from collections import defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass
+from functools import wraps
+from typing import Dict, Literal, Optional
+
+import rmm.mr
+
+
+@dataclass
+class Statistics:
+    """Statistics returned by ``{get,push,pop}_statistics()``.
+
+    Attributes
+    ----------
+    current_bytes
+        Number of bytes currently allocated
+    current_count
+        Number of allocations currently outstanding
+    peak_bytes
+        Peak number of bytes allocated at any one time
+    peak_count
+        Peak number of allocations outstanding at any one time
+    total_bytes
+        Total number of bytes allocated, including those already freed
+    total_count
+        Total number of allocations made, including those already freed
+    """
+
+    current_bytes: int
+    current_count: int
+    peak_bytes: int
+    peak_count: int
+    total_bytes: int
+    total_count: int
+
+
+def enable_statistics() -> None:
+    """Enable allocation statistics.
+
+    This function is idempotent. If statistics have been enabled for the
+    current RMM resource stack, this is a no-op.
+
+    Warnings
+    --------
+    This modifies the current RMM memory resource. StatisticsResourceAdaptor
+    is pushed onto the current RMM memory resource stack and must remain the
+    topmost resource throughout the statistics gathering.
+    """
+
+    mr = rmm.mr.get_current_device_resource()
+    if not isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        rmm.mr.set_current_device_resource(
+            rmm.mr.StatisticsResourceAdaptor(mr)
+        )
+
+
+def get_statistics() -> Optional[Statistics]:
+    """Get the current allocation statistics.
+
+    Return
+    ------
+    If enabled, returns the current tracked statistics.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.allocation_counts
+    return None
+
+
+def push_statistics() -> Optional[Statistics]:
+    """Push new counters on the current allocation statistics stack.
+
+    This returns the current tracked statistics and pushes a new set
+    of zero counters on the stack of statistics.
+
+    If statistics are disabled (the current memory resource is not an
+    instance of StatisticsResourceAdaptor), this function is a no-op.
+
+    Return
+    ------
+    If enabled, returns the current tracked statistics _before_ the pop.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.push_counters()
+    return None
+
+
+def pop_statistics() -> Optional[Statistics]:
+    """Pop the counters of the current allocation statistics stack.
+
+    This returns the counters of current tracked statistics and pops
+    them from the stack.
+
+    If statistics are disabled (the current memory resource is not an
+    instance of StatisticsResourceAdaptor), this function is a no-op.
+
+    Return
+    ------
+    If enabled, returns the popped counters.
+    If disabled, returns None.
+    """
+    mr = rmm.mr.get_current_device_resource()
+    if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
+        return mr.pop_counters()
+    return None
+
+
+@contextmanager
+def statistics():
+    """Context to enable allocation statistics.
+
+    If statistics have been enabled already (the current memory resource is an
+    instance of StatisticsResourceAdaptor), new counters are pushed on the
+    current allocation statistics stack when entering the context and popped
+    again when exiting using `push_statistics()` and `pop_statistics()`.
+
+    If statistics have not been enabled, a new StatisticsResourceAdaptor is set
+    as the current RMM memory resource when entering the context and removed
+    again when exiting.
+
+    Raises
+    ------
+    ValueError
+        If the current RMM memory resource was changed while in the context.
+    """
+
+    prior_non_stats_mr = None
+    if push_statistics() is None:  # None: statistics were not enabled
+        # Save the current non-statistics memory resource for later cleanup
+        prior_non_stats_mr = rmm.mr.get_current_device_resource()
+        enable_statistics()
+
+    try:
+        current_mr = rmm.mr.get_current_device_resource()
+        yield
+    finally:
+        if current_mr is not rmm.mr.get_current_device_resource():
+            raise ValueError(
+                "RMM memory source stack was changed "
+                "while in the statistics context"
+            )
+        if prior_non_stats_mr is None:
+            pop_statistics()  # undo the push made on entry
+        else:
+            rmm.mr.set_current_device_resource(prior_non_stats_mr)
+
+
+class ProfilerRecords:
+    """Records of the memory statistics recorded by a profiler."""
+
+    @dataclass
+    class MemoryRecord:
+        """Memory statistics of a single code block.
+
+        Attributes
+        ----------
+        num_calls
+            Number of times this code block was invoked.
+        memory_total
+            Total number of bytes allocated.
+        memory_peak
+            Peak number of bytes allocated.
+        """
+
+        num_calls: int = 0
+        memory_total: int = 0
+        memory_peak: int = 0
+
+        def add(self, memory_total: int, memory_peak: int):
+            self.num_calls += 1
+            self.memory_total += memory_total
+            self.memory_peak = max(self.memory_peak, memory_peak)
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._records: Dict[str, ProfilerRecords.MemoryRecord] = defaultdict(
+            ProfilerRecords.MemoryRecord
+        )
+
+    def add(self, name: str, data: Statistics) -> None:
+        """Add memory statistics to the record named `name`.
+
+        This method is thread-safe.
+
+        Parameters
+        ----------
+        name
+            Name of the record.
+        data
+            Memory statistics of `name`.
+        """
+        with self._lock:
+            self._records[name].add(
+                memory_total=data.total_bytes, memory_peak=data.peak_bytes
+            )
+
+    @property
+    def records(self) -> Dict[str, MemoryRecord]:
+        """Dictionary mapping record names to their memory statistics."""
+        return dict(self._records)
+
+    def report(
+        self,
+        ordered_by: Literal[
+            "num_calls", "memory_peak", "memory_total"
+        ] = "memory_peak",
+    ) -> str:
+        """Pretty format the recorded memory statistics.
+
+        Parameters
+        ----------
+        ordered_by
+            Sort the statistics by this attribute.
+
+        Return
+        ------
+        The pretty formatted string of the memory statistics
+        """
+
+        # Sort by `ordered_by`
+        records = sorted(
+            ((name, data) for name, data in self.records.items()),
+            key=lambda x: getattr(x[1], ordered_by),
+            reverse=True,
+        )
+        ret = "Memory Profiling\n"
+        ret += "================\n\n"
+        if len(records) == 0:
+            return ret + "No data, maybe profiling wasn't enabled?"
+        ret += (
+            "Legends:\n"
+            "  ncalls       - number of times the function or code block "
+            "was called\n"
+            "  memory_peak  - peak memory allocated in function or code "
+            "block (in bytes)\n"
+            "  memory_total - total memory allocated in function or code "
+            "block (in bytes)\n"
+        )
+        ret += f"\nOrdered by: {ordered_by}\n"
+        ret += "\nncalls     memory_peak    memory_total  "
+        ret += "filename:lineno(function)\n"
+        for name, data in records:
+            ret += f"{data.num_calls:6,d} {data.memory_peak:15,d} "
+            ret += f"{data.memory_total:15,d}  {name}\n"
+        return ret[:-1]  # Remove the final newline
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.records})"
+
+    def __str__(self) -> str:
+        return self.report()
+
+
+def _get_descriptive_name_of_object(obj: object) -> str:
+    """Get descriptive name of object.
+
+    Parameters
+    ----------
+    obj
+        Object in question
+
+    Return
+    ------
+    A string including filename, line number, and object name.
+    """
+
+    obj = inspect.unwrap(obj)
+    _, linenumber = inspect.getsourcelines(obj)
+    filepath = inspect.getfile(obj)
+    return f"{filepath}:{linenumber}({obj.__qualname__})"
+
+
+default_profiler_records = ProfilerRecords()
+
+
+def profiler(
+    *,
+    records: ProfilerRecords = default_profiler_records,
+    name: str = "",
+):
+    """Decorator and context to profile function or code block.
+
+    If statistics are enabled (the current memory resource is an
+    instance of StatisticsResourceAdaptor), this decorator records the
+    memory statistics of the decorated function or code block.
+
+    If statistics are disabled, this decorator/context is a no-op.
+
+    Parameters
+    ----------
+    records
+        The profiler records that the memory statistics are written to. If
+        not set, a default profiler records are used.
+    name
+        The name of the memory profile, mandatory when the profiler
+        is used as a context manager. If used as a decorator, an empty name
+        is allowed. In this case, the name is the filename, line number, and
+        function name.
+    """
+
+    class ProfilerContext:
+        def __call__(self, func: callable) -> callable:
+            _name = name or _get_descriptive_name_of_object(func)
+
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                push_statistics()
+                try:
+                    return func(*args, **kwargs)
+                finally:
+                    if (stats := pop_statistics()) is not None:
+                        records.add(name=_name, data=stats)
+
+            return wrapper
+
+        def __enter__(self):
+            if not name:
+                raise ValueError(
+                    "When profiler is used as a context manager, "
+                    "a name must be provided"
+                )
+            push_statistics()
+            return self
+
+        def __exit__(self, *exc):
+            if (stats := pop_statistics()) is not None:
+                records.add(name=name, data=stats)
+            return False
+
+    return ProfilerContext()
diff --git a/python/rmm/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py
index 5fad81c79..b6debd9a2 100644
--- a/python/rmm/rmm/tests/conftest.py
+++ b/python/rmm/rmm/tests/conftest.py
@@ -1,6 +1,21 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import pytest
 
 import rmm
+import rmm.statistics
 
 
 @pytest.fixture(scope="function", autouse=True)
@@ -16,6 +31,6 @@ def rmm_auto_reinitialize():
 
 @pytest.fixture
 def stats_mr():
-    mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
-    return mr
+    """Yield the StatisticsResourceAdaptor that is current during the test"""
+    with rmm.statistics.statistics():
+        yield rmm.mr.get_current_device_resource()
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index 62adcd4a5..53edd96d2 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -649,68 +649,6 @@ def test_limiting_resource_adaptor(mr):
         rmm.DeviceBuffer(size=1)
 
 
-def test_statistics_resource_adaptor(stats_mr):
-
-    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
-
-    for i in range(9, 0, -2):
-        del buffers[i]
-
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 5040,
-        "current_count": 5,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 10080,
-        "total_count": 10,
-    }
-
-    # Push a new Tracking adaptor
-    mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
-    rmm.mr.set_current_device_resource(mr2)
-
-    for _ in range(2):
-        buffers.append(rmm.DeviceBuffer(size=1000))
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 2016,
-        "current_count": 2,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 7056,
-        "current_count": 7,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-
-    del buffers
-    gc.collect()
-
-    assert mr2.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 2016,
-        "peak_count": 2,
-        "total_bytes": 2016,
-        "total_count": 2,
-    }
-    assert stats_mr.allocation_counts == {
-        "current_bytes": 0,
-        "current_count": 0,
-        "peak_bytes": 10080,
-        "peak_count": 10,
-        "total_bytes": 12096,
-        "total_count": 12,
-    }
-    gc.collect()
-
-
 def test_tracking_resource_adaptor():
     cuda_mr = rmm.mr.CudaMemoryResource()
 
diff --git a/python/rmm/rmm/tests/test_rmm_pytorch.py b/python/rmm/rmm/tests/test_rmm_pytorch.py
index 065507b61..2c9a4af23 100644
--- a/python/rmm/rmm/tests/test_rmm_pytorch.py
+++ b/python/rmm/rmm/tests/test_rmm_pytorch.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import gc
 
 import pytest
@@ -17,21 +31,21 @@ def torch_allocator():
 
 
 def test_rmm_torch_allocator(torch_allocator, stats_mr):
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
     x = torch.tensor([1, 2]).cuda()
-    assert stats_mr.allocation_counts["current_bytes"] > 0
+    assert stats_mr.allocation_counts.current_bytes > 0  # seen by stats_mr
     del x
     gc.collect()
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0  # freed again
 
 
 def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr):
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0
     s = torch.cuda.Stream()
     with torch.cuda.stream(s):
         x = torch.tensor([1, 2]).cuda()
     torch.cuda.current_stream().wait_stream(s)
-    assert stats_mr.allocation_counts["current_bytes"] > 0
+    assert stats_mr.allocation_counts.current_bytes > 0  # seen by stats_mr
     del x
     gc.collect()
-    assert stats_mr.allocation_counts["current_bytes"] == 0
+    assert stats_mr.allocation_counts.current_bytes == 0  # freed again
diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py
new file mode 100644
index 000000000..7ba09a92f
--- /dev/null
+++ b/python/rmm/rmm/tests/test_statistics.py
@@ -0,0 +1,336 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import rmm.mr
+from rmm.statistics import (
+    ProfilerRecords,
+    _get_descriptive_name_of_object,
+    default_profiler_records,
+    get_statistics,
+    pop_statistics,
+    profiler,
+    push_statistics,
+    statistics,
+)
+
+
+def test_context():
+    mr0 = rmm.mr.get_current_device_resource()  # resource before statistics
+    assert get_statistics() is None  # statistics are not enabled yet
+    with statistics():
+        mr1 = rmm.mr.get_current_device_resource()
+        assert isinstance(
+            rmm.mr.get_current_device_resource(),
+            rmm.mr.StatisticsResourceAdaptor,
+        )
+        b1 = rmm.DeviceBuffer(size=20)  # counted as 32 bytes below
+        stats = get_statistics()
+        assert stats.current_bytes == 32
+        assert stats.current_count == 1
+        assert stats.peak_bytes == 32
+        assert stats.peak_count == 1
+        assert stats.total_bytes == 32
+        assert stats.total_count == 1
+
+        with statistics():  # nested context keeps the same resource
+            mr2 = rmm.mr.get_current_device_resource()
+            assert mr1 is mr2
+            b2 = rmm.DeviceBuffer(size=10)  # counted as 16 bytes below
+            stats = get_statistics()  # inner counters only cover b2
+            assert stats.current_bytes == 16
+            assert stats.current_count == 1
+            assert stats.peak_bytes == 16
+            assert stats.peak_count == 1
+            assert stats.total_bytes == 16
+            assert stats.total_count == 1
+
+        stats = get_statistics()  # outer counters cover b1 and b2
+        assert stats.current_bytes == 48
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 48
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 48
+        assert stats.total_count == 2
+
+        del b1
+        del b2
+    assert rmm.mr.get_current_device_resource() is mr0  # resource restored
+
+
+def test_multiple_mr(stats_mr):
+    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
+
+    for i in range(9, 0, -2):  # drop indices 9, 7, 5, 3 and 1
+        del buffers[i]
+
+    stats = stats_mr.allocation_counts  # 5 of the 10 buffers remain
+    assert stats.current_bytes == 5040
+    assert stats.current_count == 5
+    assert stats.peak_bytes == 10080
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 10080
+    assert stats.total_count == 10
+
+    # Stack a second statistics adaptor on top of stats_mr
+    mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
+    rmm.mr.set_current_device_resource(mr2)
+    try:
+        for _ in range(2):
+            buffers.append(rmm.DeviceBuffer(size=1000))
+
+        stats = mr2.allocation_counts  # mr2 only saw the 2 new buffers
+        assert stats.current_bytes == 2016
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts  # upstream saw all 12 buffers
+        assert stats.current_bytes == 7056
+        assert stats.current_count == 7
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
+        del buffers  # releases the 7 buffers that are still alive
+        stats = mr2.allocation_counts
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
+    finally:
+        rmm.mr.set_current_device_resource(stats_mr)  # undo the mr2 push
+
+
+def test_counter_stack(stats_mr):
+    buffers = [rmm.DeviceBuffer(size=10) for _ in range(10)]  # 16 bytes each
+
+    # push returns the stats from the top before the push
+    stats = stats_mr.push_counters()  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 160
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 160
+    assert stats.total_count == 10
+
+    b1 = rmm.DeviceBuffer(size=10)  # allocated at stack level 1
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    del b1  # freed while stack level 2 is on top
+
+    # pop returns the popped stats
+    # Note, current bytes and counts can be negative (b1 predates the level)
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
+    b1 = rmm.DeviceBuffer(size=10)  # stack level 1 again
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 32
+    assert stats.total_count == 2
+
+    b2 = rmm.DeviceBuffer(size=10)  # allocated at stack level 2
+
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    stats = stats_mr.pop_counters()  # stats from stack level 1
+    assert stats.current_bytes == 32
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 32
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 3
+
+    del b1
+    del b2
+
+    stats = stats_mr.allocation_counts  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 192
+    assert stats.peak_count == 12
+    assert stats.total_bytes == 208
+    assert stats.total_count == 13
+
+    del buffers
+    with pytest.raises(IndexError, match="cannot pop the last counter pair"):
+        stats_mr.pop_counters()  # only stack level 0 is left
+
+
+def test_current_statistics(stats_mr):
+    b1 = rmm.DeviceBuffer(size=10)  # counted as 16 bytes below
+    stats = get_statistics()
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    b2 = rmm.DeviceBuffer(size=20)  # counted as 32 bytes below
+    stats = push_statistics()  # returns the counters from before the push
+    assert stats.current_bytes == 48
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
+
+    del b1  # b1 predates the push, so the current values go negative
+    stats = pop_statistics()  # returns the counters that were popped
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
+    del b2
+    stats = get_statistics()  # back on the counters from before the push
+    assert stats.current_bytes == 0
+    assert stats.current_count == 0
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
+
+
+def test_statistics_disabled():
+    assert get_statistics() is None  # the stats_mr fixture is not requested
+    assert push_statistics() is None  # push has nothing to return
+    assert get_statistics() is None  # still None after the push
+
+
+def test_profiler(stats_mr):
+    profiler_records = ProfilerRecords()  # separate from the default records
+    assert len(profiler_records.records) == 0
+    assert "No data" in profiler_records.report()
+
+    @profiler(records=profiler_records)
+    def f1():
+        b1 = rmm.DeviceBuffer(size=10)
+        b2 = rmm.DeviceBuffer(size=10)
+        del b1
+        return b2
+
+    b1 = f1()
+    b2 = f1()
+
+    @profiler(records=profiler_records)
+    def f2():
+        b1 = rmm.DeviceBuffer(size=10)
+
+        @profiler(records=profiler_records, name="g2")  # explicit record name
+        def g2(b1):
+            b2 = rmm.DeviceBuffer(size=10)
+            del b1  # only drops g2's own reference; f2 still holds b1
+            return b2
+
+        return g2(b1)
+
+    f2()
+    f2()
+    del b1  # buffers returned by the two f1() calls
+    del b2
+    f2()
+
+    @profiler(records=profiler_records)
+    def f3():
+        return [rmm.DeviceBuffer(size=100) for _ in range(100)]
+
+    f3()
+
+    records = profiler_records.records
+    assert records[
+        _get_descriptive_name_of_object(f1)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=2, memory_total=64, memory_peak=32
+    )
+    assert records[
+        _get_descriptive_name_of_object(f2)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=3, memory_total=96, memory_peak=32
+    )
+    assert records["g2"] == ProfilerRecords.MemoryRecord(
+        num_calls=3, memory_total=48, memory_peak=16
+    )
+    assert records[
+        _get_descriptive_name_of_object(f3)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=11200, memory_peak=11200
+    )
+
+    @profiler()  # use the default profiler records
+    def f4():
+        return [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    f4()
+
+    with profiler(name="b1 and b2"):  # use the profiler as a context manager
+        b1 = rmm.DeviceBuffer(size=100)
+        b2 = rmm.DeviceBuffer(size=100)
+        with profiler(name="del b1 and b2"):  # nested block only frees
+            del b1
+            del b2
+
+    records = default_profiler_records.records  # records= was omitted above
+    assert records[
+        _get_descriptive_name_of_object(f4)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=160, memory_peak=160
+    )
+    assert records["b1 and b2"] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=224, memory_peak=224
+    )
+    assert records["del b1 and b2"] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=0, memory_peak=0
+    )
diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp
index 8fd12f49b..6c5700f0b 100644
--- a/tests/mr/device/statistics_mr_tests.cpp
+++ b/tests/mr/device/statistics_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,5 +234,47 @@ TEST(StatisticsTest, NegativeInnerTracking)
   EXPECT_EQ(inner_mr.get_allocations_counter().total, 5);
 }
 
+TEST(StatisticsTest, Nested)
+{
+  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  void* a0 = mr.allocate(ten_MiB);  // recorded by the initial counter pair
+  EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB);
+  EXPECT_EQ(mr.get_allocations_counter().value, 1);
+  {
+    auto [bytes, allocs] = mr.push_counters();  // yields the counters from before the push
+    EXPECT_EQ(bytes.value, ten_MiB);
+    EXPECT_EQ(allocs.value, 1);
+  }
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);  // the pushed counters start from zero
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a1 = mr.allocate(ten_MiB);
+  mr.push_counters();  // second nested level
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a2 = mr.allocate(ten_MiB);
+  mr.deallocate(a2, ten_MiB);  // .value drops back to 0 but .peak keeps ten_MiB
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_bytes_counter().peak, ten_MiB);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().peak, 1);
+  {
+    auto [bytes, allocs] = mr.pop_counters();  // yields the counters that were just popped
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB);
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 1);
+  }
+  mr.deallocate(a0, ten_MiB);  // a0 predates this level, so it offsets a1 in .value
+  {
+    auto [bytes, allocs] = mr.pop_counters();
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB * 2);  // a1 plus the nested a2
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 2);
+  }
+  mr.deallocate(a1, ten_MiB);
+  EXPECT_THROW(mr.pop_counters(), std::out_of_range);  // only the last counter pair is left
+}
+
 }  // namespace
 }  // namespace rmm::test