diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index d072d5886..c8906afd2 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace rmm::mr { /** @@ -36,8 +37,16 @@ namespace rmm::mr { * resource in order to satisfy allocation requests, but any existing * allocations will be untracked. Tracking statistics stores the current, peak * and total memory allocations for both the number of bytes and number of calls - * to the memory resource. `statistics_resource_adaptor` is intended as a debug - * adaptor and shouldn't be used in performance-sensitive code. + * to the memory resource. + * + * This resource supports nested statistics, which makes it possible to track statistics + * of a code block. Use `.push_counters()` to start tracking statistics on a code block + * and use `.pop_counters()` to stop the tracking. The nested statistics are cascading + * such that the statistics tracked by a code block include the statistics tracked in + * all its tracked sub code blocks. + * + * `statistics_resource_adaptor` is intended as a debug adaptor and shouldn't be + * used in performance-sensitive code. * * @tparam Upstream Type of the upstream resource used for * allocation/deallocation. 
@@ -45,11 +54,10 @@ namespace rmm::mr { template class statistics_resource_adaptor final : public device_memory_resource { public: - // can be a std::shared_mutex once C++17 is adopted using read_lock_t = - std::shared_lock; ///< Type of lock used to synchronize read access + std::shared_lock; ///< Type of lock used to synchronize read access using write_lock_t = - std::unique_lock; ///< Type of lock used to synchronize write access + std::unique_lock; ///< Type of lock used to synchronize write access /** * @brief Utility struct for counting the current, peak, and total value of a number */ @@ -83,6 +91,24 @@ class statistics_resource_adaptor final : public device_memory_resource { value -= val; return *this; } + + /** + * @brief Add `val` to the current value and update the peak value if necessary + * + * When updating the peak value, we assume that `val` is tracking a code block inside the + * code block tracked by `this`. Because nested statistics are cascading, we have to convert + * `val.peak` to the peak it would have been if it was part of the statistics tracked by `this`. + * We do this by adding the current value that was active when `val` started tracking such that + * we get `std::max(value + val.peak, peak)`. 
+ * + * @param val Value to add + */ + void add_counters_from_tracked_sub_block(const counter& val) + { + peak = std::max(value + val.peak, peak); + value += val.value; + total += val.total; + } }; /** @@ -96,6 +122,8 @@ class statistics_resource_adaptor final : public device_memory_resource { statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream} { RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + // Seed the stack directly: push_counters() reads top(), which is undefined on an empty stack + counter_stack_.push(std::make_pair(counter{}, counter{})); } statistics_resource_adaptor() = delete; @@ -131,7 +159,7 @@ class statistics_resource_adaptor final : public device_memory_resource { { read_lock_t lock(mtx_); - return bytes_; + return counter_stack_.top().first; } /** @@ -145,7 +173,40 @@ class statistics_resource_adaptor final : public device_memory_resource { { read_lock_t lock(mtx_); - return allocations_; + return counter_stack_.top().second; + } + + /** + * @brief Push a pair of zero counters on the stack, which becomes the new + * counters returned by `get_bytes_counter()` and `get_allocations_counter()` + * + * @return top pair of counters from the stack _before_ + * the push + */ + std::pair push_counters() + { + write_lock_t lock(mtx_); + auto ret = counter_stack_.top(); + counter_stack_.push(std::make_pair(counter{}, counter{})); + return ret; + } + + /** + * @brief Pop a pair of counters from the stack + * + * @return top pair of counters from the stack _before_ + * the pop + */ + std::pair pop_counters() + { + write_lock_t lock(mtx_); + if (counter_stack_.size() < 2) { throw std::out_of_range("cannot pop the last counter pair"); } + auto ret = counter_stack_.top(); + counter_stack_.pop(); + // Update the new top pair of counters + counter_stack_.top().first.add_counters_from_tracked_sub_block(ret.first); + counter_stack_.top().second.add_counters_from_tracked_sub_block(ret.second); + return ret; } private: @@ -171,8 +232,8 @@ class statistics_resource_adaptor final : public
device_memory_resource { write_lock_t lock(mtx_); // Increment the allocation_count_ while we have the lock - bytes_ += bytes; - allocations_ += 1; + counter_stack_.top().first += bytes; + counter_stack_.top().second += 1; } return ptr; @@ -193,8 +254,8 @@ class statistics_resource_adaptor final : public device_memory_resource { write_lock_t lock(mtx_); // Decrement the current allocated counts. - bytes_ -= bytes; - allocations_ -= 1; + counter_stack_.top().first -= bytes; + counter_stack_.top().second -= 1; } } @@ -213,10 +274,10 @@ class statistics_resource_adaptor final : public device_memory_resource { return get_upstream_resource() == cast->get_upstream_resource(); } - counter bytes_; // peak, current and total allocated bytes - counter allocations_; // peak, current and total allocation count - std::shared_timed_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // Stack of counter pairs (first: bytes, second: allocations) + std::stack> counter_stack_; + std::shared_mutex mutable mtx_; // mutex for thread safe access to counter_stack_ + Upstream* upstream_; // the upstream resource used for satisfying allocation requests }; /** diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index c49674849..d01d08b9c 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -53,11 +53,10 @@ namespace rmm::mr { template class tracking_resource_adaptor final : public device_memory_resource { public: - // can be a std::shared_mutex once C++17 is adopted using read_lock_t = - std::shared_lock; ///< Type of lock used to synchronize read access + std::shared_lock; ///< Type of lock used to synchronize read access using write_lock_t = - std::unique_lock; ///< Type of lock used to synchronize write access + std::unique_lock; ///< Type of lock used to synchronize write access /**
* @brief Information stored about an allocation. Includes the size * and a stack trace if the `tracking_resource_adaptor` was initialized @@ -271,7 +270,7 @@ class tracking_resource_adaptor final : public device_memory_resource { bool capture_stacks_; // whether or not to capture call stacks std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated - std::shared_timed_mutex mutable mtx_; // mutex for thread safe access to allocations_ + std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ Upstream* upstream_; // the upstream resource used for satisfying allocation requests }; diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 968be8586..bfba0800b 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -187,3 +187,104 @@ allocator. >>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator) ``` + +## Memory statistics and profiling + +RMM can profile memory usage and track memory statistics by using either of the following: + - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block. + - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally. + +Common to both usages is that they modify the currently active RMM memory resource. The current device resource is wrapped with a `StatisticsResourceAdaptor` which must remain the topmost resource throughout the statistics tracking: +```python +>>> import rmm +>>> import rmm.statistics + +>>> # We start with the default cuda memory resource +>>> rmm.mr.get_current_device_resource() + + +>>> # When using statistics, we get a StatisticsResourceAdaptor with the context +>>> with rmm.statistics.statistics(): +... 
rmm.mr.get_current_device_resource() + + +>>> # We can also enable statistics globally +>>> rmm.statistics.enable_statistics() +>>> print(rmm.mr.get_current_device_resource()) + +``` + +With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: +```python +>>> buf = rmm.DeviceBuffer(size=10) +>>> rmm.statistics.get_statistics() +Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1) +``` + +### Memory Profiler +To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator: +```python +>>> @rmm.statistics.profiler() +... def f(size): +... rmm.DeviceBuffer(size=size) +>>> f(1000) + +>>> # By default, the profiler writes to rmm.statistics.default_profiler_records +>>> print(rmm.statistics.default_profiler_records.report()) +Memory Profiling +================ + +Legends: + ncalls - number of times the function or code block was called + memory_peak - peak memory allocated in function or code block (in bytes) + memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 1,008 1,008 :1(f) +``` + +To profile a code block, use `profiler` as a context manager: +```python +>>> with rmm.statistics.profiler(name="my code block"): +...
rmm.DeviceBuffer(size=20) +>>> print(rmm.statistics.default_profiler_records.report()) +Memory Profiling +================ + +Legends: + ncalls - number of times the function or code block was called + memory_peak - peak memory allocated in function or code block (in bytes) + memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 1,008 1,008 :1(f) + 1 32 32 my code block +``` + +The `profiler` supports nesting: +```python +>>> with rmm.statistics.profiler(name="outer"): +... buf1 = rmm.DeviceBuffer(size=10) +... with rmm.statistics.profiler(name="inner"): +... buf2 = rmm.DeviceBuffer(size=10) +>>> print(rmm.statistics.default_profiler_records.report()) +Memory Profiling +================ + +Legends: + ncalls - number of times the function or code block was called + memory_peak - peak memory allocated in function or code block (in bytes) + memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 1,008 1,008 :1(f) + 1 32 32 my code block + 1 32 32 outer + 1 16 16 inner +``` diff --git a/python/rmm/docs/python_api.rst b/python/rmm/docs/python_api.rst index b229d8214..a62304d21 100644 --- a/python/rmm/docs/python_api.rst +++ b/python/rmm/docs/python_api.rst @@ -37,3 +37,12 @@ Memory Allocators :members: :undoc-members: :show-inheritance: + +Memory Statistics +----------------- + +.. 
automodule:: rmm.statistics + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 992203c27..e181bff4c 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -32,9 +32,7 @@ from libcpp.string cimport string from cuda.cudart import cudaError_t from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice - from rmm._cuda.stream cimport Stream - from rmm._cuda.stream import DEFAULT_STREAM from rmm._lib.cuda_stream_view cimport cuda_stream_view @@ -46,6 +44,7 @@ from rmm._lib.per_device_resource cimport ( cuda_device_id, set_per_device_resource as cpp_set_per_device_resource, ) +from rmm.statistics import Statistics # Transparent handle of a C++ exception ctypedef pair[int, string] CppExcept @@ -182,8 +181,7 @@ cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: - cdef cppclass statistics_resource_adaptor[Upstream]( - device_memory_resource): + cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): struct counter: counter() @@ -191,11 +189,12 @@ cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ int64_t peak int64_t total - statistics_resource_adaptor( - Upstream* upstream_mr) except + + statistics_resource_adaptor(Upstream* upstream_mr) except + counter get_bytes_counter() except + counter get_allocations_counter() except + + pair[counter, counter] pop_counters() except + + pair[counter, counter] push_counters() except + cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: @@ -798,6 +797,9 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): allocations/deallocations performed by an upstream memory resource. Includes the ability to query these statistics at any time. + A stack of counters is maintained. 
Use :meth:`push_counters` and + :meth:`pop_counters` to track statistics at different nesting levels. + Parameters ---------- upstream : DeviceMemoryResource @@ -806,7 +808,7 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): pass @property - def allocation_counts(self) -> dict: + def allocation_counts(self) -> Statistics: """ Gets the current, peak, and total allocated bytes and number of allocations. @@ -817,20 +819,62 @@ cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): Returns: dict: Dictionary containing allocation counts and bytes. """ + cdef statistics_resource_adaptor[device_memory_resource]* mr = \ + self.c_obj.get() + + counts = deref(mr).get_allocations_counter() + byte_counts = deref(mr).get_bytes_counter() + return Statistics( + current_bytes=byte_counts.value, + current_count=counts.value, + peak_bytes=byte_counts.peak, + peak_count=counts.peak, + total_bytes=byte_counts.total, + total_count=counts.total, + ) + + def pop_counters(self) -> Statistics: + """ + Pop a counter pair (bytes and allocations) from the stack - counts = (( - self.c_obj.get()))[0].get_allocations_counter() - byte_counts = (( - self.c_obj.get()))[0].get_bytes_counter() - - return { - "current_bytes": byte_counts.value, - "current_count": counts.value, - "peak_bytes": byte_counts.peak, - "peak_count": counts.peak, - "total_bytes": byte_counts.total, - "total_count": counts.total, - } + Returns + ------- + The popped statistics + """ + cdef statistics_resource_adaptor[device_memory_resource]* mr = \ + self.c_obj.get() + + bytes_and_allocs = deref(mr).pop_counters() + return Statistics( + current_bytes=bytes_and_allocs.first.value, + current_count=bytes_and_allocs.second.value, + peak_bytes=bytes_and_allocs.first.peak, + peak_count=bytes_and_allocs.second.peak, + total_bytes=bytes_and_allocs.first.total, + total_count=bytes_and_allocs.second.total, + ) + + def push_counters(self) -> Statistics: + """ + Push a new counter pair (bytes and allocations) on the 
stack + + Returns + ------- + The statistics _before_ the push + """ + + cdef statistics_resource_adaptor[device_memory_resource]* mr = \ + self.c_obj.get() + + bytes_and_allocs = deref(mr).push_counters() + return Statistics( + current_bytes=bytes_and_allocs.first.value, + current_count=bytes_and_allocs.second.value, + peak_bytes=bytes_and_allocs.first.peak, + peak_count=bytes_and_allocs.second.peak, + total_bytes=bytes_and_allocs.first.total, + total_count=bytes_and_allocs.second.total, + ) cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py new file mode 100644 index 000000000..279e45dc6 --- /dev/null +++ b/python/rmm/rmm/statistics.py @@ -0,0 +1,350 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import threading +from collections import defaultdict +from contextlib import contextmanager +from dataclasses import dataclass +from functools import wraps +from typing import Dict, Literal, Optional + +import rmm.mr + + +@dataclass +class Statistics: + """Statistics returned by ``{get,push,pop}_statistics()``. 
+ + Attributes + ---------- + current_bytes + Current number of bytes allocated + current_count + Current number of allocations allocated + peak_bytes + Peak number of bytes allocated + peak_count + Peak number of allocations allocated + total_bytes + Total number of bytes allocated + total_count + Total number of allocations allocated + """ + + current_bytes: int + current_count: int + peak_bytes: int + peak_count: int + total_bytes: int + total_count: int + + +def enable_statistics() -> None: + """Enable allocation statistics. + + This function is idempotent. If statistics have been enabled for the + current RMM resource stack, this is a no-op. + + Warnings + -------- + This modifies the current RMM memory resource. StatisticsResourceAdaptor + is pushed onto the current RMM memory resource stack and must remain the + topmost resource throughout the statistics gathering. + """ + + mr = rmm.mr.get_current_device_resource() + if not isinstance(mr, rmm.mr.StatisticsResourceAdaptor): + rmm.mr.set_current_device_resource( + rmm.mr.StatisticsResourceAdaptor(mr) + ) + + +def get_statistics() -> Optional[Statistics]: + """Get the current allocation statistics. + + Return + ------ + If enabled, returns the current tracked statistics. + If disabled, returns None. + """ + mr = rmm.mr.get_current_device_resource() + if isinstance(mr, rmm.mr.StatisticsResourceAdaptor): + return mr.allocation_counts + return None + + +def push_statistics() -> Optional[Statistics]: + """Push new counters on the current allocation statistics stack. + + This returns the current tracked statistics and pushes a new set + of zero counters on the stack of statistics. + + If statistics are disabled (the current memory resource is not an + instance of StatisticsResourceAdaptor), this function is a no-op. + + Return + ------ + If enabled, returns the current tracked statistics _before_ the push. + If disabled, returns None.
+ """ + mr = rmm.mr.get_current_device_resource() + if isinstance(mr, rmm.mr.StatisticsResourceAdaptor): + return mr.push_counters() + return None + + +def pop_statistics() -> Optional[Statistics]: + """Pop the counters of the current allocation statistics stack. + + This returns the counters of current tracked statistics and pops + them from the stack. + + If statistics are disabled (the current memory resource is not an + instance of StatisticsResourceAdaptor), this function is a no-op. + + Return + ------ + If enabled, returns the popped counters. + If disabled, returns None. + """ + mr = rmm.mr.get_current_device_resource() + if isinstance(mr, rmm.mr.StatisticsResourceAdaptor): + return mr.pop_counters() + return None + + +@contextmanager +def statistics(): + """Context to enable allocation statistics. + + If statistics have been enabled already (the current memory resource is an + instance of StatisticsResourceAdaptor), new counters are pushed on the + current allocation statistics stack when entering the context and popped + again when exiting using `push_statistics()` and `pop_statistics()`. + + If statistics have not been enabled, a new StatisticsResourceAdaptor is set + as the current RMM memory resource when entering the context and removed + again when exiting. + + Raises + ------ + ValueError + If the current RMM memory resource was changed while in the context.
+ """ + + prior_non_stats_mr = None + if push_statistics() is None: + # Save the current non-statistics memory resource for later cleanup + prior_non_stats_mr = rmm.mr.get_current_device_resource() + enable_statistics() + + try: + current_mr = rmm.mr.get_current_device_resource() + yield + finally: + if current_mr is not rmm.mr.get_current_device_resource(): + raise ValueError( + "RMM memory source stack was changed " + "while in the statistics context" + ) + if prior_non_stats_mr is None: + pop_statistics() + else: + rmm.mr.set_current_device_resource(prior_non_stats_mr) + + +class ProfilerRecords: + """Records of the memory statistics recorded by a profiler.""" + + @dataclass + class MemoryRecord: + """Memory statistics of a single code block. + + Attributes + ---------- + num_calls + Number of times this code block was invoked. + memory_total + Total number of bytes allocated. + memory_peak + Peak number of bytes allocated. + """ + + num_calls: int = 0 + memory_total: int = 0 + memory_peak: int = 0 + + def add(self, memory_total: int, memory_peak: int): + self.num_calls += 1 + self.memory_total += memory_total + self.memory_peak = max(self.memory_peak, memory_peak) + + def __init__(self) -> None: + self._lock = threading.Lock() + self._records: Dict[str, ProfilerRecords.MemoryRecord] = defaultdict( + ProfilerRecords.MemoryRecord + ) + + def add(self, name: str, data: Statistics) -> None: + """Add memory statistics to the record named `name`. + + This method is thread-safe. + + Parameters + ---------- + name + Name of the record. + data + Memory statistics of `name`. 
+ """ + with self._lock: + self._records[name].add( + memory_total=data.total_bytes, memory_peak=data.peak_bytes + ) + + @property + def records(self) -> Dict[str, MemoryRecord]: + """Dictionary mapping record names to their memory statistics.""" + return dict(self._records) + + def report( + self, + ordered_by: Literal[ + "num_calls", "memory_peak", "memory_total" + ] = "memory_peak", + ) -> str: + """Pretty format the recorded memory statistics. + + Parameters + ---------- + ordered_by + Sort the statistics by this attribute. + + Return + ------ + The pretty formatted string of the memory statistics + """ + + # Sort by `ordered_by` + records = sorted( + ((name, data) for name, data in self.records.items()), + key=lambda x: getattr(x[1], ordered_by), + reverse=True, + ) + ret = "Memory Profiling\n" + ret += "================\n\n" + if len(records) == 0: + return ret + "No data, maybe profiling wasn't enabled?" + ret += ( + "Legends:\n" + " ncalls - number of times the function or code block " + "was called\n" + " memory_peak - peak memory allocated in function or code " + "block (in bytes)\n" + " memory_total - total memory allocated in function or code " + "block (in bytes)\n" + ) + ret += f"\nOrdered by: {ordered_by}\n" + ret += "\nncalls memory_peak memory_total " + ret += "filename:lineno(function)\n" + for name, data in records: + ret += f"{data.num_calls:6,d} {data.memory_peak:15,d} " + ret += f"{data.memory_total:15,d} {name}\n" + return ret[:-1] # Remove the final newline + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.records})" + + def __str__(self) -> str: + return self.report() + + +def _get_descriptive_name_of_object(obj: object) -> str: + """Get descriptive name of object. + + Parameters + ---------- + obj + Object in question + + Return + ------ + A string including filename, line number, and object name. 
+ """ + + obj = inspect.unwrap(obj) + _, linenumber = inspect.getsourcelines(obj) + filepath = inspect.getfile(obj) + return f"{filepath}:{linenumber}({obj.__qualname__})" + + +default_profiler_records = ProfilerRecords() + + +def profiler( + *, + records: ProfilerRecords = default_profiler_records, + name: str = "", +): + """Decorator and context to profile function or code block. + + If statistics are enabled (the current memory resource is an + instance of StatisticsResourceAdaptor), this decorator records the + memory statistics of the decorated function or code block. + + If statistics are disabled, this decorator/context is a no-op. + + Parameters + ---------- + records + The profiler records that the memory statistics are written to. If + not set, a default profiler records are used. + name + The name of the memory profile, mandatory when the profiler + is used as a context manager. If used as a decorator, an empty name + is allowed. In this case, the name is the filename, line number, and + function name. + """ + + class ProfilerContext: + def __call__(self, func: callable) -> callable: + _name = name or _get_descriptive_name_of_object(func) + + @wraps(func) + def wrapper(*args, **kwargs): + push_statistics() + try: + return func(*args, **kwargs) + finally: + if (stats := pop_statistics()) is not None: + records.add(name=_name, data=stats) + + return wrapper + + def __enter__(self): + if not name: + raise ValueError( + "When profiler is used as a context manager, " + "a name must be provided" + ) + push_statistics() + return self + + def __exit__(self, *exc): + if (stats := pop_statistics()) is not None: + records.add(name=name, data=stats) + return False + + return ProfilerContext() diff --git a/python/rmm/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py index 5fad81c79..b6debd9a2 100644 --- a/python/rmm/rmm/tests/conftest.py +++ b/python/rmm/rmm/tests/conftest.py @@ -1,6 +1,21 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import rmm +import rmm.statistics @pytest.fixture(scope="function", autouse=True) @@ -16,6 +31,6 @@ def rmm_auto_reinitialize(): @pytest.fixture def stats_mr(): - mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) - rmm.mr.set_current_device_resource(mr) - return mr + """Fixture that makes a StatisticsResourceAdaptor available to the test""" + with rmm.statistics.statistics(): + yield rmm.mr.get_current_device_resource() diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 62adcd4a5..53edd96d2 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -649,68 +649,6 @@ def test_limiting_resource_adaptor(mr): rmm.DeviceBuffer(size=1) -def test_statistics_resource_adaptor(stats_mr): - - buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] - - for i in range(9, 0, -2): - del buffers[i] - - assert stats_mr.allocation_counts == { - "current_bytes": 5040, - "current_count": 5, - "peak_bytes": 10080, - "peak_count": 10, - "total_bytes": 10080, - "total_count": 10, - } - - # Push a new Tracking adaptor - mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr) - rmm.mr.set_current_device_resource(mr2) - - for _ in range(2): - buffers.append(rmm.DeviceBuffer(size=1000)) - - assert mr2.allocation_counts == { - "current_bytes": 2016, - "current_count": 2, - "peak_bytes": 2016, - "peak_count": 2, - "total_bytes": 2016, - "total_count": 2, - } 
- assert stats_mr.allocation_counts == { - "current_bytes": 7056, - "current_count": 7, - "peak_bytes": 10080, - "peak_count": 10, - "total_bytes": 12096, - "total_count": 12, - } - - del buffers - gc.collect() - - assert mr2.allocation_counts == { - "current_bytes": 0, - "current_count": 0, - "peak_bytes": 2016, - "peak_count": 2, - "total_bytes": 2016, - "total_count": 2, - } - assert stats_mr.allocation_counts == { - "current_bytes": 0, - "current_count": 0, - "peak_bytes": 10080, - "peak_count": 10, - "total_bytes": 12096, - "total_count": 12, - } - gc.collect() - - def test_tracking_resource_adaptor(): cuda_mr = rmm.mr.CudaMemoryResource() diff --git a/python/rmm/rmm/tests/test_rmm_pytorch.py b/python/rmm/rmm/tests/test_rmm_pytorch.py index 065507b61..2c9a4af23 100644 --- a/python/rmm/rmm/tests/test_rmm_pytorch.py +++ b/python/rmm/rmm/tests/test_rmm_pytorch.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import gc import pytest @@ -17,21 +31,21 @@ def torch_allocator(): def test_rmm_torch_allocator(torch_allocator, stats_mr): - assert stats_mr.allocation_counts["current_bytes"] == 0 + assert stats_mr.allocation_counts.current_bytes == 0 x = torch.tensor([1, 2]).cuda() - assert stats_mr.allocation_counts["current_bytes"] > 0 + assert stats_mr.allocation_counts.current_bytes > 0 del x gc.collect() - assert stats_mr.allocation_counts["current_bytes"] == 0 + assert stats_mr.allocation_counts.current_bytes == 0 def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): - assert stats_mr.allocation_counts["current_bytes"] == 0 + assert stats_mr.allocation_counts.current_bytes == 0 s = torch.cuda.Stream() with torch.cuda.stream(s): x = torch.tensor([1, 2]).cuda() torch.cuda.current_stream().wait_stream(s) - assert stats_mr.allocation_counts["current_bytes"] > 0 + assert stats_mr.allocation_counts.current_bytes > 0 del x gc.collect() - assert stats_mr.allocation_counts["current_bytes"] == 0 + assert stats_mr.allocation_counts.current_bytes == 0 diff --git a/python/rmm/rmm/tests/test_statistics.py b/python/rmm/rmm/tests/test_statistics.py new file mode 100644 index 000000000..7ba09a92f --- /dev/null +++ b/python/rmm/rmm/tests/test_statistics.py @@ -0,0 +1,336 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+
+import rmm.mr
+from rmm.statistics import (
+    ProfilerRecords,
+    _get_descriptive_name_of_object,
+    default_profiler_records,
+    get_statistics,
+    pop_statistics,
+    profiler,
+    push_statistics,
+    statistics,
+)
+
+
+def test_context():  # nested `statistics()` contexts and their counters
+    mr0 = rmm.mr.get_current_device_resource()
+    assert get_statistics() is None  # nothing to report before entering
+    with statistics():
+        mr1 = rmm.mr.get_current_device_resource()
+        assert isinstance(
+            rmm.mr.get_current_device_resource(),
+            rmm.mr.StatisticsResourceAdaptor,
+        )
+        b1 = rmm.DeviceBuffer(size=20)  # expected to count as 32 bytes
+        stats = get_statistics()
+        assert stats.current_bytes == 32
+        assert stats.current_count == 1
+        assert stats.peak_bytes == 32
+        assert stats.peak_count == 1
+        assert stats.total_bytes == 32
+        assert stats.total_count == 1
+
+        with statistics():
+            mr2 = rmm.mr.get_current_device_resource()
+            assert mr1 is mr2  # inner context keeps the same resource
+            b2 = rmm.DeviceBuffer(size=10)  # expected to count as 16 bytes
+            stats = get_statistics()  # inner counters: only b2 expected
+            assert stats.current_bytes == 16
+            assert stats.current_count == 1
+            assert stats.peak_bytes == 16
+            assert stats.peak_count == 1
+            assert stats.total_bytes == 16
+            assert stats.total_count == 1
+
+        stats = get_statistics()  # outer counters: b1 + b2 expected
+        assert stats.current_bytes == 48
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 48
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 48
+        assert stats.total_count == 2
+
+    del b1
+    del b2
+    assert rmm.mr.get_current_device_resource() is mr0  # resource restored
+
+
+def test_multiple_mr(stats_mr):  # adaptor stacked on top of `stats_mr`
+    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
+
+    for i in range(9, 0, -2):  # removes indices 9, 7, 5, 3, 1
+        del buffers[i]
+
+    stats = stats_mr.allocation_counts  # expects 5 of the 10 buffers left
+    assert stats.current_bytes == 5040
+    assert stats.current_count == 5
+    assert stats.peak_bytes == 10080
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 10080
+    assert stats.total_count == 10
+
+    # Push a new Tracking adaptor
+    mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
+    rmm.mr.set_current_device_resource(mr2)
+    try:
+        for _ in range(2):
+            buffers.append(rmm.DeviceBuffer(size=1000))
+
+        stats = mr2.allocation_counts  # expects only the 2 new buffers
+        assert stats.current_bytes == 2016
+        assert stats.current_count == 2
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts  # expects 5 old + 2 new buffers
+        assert stats.current_bytes == 7056
+        assert stats.current_count == 7
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
+        del buffers  # counters below expect everything to be freed
+        stats = mr2.allocation_counts  # peak/total expected unchanged
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 2016
+        assert stats.peak_count == 2
+        assert stats.total_bytes == 2016
+        assert stats.total_count == 2
+
+        stats = stats_mr.allocation_counts
+        assert stats.current_bytes == 0
+        assert stats.current_count == 0
+        assert stats.peak_bytes == 10080
+        assert stats.peak_count == 10
+        assert stats.total_bytes == 12096
+        assert stats.total_count == 12
+
+    finally:
+        rmm.mr.set_current_device_resource(stats_mr)  # always restore
+
+
+def test_counter_stack(stats_mr):  # push_counters()/pop_counters() stack
+    buffers = [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    # push returns the stats from the top before the push
+    stats = stats_mr.push_counters()  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 160
+    assert stats.peak_count == 10
+    assert stats.total_bytes == 160
+    assert stats.total_count == 10
+
+    b1 = rmm.DeviceBuffer(size=10)
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    del b1
+
+    # pop returns the popped stats
+    # Note, the bytes and counts can be negative
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
+    b1 = rmm.DeviceBuffer(size=10)
+
+    stats = stats_mr.push_counters()  # stats from stack level 1
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 32
+    assert stats.total_count == 2
+
+    b2 = rmm.DeviceBuffer(size=10)
+
+    stats = stats_mr.pop_counters()  # stats from stack level 2
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    stats = stats_mr.pop_counters()  # stats from stack level 1
+    assert stats.current_bytes == 32
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 32
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 3
+
+    del b1
+    del b2
+
+    stats = stats_mr.allocation_counts  # stats from stack level 0
+    assert stats.current_bytes == 160
+    assert stats.current_count == 10
+    assert stats.peak_bytes == 192
+    assert stats.peak_count == 12
+    assert stats.total_bytes == 208
+    assert stats.total_count == 13
+
+    del buffers
+    with pytest.raises(IndexError, match="cannot pop the last counter pair"):
+        stats_mr.pop_counters()
+
+
+def test_current_statistics(stats_mr):  # get/push/pop_statistics helpers
+    b1 = rmm.DeviceBuffer(size=10)
+    stats = get_statistics()
+    assert stats.current_bytes == 16
+    assert stats.current_count == 1
+    assert stats.peak_bytes == 16
+    assert stats.peak_count == 1
+    assert stats.total_bytes == 16
+    assert stats.total_count == 1
+
+    b2 = rmm.DeviceBuffer(size=20)
+    stats = push_statistics()  # expects the pre-push counters (b1 + b2)
+    assert stats.current_bytes == 48
+    assert stats.current_count == 2
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
+
+    del b1
+    stats = pop_statistics()  # expects the popped counters; b1 freed here
+    assert stats.current_bytes == -16
+    assert stats.current_count == -1
+    assert stats.peak_bytes == 0
+    assert stats.peak_count == 0
+    assert stats.total_bytes == 0
+    assert stats.total_count == 0
+
+    del b2
+    stats = get_statistics()  # expects zero current; peak/total retained
+    assert stats.current_bytes == 0
+    assert stats.current_count == 0
+    assert stats.peak_bytes == 48
+    assert stats.peak_count == 2
+    assert stats.total_bytes == 48
+    assert stats.total_count == 2
+
+
+def test_statistics_disabled():  # no `stats_mr` fixture requested here
+    assert get_statistics() is None
+    assert push_statistics() is None  # push expected to return None too
+    assert get_statistics() is None  # still None after the push attempt
+
+
+def test_profiler(stats_mr):  # `profiler` as decorator and context manager
+    profiler_records = ProfilerRecords()
+    assert len(profiler_records.records) == 0
+    assert "No data" in profiler_records.report()
+
+    @profiler(records=profiler_records)
+    def f1():
+        b1 = rmm.DeviceBuffer(size=10)
+        b2 = rmm.DeviceBuffer(size=10)
+        del b1
+        return b2
+
+    b1 = f1()  # first of two f1 calls (num_calls=2 asserted below)
+    b2 = f1()
+
+    @profiler(records=profiler_records)
+    def f2():
+        b1 = rmm.DeviceBuffer(size=10)
+
+        @profiler(records=profiler_records, name="g2")
+        def g2(b1):
+            b2 = rmm.DeviceBuffer(size=10)
+            del b1
+            return b2
+
+        return g2(b1)
+
+    f2()  # f2 runs three times in total (num_calls=3 asserted below)
+    f2()
+    del b1
+    del b2
+    f2()
+
+    @profiler(records=profiler_records)
+    def f3():
+        return [rmm.DeviceBuffer(size=100) for _ in range(100)]
+
+    f3()
+
+    records = profiler_records.records  # keyed by descriptive name
+    assert records[
+        _get_descriptive_name_of_object(f1)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=2, memory_total=64, memory_peak=32
+    )
+    assert records[
+        _get_descriptive_name_of_object(f2)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=3, memory_total=96, memory_peak=32
+    )
+    assert records["g2"] == ProfilerRecords.MemoryRecord(
+        num_calls=3, memory_total=48, memory_peak=16
+    )
+    assert records[
+        _get_descriptive_name_of_object(f3)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=11200, memory_peak=11200
+    )
+
+    @profiler()  # use the default profiler records
+    def f4():
+        return [rmm.DeviceBuffer(size=10) for _ in range(10)]
+
+    f4()
+
+    with profiler(name="b1 and b2"):  # use the profiler as a context manager
+        b1 = rmm.DeviceBuffer(size=100)
+        b2 = rmm.DeviceBuffer(size=100)
+        with profiler(name="del b1 and b2"):
+            del b1
+            del b2
+
+    records = default_profiler_records.records  # records used by f4/with
+    assert records[
+        _get_descriptive_name_of_object(f4)
+    ] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=160, memory_peak=160
+    )
+    assert records["b1 and b2"] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=224, memory_peak=224
+    )
+    assert records["del b1 and b2"] == ProfilerRecords.MemoryRecord(
+        num_calls=1, memory_total=0, memory_peak=0
+    )
diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp
index 8fd12f49b..6c5700f0b 100644
--- a/tests/mr/device/statistics_mr_tests.cpp
+++ b/tests/mr/device/statistics_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,5 +234,47 @@ TEST(StatisticsTest, NegativeInnerTracking)
   EXPECT_EQ(inner_mr.get_allocations_counter().total, 5);
 }
 
+TEST(StatisticsTest, Nested)  // exercises push_counters()/pop_counters() on one adaptor
+{
+  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  void* a0 = mr.allocate(ten_MiB);  // allocated before any push_counters() call
+  EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB);
+  EXPECT_EQ(mr.get_allocations_counter().value, 1);
+  {
+    auto [bytes, allocs] = mr.push_counters();  // expects the counters from before the push
+    EXPECT_EQ(bytes.value, ten_MiB);
+    EXPECT_EQ(allocs.value, 1);
+  }
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);  // expects the new counters to start at zero
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a1 = mr.allocate(ten_MiB);  // allocated after the first push
+  mr.push_counters();  // second push; return value not inspected here
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  void* a2 = mr.allocate(ten_MiB);
+  mr.deallocate(a2, ten_MiB);  // a2 is allocated and freed after the second push
+  EXPECT_EQ(mr.get_bytes_counter().value, 0);
+  EXPECT_EQ(mr.get_bytes_counter().peak, ten_MiB);  // peak expected to remember a2
+  EXPECT_EQ(mr.get_allocations_counter().value, 0);
+  EXPECT_EQ(mr.get_allocations_counter().peak, 1);
+  {
+    auto [bytes, allocs] = mr.pop_counters();  // expects the counters that tracked a2
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB);
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 1);
+  }
+  mr.deallocate(a0, ten_MiB);  // a0 predates the first push but is freed before the second pop
+  {
+    auto [bytes, allocs] = mr.pop_counters();  // second pop, matching the first push
+    EXPECT_EQ(bytes.value, 0);
+    EXPECT_EQ(bytes.peak, ten_MiB * 2);  // presumably a1 plus the nested a2 peak -- confirm
+    EXPECT_EQ(allocs.value, 0);
+    EXPECT_EQ(allocs.peak, 2);
+  }
+  mr.deallocate(a1, ten_MiB);
+  EXPECT_THROW(mr.pop_counters(), std::out_of_range);  // both pushes were already popped
+}
+
 }  // namespace
 }  // namespace rmm::test