[DML EP] Add BFC allocator #16634

Status: Open. Wants to merge 90 commits into base: main.
Changes shown are from 85 of the 90 commits.

Commits
f5a87a4  WIP (PatriceVignola, Jan 18, 2023)
707c1c9  WIP (PatriceVignola, Jan 18, 2023)
0619fa3  WIP (PatriceVignola, Jan 18, 2023)
6b62b72  WIP (PatriceVignola, Jan 19, 2023)
3f2910b  WIP (PatriceVignola, Jan 19, 2023)
25bb52d  WIP (PatriceVignola, Jan 20, 2023)
92f51a3  Remove sub allocator (PatriceVignola, Jan 23, 2023)
c0cbcae  WIP (PatriceVignola, Jan 24, 2023)
76328be  WIP (PatriceVignola, Jan 25, 2023)
7bd0983  WIP (PatriceVignola, Jan 25, 2023)
0c35fc2  WIP (PatriceVignola, Jan 25, 2023)
43c47b9  WIP (PatriceVignola, Jan 25, 2023)
d0eb5da  WIP (PatriceVignola, Jan 25, 2023)
3385d20  Add buffer region size alignment (PatriceVignola, Jan 26, 2023)
4e36efd  Merge branch 'main' of github.com:microsoft/onnxruntime into user/pav… (PatriceVignola, Jan 26, 2023)
7e5622d  WIP (PatriceVignola, Jan 26, 2023)
e6897c5  WIP (PatriceVignola, Jan 26, 2023)
2064baa  WIP (PatriceVignola, Jan 27, 2023)
b71a5ff  WIP (PatriceVignola, Jan 27, 2023)
06caff8  WIP (PatriceVignola, Jan 28, 2023)
e7667f1  WIP (PatriceVignola, Jan 28, 2023)
a95d434  WIP (PatriceVignola, Jan 28, 2023)
0729ea2  Fix (PatriceVignola, Jan 30, 2023)
544637f  Fix (PatriceVignola, Jan 30, 2023)
ea26855  WIP (PatriceVignola, Jan 31, 2023)
61dce2e  WIP (PatriceVignola, Jan 31, 2023)
b9b3fb8  WIP (PatriceVignola, Jan 31, 2023)
3854807  WIP (PatriceVignola, Feb 1, 2023)
93d931b  WIP (PatriceVignola, Feb 1, 2023)
96be36c  Merge branch 'main' of github.com:microsoft/onnxruntime into user/pav… (PatriceVignola, Feb 2, 2023)
f1cf166  Merge branch 'main' of github.com:microsoft/onnxruntime into user/pav… (PatriceVignola, Feb 16, 2023)
ef40991  Merge branch 'main' of github.com:microsoft/onnxruntime into user/pav… (PatriceVignola, Apr 23, 2023)
4e14147  Merge branch 'main' of github.com:microsoft/onnxruntime into user/pav… (PatriceVignola, Apr 26, 2023)
9c03955  Add hack to work around OOM errors with upload heaps (PatriceVignola, Apr 26, 2023)
a069e60  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Jun 29, 2023)
e34abaf  Fix DFT (PatriceVignola, Jul 6, 2023)
00708a6  Register external allocator (PatriceVignola, Jul 6, 2023)
9927336  Fix DFT and STFT (PatriceVignola, Jul 6, 2023)
c20690f  Grid sample (PatriceVignola, Jul 6, 2023)
0bb5124  Fix WinML API (PatriceVignola, Jul 7, 2023)
6bc5049  Fix ImageTests.SynchronizeGPUWorkloads test failure (PatriceVignola, Jul 7, 2023)
14d1c96  Fix ConcurrencyTests.MultiThreadSingleSessionGpu (PatriceVignola, Jul 8, 2023)
a2809af  Add print statements for CopyBufferRegion (PatriceVignola, Jul 11, 2023)
2f8bff8  Add print statements for CopyBufferRegion (PatriceVignola, Jul 11, 2023)
c024d0a  Use Identity for the copy operator (PatriceVignola, Jul 11, 2023)
fef7df2  Add intermediate buffer for copying (PatriceVignola, Jul 11, 2023)
e0569c5  Remove aliasing (PatriceVignola, Jul 12, 2023)
568e550  Revert "Remove aliasing" (PatriceVignola, Jul 12, 2023)
943ac58  Re-add "Remove aliasing" (PatriceVignola, Jul 12, 2023)
587489d  Revert "Re-add "Remove aliasing"" (PatriceVignola, Jul 12, 2023)
57d2f46  Remove aliasing (PatriceVignola, Jul 12, 2023)
7440e74  Fix mish test failure (PatriceVignola, Jul 12, 2023)
b2e65fc  Remove rest of Aliasing (PatriceVignola, Jul 12, 2023)
d5be4f1  Add BFC allocator (PatriceVignola, Jul 13, 2023)
5772339  Add BFC allocator API (PatriceVignola, Jul 13, 2023)
b06678a  Fix crash (PatriceVignola, Jul 13, 2023)
bf177f6  Fix prefast error (PatriceVignola, Jul 13, 2023)
cb2e420  Fix Bucketized allocator crash (PatriceVignola, Jul 13, 2023)
9c79b1b  Address prefast errors (PatriceVignola, Jul 13, 2023)
8f37e38  Fix destructors (PatriceVignola, Jul 13, 2023)
a67641c  Fix typo (PatriceVignola, Jul 14, 2023)
aa3a207  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Jul 29, 2023)
e4e34e0  Fix build break (PatriceVignola, Jul 29, 2023)
0b4cee0  Fix lint errors (PatriceVignola, Jul 30, 2023)
e14797c  Fix iobinding crash (PatriceVignola, Jul 30, 2023)
c658755  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Jul 31, 2023)
16c9524  Add aliasing support to DmlOperatorCopy (PatriceVignola, Aug 1, 2023)
a95505f  Use identity instead of 2 copies (PatriceVignola, Aug 1, 2023)
a28358e  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Aug 1, 2023)
759442b  Enable copy-less I/O binding (PatriceVignola, Aug 2, 2023)
9f3e430  Fix nonzero coordinates operator (PatriceVignola, Aug 2, 2023)
2da8999  Fix If test crash (PatriceVignola, Aug 3, 2023)
26a94e1  Fix output binding crash (PatriceVignola, Aug 3, 2023)
31270e6  Fix test failures (PatriceVignola, Aug 4, 2023)
738efb7  Fix upload heap regression (PatriceVignola, Aug 5, 2023)
ac9e57e  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Aug 6, 2023)
f64ed2b  Address PR comments (PatriceVignola, Aug 6, 2023)
216fc39  Fix indentation (PatriceVignola, Aug 6, 2023)
a25b40c  WIP (PatriceVignola, Aug 8, 2023)
1a0eaa6  WIP (PatriceVignola, Aug 8, 2023)
f98f2af  WIP (PatriceVignola, Aug 8, 2023)
c54b295  Address PR comments (PatriceVignola, Aug 8, 2023)
26b4e7e  Move allocation free outside of loop (PatriceVignola, Aug 9, 2023)
163fe5b  Fix linting errors (PatriceVignola, Aug 9, 2023)
2e4eb2c  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Aug 11, 2023)
184940a  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Aug 15, 2023)
e6ae058  Address PR comments (PatriceVignola, Aug 16, 2023)
b7e40e8  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (Aug 16, 2023)
01d9bd2  Fix lint issues (PatriceVignola, Aug 16, 2023)
a774228  Merge branch 'main' of https://github.com/microsoft/onnxruntime into … (PatriceVignola, Sep 19, 2024)
Files changed
7 changes: 7 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
@@ -320,6 +320,13 @@ class IExecutionProvider {
return default_device_;
};

/**
* Returns the OrtDevice for the given OrtMemType that external callers can use directly.
*/
virtual OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const {
return GetOrtDeviceByMemType(mem_type);
};

/**
* Create Preferred allocators for the current Execution Provider
* This function is a stateless function which creates new instances of Allocator, without storing them in EP.
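For orientation, here is a sketch of how a provider might override the new hook; the class name is hypothetical, and only IExecutionProvider, OrtDevice, and OrtDevice::MemType::DML_EXTERNAL come from this PR.

// Hypothetical override: report default GPU memory to external callers as DML_EXTERNAL so
// their allocations are tagged differently from the EP's internally pooled memory.
class MyDmlLikeExecutionProvider : public onnxruntime::IExecutionProvider {
 public:
  OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const override {
    if (mem_type == OrtMemTypeDefault) {
      return OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, /*device_id*/ 0);
    }
    return GetOrtDeviceByMemType(mem_type);  // other memory types keep the regular mapping
  }
  // Constructor and the remaining IExecutionProvider overrides are omitted for brevity.
};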
1 change: 1 addition & 0 deletions include/onnxruntime/core/framework/ortdevice.h
@@ -24,6 +24,7 @@ struct OrtDevice {
static const MemoryType CUDA_PINNED = 1;
static const MemoryType HIP_PINNED = 2;
static const MemoryType CANN_PINNED = 3;
static const MemoryType DML_EXTERNAL = 4;
};

constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_)
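A minimal illustration of what the new constant expresses (variable names are illustrative):

// Externally owned DML allocations are tagged DML_EXTERNAL, while arena-owned memory keeps
// MemType::DEFAULT, so downstream code can tell the two apart even on the same GPU device.
OrtDevice external_dml(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, /*device_id*/ 0);
OrtDevice internal_dml(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, /*device_id*/ 0);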
6 changes: 4 additions & 2 deletions onnxruntime/core/framework/allocator.cc
@@ -150,9 +150,11 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
} else if (strcmp(name1, onnxruntime::DML) == 0) {
// Since EPs cannot have 2 allocators with the same OrtMemType and Memory ID,
// we use -1 as the memory ID to represent external allocations that don't have any allocator.
*out = new OrtMemoryInfo(
onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, static_cast<OrtDevice::DeviceId>(id1)),
-1, mem_type1);
} else if (strcmp(name1, onnxruntime::HIP) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::HIP, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
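From the caller's side nothing changes in how the memory info is created; a sketch using the public C++ wrapper, assuming only the standard Ort::MemoryInfo constructor:

#include <onnxruntime_cxx_api.h>

// Creating a "DML" memory info as before. With this change the resulting OrtMemoryInfo
// carries OrtDevice::MemType::DML_EXTERNAL and a memory id of -1, marking the memory as an
// external allocation that is not owned by any ORT allocator.
Ort::MemoryInfo dml_memory_info("DML", OrtDeviceAllocator, /*device_id*/ 0, OrtMemTypeDefault);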
39 changes: 24 additions & 15 deletions onnxruntime/core/framework/bfc_arena.cc
@@ -42,22 +42,8 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
stats_.bytes_limit = static_cast<int64_t>(total_memory);

arena_extend_strategy_ = arena_extend_strategy;
UpdateFirstAllocationShrinkageLogic();

// We never want to shrink the initial allocation if the arena extend strategy is kNextPowerOfTwo.
// This could seem confusingly arbitrary but the rationale is as follows:
// The user selected initial allocation chunk is only valid for the arena extend strategy kNextPowerOfTwo
// and the user has likely chosen this initial value so that any ad-hoc arena extensions/shrinkages could potentially
// be avoided. So we do not consider the initial allocation for shrinkage whatever its usage status.
// On the other hand, if the arena extension strategy is kSameAsRequested, any initial chunk set by the user or otherwise,
// is moot and the arena will only extend based on the request size. In these cases, we consider any allocation for shrinkage
// if it is left unused (even if it is the first allocation).
if (arena_extend_strategy_ == ArenaExtendStrategy::kSameAsRequested) {
// Consider all allocation regions (including first allocation region) for shrinkage
consider_first_allocation_region_for_shrinkage_ = true;
} else { // arena_extend_strategy_ == kNextPowerOfTwo
// Do not consider the first allocation region for shrinkage
consider_first_allocation_region_for_shrinkage_ = false;
}
// Create a bunch of bins of various good sizes.

// We create bins to fit all possible ranges that cover the
@@ -91,6 +77,29 @@ BFCArena::~BFCArena() {
}
}

void BFCArena::UpdateFirstAllocationShrinkageLogic() {
// We never want to shrink the initial allocation if the arena extend strategy is kNextPowerOfTwo.
// This could seem confusingly arbitrary but the rationale is as follows:
// The user selected initial allocation chunk is only valid for the arena extend strategy kNextPowerOfTwo
// and the user has likely chosen this initial value so that any ad-hoc arena extensions/shrinkages could potentially
// be avoided. So we do not consider the initial allocation for shrinkage whatever its usage status.
// On the other hand, if the arena extension strategy is kSameAsRequested, any initial chunk set by the user or otherwise,
// is moot and the arena will only extend based on the request size. In these cases, we consider any allocation for shrinkage
// if it is left unused (even if it is the first allocation).
if (arena_extend_strategy_ == ArenaExtendStrategy::kSameAsRequested) {
// Consider all allocation regions (including first allocation region) for shrinkage
consider_first_allocation_region_for_shrinkage_ = true;
} else { // arena_extend_strategy_ == kNextPowerOfTwo
// Do not consider the first allocation region for shrinkage
consider_first_allocation_region_for_shrinkage_ = false;
}
}

void BFCArena::SetArenaExtendStrategy(ArenaExtendStrategy arena_extend_strategy) {
arena_extend_strategy_ = arena_extend_strategy;
UpdateFirstAllocationShrinkageLogic();
}

BFCArena::Chunk* BFCArena::ChunkFromHandle(ChunkHandle h) {
ORT_ENFORCE(h < chunks_.size());
return &(chunks_[h]);
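To make the distinction above concrete, here is a rough sketch of the two growth policies; it is simplified and does not reproduce BFCArena's exact rounding.

#include <algorithm>
#include <cstddef>

#include "core/framework/bfc_arena.h"  // for onnxruntime::ArenaExtendStrategy

// kNextPowerOfTwo grows the next region geometrically from the user-chosen initial chunk,
// which is why that first region is never shrunk. kSameAsRequested extends by exactly the
// request, so the first region is just another allocation and may be shrunk when unused.
size_t NextRegionSize(onnxruntime::ArenaExtendStrategy strategy,
                      size_t current_region_bytes, size_t requested_bytes) {
  if (strategy == onnxruntime::ArenaExtendStrategy::kNextPowerOfTwo) {
    size_t next = std::max<size_t>(size_t{1}, current_region_bytes * 2);  // grow geometrically
    while (next < requested_bytes) {
      next *= 2;  // keep doubling until the request fits
    }
    return next;
  }
  return requested_bytes;  // kSameAsRequested: grow by exactly what was asked for
}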
8 changes: 8 additions & 0 deletions onnxruntime/core/framework/bfc_arena.h
@@ -77,6 +77,11 @@ class BFCArena : public IAllocator {

~BFCArena() override;

// Allows the caller to change the arena extend strategy after the allocator is done initializing.
// For example, kSameAsRequested may be desirable in certain situations and kNextPowerOfTwo may be
// desirable in others.
void SetArenaExtendStrategy(ArenaExtendStrategy arena_extend_strategy);

// If size is 0, then this function returns either NULL,
// or a unique pointer value that can later be successfully
// passed to free(). Whatever, do not dereference that pointer
@@ -123,6 +128,9 @@
private:
void DeallocateRawInternal(void* ptr);

// Updates whether the first allocation should be considered for shrinkage depending on the strategy type.
void UpdateFirstAllocationShrinkageLogic();

// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
// kInvalidChunkHandle means an invalid chunk
using ChunkHandle = size_t;
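A short usage sketch of the new setter; the tuning policy shown is an assumption, not something this PR implements.

#include "core/framework/bfc_arena.h"

// One conceivable use: start with kNextPowerOfTwo for fast warm-up, then switch to
// kSameAsRequested once steady-state sizes are known. The setter also re-evaluates whether
// the first allocation region may be considered for shrinkage.
void SwitchToExactGrowth(onnxruntime::BFCArena& arena) {
  arena.SetArenaExtendStrategy(onnxruntime::ArenaExtendStrategy::kSameAsRequested);
}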
13 changes: 13 additions & 0 deletions onnxruntime/core/framework/utils.cc
@@ -161,6 +161,19 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
return Status::OK();
}

#ifdef USE_DML
const bool bothValuesOnGPU = copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU;
const bool sourceIsDmlAlloc = copy_info.source_device.MemType() == OrtDevice::MemType::DEFAULT || copy_info.source_device.MemType() == OrtDevice::MemType::DML_EXTERNAL;
const bool targetIsInternalAlloc = copy_info.target_device.MemType() == OrtDevice::MemType::DEFAULT;
Reviewer comment (Contributor): target_is_internal_alloc since this is ORT code 🐫🐍 rather than the DML EP 🐫🐪.
const bool bothValuesOnSameDevice = copy_info.source_device.Id() == copy_info.target_device.Id();

// The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device
if (bothValuesOnGPU && sourceIsDmlAlloc && targetIsInternalAlloc && bothValuesOnSameDevice) {
target_mlvalue = source_mlvalue;
return Status::OK();
}
#endif

auto allocator = session_state.GetAllocator(copy_info.target_device);
if (!target_mlvalue.IsAllocated()) {
ORT_ENFORCE(allocator != nullptr, "Failed to find allocator for device ", copy_info.target_device.ToString());
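Restated as a standalone predicate for clarity (the helper name is illustrative and does not exist in the PR):

#include "core/framework/ortdevice.h"

// The copy above is elided only when both values live on the same D3D12 device, the source
// is DML-managed memory (arena DEFAULT or external DML_EXTERNAL), and the target expects
// regular DEFAULT device memory.
static bool CanAliasDmlAllocation(const OrtDevice& source, const OrtDevice& target) {
  const bool both_on_gpu = source.Type() == OrtDevice::GPU && target.Type() == OrtDevice::GPU;
  const bool source_is_dml_alloc = source.MemType() == OrtDevice::MemType::DEFAULT ||
                                   source.MemType() == OrtDevice::MemType::DML_EXTERNAL;
  const bool target_is_internal_alloc = target.MemType() == OrtDevice::MemType::DEFAULT;
  return both_on_gpu && source_is_dml_alloc && target_is_internal_alloc &&
         source.Id() == target.Id();
}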
@@ -7,6 +7,7 @@ interface IMLOperatorRegistry;
#include "core/common/status.h"
#include "core/framework/data_transfer.h"
#include "IWinmlExecutionProvider.h"
#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h"

namespace onnxruntime
{
@@ -17,20 +18,14 @@ namespace onnxruntime
class KernelRegistry;
}

enum class AllocatorRoundingMode
{
Disabled = 0,
Enabled = 1,
};

namespace Dml
{
std::unique_ptr<onnxruntime::IExecutionProvider> CreateExecutionProvider(
IDMLDevice* dmlDevice,
ID3D12CommandQueue* commandQueue,
bool enableMetacommands = true);
bool enableMetacommands,
bool enableBfcAllocator);

ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr);
void FlushContext(onnxruntime::IExecutionProvider* provider);
void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);

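A sketch of a caller updated for the new factory signature; the wrapper function and its arguments are hypothetical, and the provider-factory header shown in this hunk is assumed to be included.

// The enableMetacommands default argument was removed, and the new enableBfcAllocator flag
// must now be passed explicitly when creating the DML execution provider.
std::unique_ptr<onnxruntime::IExecutionProvider> MakeDmlProvider(
    IDMLDevice* dml_device, ID3D12CommandQueue* command_queue) {
  return Dml::CreateExecutionProvider(dml_device, command_queue,
                                      /*enableMetacommands*/ true,
                                      /*enableBfcAllocator*/ true);
}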
@@ -9,6 +9,7 @@
#include <optional>

#include "core/framework/op_kernel.h"
#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h"

struct AbstractOperatorDesc;
interface IMLOperatorTensor;
@@ -22,6 +23,11 @@ namespace onnxruntime
class Node;
}

namespace Dml
{
struct TaggedPointer;
}

namespace Windows::AI::MachineLearning::Adapter
{
interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698"))
@@ -34,19 +40,9 @@ namespace Windows::AI::MachineLearning::Adapter
// the provider's underlying queues.
virtual void QueueReference(IUnknown *object) = 0;

virtual void GetShadowCopyIfRequired(
bool isInternalOperator,
IUnknown* data,
IUnknown** dataCopy) const = 0;

virtual void GetABIDataInterface(
bool isInternalOperator,
IUnknown* data,
IUnknown** abiData) const = 0;
virtual Dml::D3D12BufferRegion GetBufferRegion(void* opaquePointer, uint64_t size) const = 0;

virtual uint64_t TryGetPooledAllocationId(
IUnknown* data,
bool isInternalOperator) = 0;
virtual uint64_t GetUniqueId(void* opaquePointer) = 0;

virtual void GetABIExecutionInterfaceAndInvalidateState(
bool isInternalOperator,
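A hypothetical kernel-side sketch of the replacement path; apart from GetBufferRegion, GetUniqueId, and Dml::D3D12BufferRegion, the names below are made up.

// Instead of requesting a shadow copy or an ABI data interface, an internal kernel now
// resolves the opaque allocation pointer directly to the D3D12 buffer region backing it.
template <typename WinmlProviderT>
Dml::D3D12BufferRegion ResolveTensorBuffer(WinmlProviderT* provider,
                                           void* opaque_data, uint64_t size_in_bytes) {
  // Stable per-allocation id, e.g. usable as a cache key for persistent bindings.
  const uint64_t allocation_id = provider->GetUniqueId(opaque_data);
  (void)allocation_id;
  return provider->GetBufferRegion(opaque_data, size_in_bytes);
}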
@@ -561,11 +561,17 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
//
// For backward compatibility, this does not propagate errors for external operators
static_cast<void>(m_kernelRegistry->RegisterCustomKernel(create_info)); // ignore result
m_hasExternalOperators = true;
}

return S_OK;
}
ORT_CATCH_RETURN
}

bool STDMETHODCALLTYPE AbiCustomRegistry::HasExternalOperators() const noexcept
{
return m_hasExternalOperators;
}

}
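A hypothetical caller-side use of the new query; the rationale is assumed here, not stated in the hunk.

#include <vector>

using Windows::AI::MachineLearning::Adapter::AbiCustomRegistry;

// Returns true if any custom registry pulled in externally registered operator kernels,
// which a session might treat more conservatively (for example when pooling allocations).
bool AnyExternalOperators(const std::vector<AbiCustomRegistry*>& registries) {
  for (const AbiCustomRegistry* registry : registries) {
    if (registry->HasExternalOperators()) {
      return true;
    }
  }
  return false;
}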
@@ -15,7 +15,7 @@ namespace WRL
}

namespace Windows::AI::MachineLearning::Adapter
{

using namespace Microsoft::WRL;

@@ -49,14 +49,16 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
IMLOperatorKernelFactory* operatorKernelFactory,
_In_opt_ IMLOperatorShapeInferrer* shapeInferrer) const noexcept override;

bool STDMETHODCALLTYPE HasExternalOperators() const noexcept override;

std::list<std::shared_ptr<onnxruntime::CustomRegistry>> GetRegistries()
{
std::list<std::shared_ptr<onnxruntime::CustomRegistry>> registries;
for (auto& registry : m_customRegistryOpsetVerMap)
{
registries.push_back(registry.second);
}

registries.push_back(m_kernelRegistry);

return registries;
@@ -86,15 +88,15 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis

private:
static onnx::OpSchema ConvertOpSchema(
_In_z_ const char* domain,
const MLOperatorSchemaDescription& abiSchema,
IMLOperatorTypeInferrer* typeInferrer,
IMLOperatorShapeInferrer* shapeInferrer);

static std::string ConvertFormalParameterType(const MLOperatorSchemaEdgeDescription& formalParameter);
static onnx::OpSchema::FormalParameterOption ConvertFormalParameterOption(MLOperatorParameterOptions options);
static void SetAttributesAndDefaults(onnx::OpSchema& schema, const MLOperatorSchemaDescription& abiSchema);

static AttributeMap GetDefaultAttributes(const MLOperatorKernelDescription* opKernel);

std::shared_ptr<onnxruntime::CustomRegistry> m_kernelRegistry;
@@ -107,6 +109,8 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
// Map between Lotus KernelDefs and extended data used during partitioning
mutable std::shared_ptr<InternalRegistrationInfoMap> m_internalRegInfoMap;

mutable bool m_hasExternalOperators = false;

};

} // namespace Windows::AI::MachineLearning::Adapter