From 72986fde749dcf3a31739b7962e4edfc86b7c076 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 11 Jun 2020 23:31:45 -0700 Subject: [PATCH 01/58] Adding StreamExecutor C API RFC --- rfcs/20200612-stream-executor-c-api.md | 494 +++++++++++++++++++++++++ 1 file changed, 494 insertions(+) create mode 100644 rfcs/20200612-stream-executor-c-api.md diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md new file mode 100644 index 000000000..66cb0d71d --- /dev/null +++ b/rfcs/20200612-stream-executor-c-api.md @@ -0,0 +1,494 @@ +# Title of RFC + +| Status | (Proposed / Accepted / Implemented / Obsolete) | +| :------------ | :------------------------------------------------------ | +| **RFC #** | [NNN](https://github.com/tensorflow/community/pull/NNN) | +: : (update when you have community PR #) : +| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn | +: : Koanantakool (penporn@google.com), Russell Power : +: : (power@google.com), Yi Situ (yisitu@google.com) : +| **Sponsor** | Gunhan Gulsoy (gunan@tensorflow.org) | +| **Updated** | 2020-06-11 | + +# Objective + +Provide basic device management C API to allow new devices to modularly connect +to the legacy TensorFlow runtime. + +## Goals + +* C API wrapper of a subset of methods in StreamExecutorInterface. +* Eventual API and ABI stability + +## Non-goals + +* Compatibility with the + [new TensorFlow stack](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html). +* Generic potpourri C API for all kinds of devices. + +# Motivation + +Current device support in TensorFlow adds code directly into the +[main TensorFlow repository](http://github.com/tensorflow/tensorflow). This +approach is +[not scalable](https://github.com/tensorflow/community/blob/master/rfcs/20190305-modular-tensorflow.md#adding-support-for-new-hardware-is-very-difficult-and-not-scalable) +because it adds complexity to the build dependency and tool chains, takes longer +time to build, and requires the TensorFlow team’s review. To handle the surge in +new hardware accelerators and programming paradigms, TensorFlow must allow +device addition in a modular manner: contributors code outside of the TensorFlow +repository and distribute a binary module which would connect to TensorFlow at +runtime through a stable application binary interface (ABI). + +The new TensorFlow stack, composed of +[TFRT](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html) and +[MLIR](https://www.tensorflow.org/mlir), is designed with this in mind. However, +it is still in an active development phase and is not ready for third-party +device integration until later this year. (For device support expecting to land +in 2021 or later, we highly recommend waiting to integrate with the new stack, +since it is fundamentally different from the current stack and cannot guarantee +code reuse.) + +In the meantime, we plan to provide limited device integration support for the +current TensorFlow stack through +[Modular TensorFlow](https://github.com/tensorflow/community/blob/master/rfcs/20190305-modular-tensorflow.md). +We anticipate three main functionalities within a device plugin module: + +* Device registration: Will be addressed in a different RFC. +* Device management: The focus of this RFC. +* Kernel and op registration and implementation: + [C API](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). + +A key part of TensorFlow device implementation is the class called +[StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73), +which handles work execution and memory management. It provides a set of methods +(such as +[Memcpy](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=240)) +that can be customized for a particular device. + +We propose a C API wrapper of a subset of methods in +[StreamExecutorInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=166?q=StreamExecutorinterface) +as an ABI-stable way to register a custom StreamExecutor platform. + +# User Benefits + +A decoupled way to add a new device to TensorFlow. + +* Simpler process: Does not have to add a new build toolchain to TensorFlow +* Faster time-to-solution: Does not need code review from the TensorFlow team. +* Lower maintenance efforts: Only C-API-related changes could break the + integration. Unrelated TensorFlow changes would not break the code. + +# Design Proposal + +## StreamExecutorInterface + +[StreamExecutorInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=166?q=StreamExecutorinterface) +is quite large and some of its methods are only sporadically used. Therefore, we +plan to wrap only a subset of key StreamExecutorInterface functionality. + +See proposed C API below: + +```cpp +#define TF_CAPI_EXPORT +#else +#if defined(_WIN32) +#ifdef TF_COMPILE_LIBRARY +#define TF_CAPI_EXPORT __declspec(dllexport) +#else +#define TF_CAPI_EXPORT __declspec(dllimport) +#endif // TF_COMPILE_LIBRARY +#else +#define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 +#endif // SWIG + +#ifdef __cplusplus +extern "C" { +#endif + +typedef SE_Stream_st* SE_Stream; +typedef SE_Event_st* SE_Event; +typedef SE_Timer_st* SE_Timer; +typedef TF_Status* (*TF_StatusCallbackFn)(void*); + +typedef struct SE_PlatformId { + void* id; // aka stream_executor::Platform::Id +} SE_PlatformId; + +typedef struct SE_TimerFns { + const size_t struct_size; + int64_t (*nanoseconds)(SE_Timer timer); + int64_t (*microseconds)(SE_Timer timer); +} SE_Timer; + +typedef struct SE_AllocatorStats { + const size_t struct_size; + int64_t num_allocs; + int64_t bytes_in_use; + int64_t peak_bytes_in_use; + int64_t largest_alloc_size; + + bool has_bytes_limit; + int64_t bytes_limit; + + int64_t bytes_reserved; + int64_t peak_bytes_reserved; + + bool has_bytes_reservable_limit; + int64_t bytes_reservable_limit; + + int64_t largest_free_block_bytes; +} SE_AllocatorStats; + +typedef enum SE_EventStatus { + SE_EVENT_UNKNOWN, + SE_EVENT_ERROR, + SE_EVENT_PENDING, + SE_EVENT_COMPLETE, +} SE_EventStatus; + +typedef struct SE_Options { + const size_t struct_size; + int ordinal; +}; + +typedef struct SE_Device { + const size_t struct_size; + char* name; + size_t name_len; + + // Device vendor can store handle to their device representation + // here. + void* device_handle; + + // Any kind of data that plugin device might want to store. + void* data; +}; + +typedef struct SE_StreamExecutor { + const size_t struct_size; + + /*** ALLOCATION CALLBACKS ***/ + // Synchronously allocates size bytes on the underlying platform and returns + // a DeviceMemoryBase representing that allocation. In the case of failure, + // nullptr is returned. + TF_DeviceMemoryBase* (*allocate)( + SE_Device* se, uint64_t size, int64_t memory_space); + + // Deallocate the DeviceMemory previously allocated via this interface. + // Deallocation of a nullptr-representative value is permitted. + void (*deallocate)( + SE_Device* se, TF_DeviceMemoryBase* memory); + + + // Return allocator statistics. + bool (*get_allocator_stats)(SE_Device* executor, + SE_AllocatorStats* stats); + // Returns the underlying device memory usage information, if it is available. + // If it is not available (false is returned), free/total may not be + // initialized. + bool (*device_memory_usage)( + SE_Device* executor, int64_t* free, int64_t* total); + + + /*** STREAM CALLBACKS ***/ + // Creates SE_Stream. This call should also Allocate stream + // resources on the underlying platform and initializes its + // internals. + void (*create_stream)(SE_Device* executor, SE_Stream*, TF_Status*); + + // Deletes SE_Stream and deallocates any underlying resources. + void (*delete_stream)(SE_Device* executor, SE_Stream stream); + + // Causes dependent to not begin execution until other has finished its + // last-enqueued work. + bool (*create_stream_dependency)( + SE_Device* executor, SE_Stream dependent, + SE_Stream other); + + // Without blocking the device, retrieve the current stream status. + void (*get_status)(SE_Device* executor, SE_Stream stream, + TF_Status* status); + + /*** EVENT CALLBACKS ***/ + // Create TF_Event. Performs platform-specific allocation and initialization of an event. + void (*create_event)( + SE_Device* executor, TF_Event* event, TF_Status* status); + + // Delete SE_Event and perform any platform-specific deallocation and cleanup of an event. + void (*delete_event)( + SE_Device* executor, SE_Event event, TF_Status* status); + + // Requests the current status of the event from the underlying platform. + SE_EventStatus (*poll_for_event_status)( + SE_Device* executor, SE_Event event); + // Inserts the specified event at the end of the specified stream. + void (*record_event)( + SE_Device* executor, SE_Stream stream, + SE_Event event, TF_Status* status); + + // Wait for the specified event at the end of the specified stream. + void (*wait_for_event)( + SE_Device* executor, SE_Stream stream, + SE_Event event, TF_Status* status); + + /*** TIMER CALLBACKS ***/ + // Creates TF_Timer. Allocates timer resources on the underlying platform and initializes its + // internals. + void (*create_timer)(SE_Device* executor, SE_Timer* timer, TF_Status* status); + + // Deletes timer and deallocates timer resources on the underlying platform. + void (*delete_timer)(SE_Device* executor, SE_Timer timer); + + // Records a start event for an interval timer. + bool (*start_timer)( + SE_Device* executor, SE_Stream stream, SE_Timer timer); + + + // Records a stop event for an interval timer. + bool (*stop_timer)( + SE_Device* executor, SE_Stream stream, SE_Timer timer); + + /*** MEMCPY CALLBACKS ***/ + // Entrains a memcpy operation onto stream, with a host destination location + // host_dst and a device memory source, with target size size. + bool (*memcpy_to_host)( + SE_Device* executor, SE_Stream stream, + void* host_dst, + const SE_DeviceMemoryBase* device_src, + uint64_t size); + + // Entrains a memcpy operation onto stream, with a device destination location + // and a host memory source, with target size size + + bool (*memcpy_from_host)( + SE_Device* executor, SE_Stream stream, + SE_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size); + + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous device + // operations enqueued on the stream before this program point. + void (*block_host_until_done)( + SE_Device* executor, SE_Stream stream, + TF_Status* status); + + // Synchronizes all activity occurring in the StreamExecutor's context (most + // likely a whole device). + bool (*synchronize_all_activity)(SE_Device* executor); + + // Obtains metadata about the underlying device. + void fill_device_description(SE_Device* executor, + SE_DeviceDescription* description, + TF_Status* status); + + // Entrains on a stream a user-specified function to be run on the host. + bool host_callback(SE_Device* executor, SE_Stream* stream, + TF_StatusCallbackFn callback_fn, void* ctx); +} SE_Device; + +TF_CAPI_EXPORT extern SE_Platform* SE_NewPlatform( + SE_PlatformId id, + int visible_device_count, + SE_Device* (*create_device)(SE_Options* options, TF_Status* status), + SE_StreamExecutor* (*create_stream_executor)(TF_Status* status), + void (*delete_device)(SE_Device* device), + void (*delete_stream_executor)(SE_StreamExecutor* stream_executor); +); + +TF_CAPI_EXPORT extern void SE_RegisterPlatform( + char* name, + size_t name_len, + SE_Platform* platform, + TF_Status* status); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif +``` + +## PlatformId + +`SE_PlatformId.id` should be set to a unique identifier. + +## Usage example + +Define functions that create and delete `SE_Device` and `SE_StreamExecutor`: + +```cpp +SE_Device* create_device(SE_Options* options, TF_Status* status) { + SE_Device* se = new SE_Device(); + se->device_handle = get_my_device_handle(); + ... + return se; +} +SE_StreamExecutor* create_stream_executor(TF_Status* status) { + SE_StreamExecutor* se_fns = new SE_StreamExecutor(); + se->memcpy_from_host = my_device_memcpy_from_host_function; + ... + return se; +} +void delete_device(SE_Device* device) { + -- delete device handle here -- + ... + delete device; +} +void delete_stream_executor(SE_StreamExecutor* stream_executor) { + delete stream_executor; +} +``` + +Create a new platform using `SE_NewPlatform` and register it using +`SE_RegisterPlatform`: + +```cpp +void RegisterMyCustomPlatform() { + static plugin_id_value = 123; + SE_PlatformId id; + id.id = &plugin_id_value; + int visible_device_count = 2; + + SE_Platform* custom_platform = SE_NewPlatform( + id, visible_device_count, + create_device, create_stream_executor, + delete_device, delete_stream_executor); + + TF_Status* status = TF_NewStatus(); + std::string name = "MyCustomDevice"; + SE_RegisterPlatform( + name.c_str(), name.size(), + custom_platform, + status); +} +``` + +Use static initialization to register the new platform: + +```cpp +static bool IsMyCustomPlatformRegistered = []() { + RegisterMyCustomPlatform(); + return true; +}(); +``` + +## Stream/Timer/Event representation + +API extension would require defining SE\_Stream\_st, SE\_Event\_st and +SE\_Timer\_st structs. From the point of view of TensorFlow, we will treat their +pointers as opaque. + +Underneath, StreamExecutor will rely on customized implementations of +[StreamInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=114?q=TimerInterface&ss=tensorflow%2Ftensorflow), +[TimerInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=145?q=TimerInterface&ss=tensorflow%2Ftensorflow) +and +[EventInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=76?q=TimerInterface&ss=tensorflow%2Ftensorflow). +For example, Stream customization might look as follows: + +```cpp +class CStream : public StreamInterface { + public: + explicit CStream(TF_StreamExecutor* device, + TF_StreamExecutor* stream_executor) : + device_(device), stream_executor_(stream_executor), + stream_handle_(nullptr) { + } + ~CStream() override { + Destroy(); + } + + void Init() { + stream_handle_ = stream_executor_->create_stream(device_); + } + + void Destroy() { + if (stream_handle_ != nullptr) { + stream_executor_->delete_stream(device_, stream_handle_); + stream_handle_ = nullptr; + } + } + + SE_Stream Handle() { + return stream_handle_; + } + + private: + TF_StreamExecutor* device_; // not owned + TF_StreamExecutorFns* stream_executor_; // not owned + SE_Stream stream_handle_; +}; +``` + +## Platform registration + +This document just introduces the StreamExecutorInterface C API itself. RFC(s) +for how this API will be used inside TensorFlow will be sent separately. + +Specifically, external contributors from Intel are planning to send an RFC for a +CustomDevice registration that uses this API. + +## Stability / User Impact + +The C API will be placed under _tensorflow/c/experimental/_ directory. +Initially, we won’t have any compatibility guarantees. At the same time we will +make the best effort to perform any updates in a backwards compatible way. For +e.g. we plan to keep track of struct sizes. + +After a bake-in period of 6 months we will revisit and consider moving the API +to _tensorflow/c/ _directory. + +## Alternatives Considered + +* **Forking:** Contributors could always fork the TensorFlow repository, + directly make changes there to add a device, and release custom TensorFlow + packages. However, keeping forked copy in sync with the main repository can + be challenging and tedious, especially if some breakages cannot be fixed and + the code diverges. +* **Designing a new C API instead of StreamExecutor:** We are transitioning to + the new TensorFlow stack soon. Since the current stack’s code might not be + compatible with the new stack, we decided to stick with the existing + StreamExecutorInterface to minimize throw-away efforts. + +## Performance Implications + +The C API should not affect TensorFlow’s performance. Using the C API to connect +a device modularly would help save build time (compared to adding code directly +to the repository.) + +## Dependencies + +* This proposal doesn’t add any new dependencies to TensorFlow. +* This proposal doesn’t affect any projects dependent on TensorFlow. + +## Engineering Impact + +* The C API would increase the binary size and the build time, but not + significantly so. We don’t expect it to affect startup time / test times. +* The TensorFlow DevInfra team will maintain this code. + +## Platforms and Environments + +* **Platforms:** The C API should work on all platforms supported by + TensorFlow, apart from embedded/mobile platforms. It does not impact + automatic code generation or mobile stripping tooling. We don’t expect it to + interact with transformation tools. +* **Execution environments:** The C API should work on any standard execution + environments. + +## Best Practices + +* Going forward, Modular TensorFlow will be the only way to integrate new + devices to the current TensorFlow stack. +* For device integrations that can be done in 2021 or later, we strongly + encourage waiting to integrate with the new TensorFlow stack instead. + +## Compatibility + +How will this proposal interact with other parts of the TensorFlow Ecosystem? + +* **TFLite:** We don’t plan to make this work for TFLite. +* **Distribution strategies:** The C API should not impede them. +* **tf.function:** The C API would not interact with tf.function. +* **GPU/TPU:** Certain GPUs and TPUs are already supported in TensorFlow and + wouldn’t need this C API. Other GPU/devices can use this C API if the + functionality coverage is sufficient for them. +* **SavedModel: **The C API will not be serialized to a SavedModel. From 0de0f083d64c2128aba47c919f99f5f047850bdc Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 12 Jun 2020 11:49:59 -0700 Subject: [PATCH 02/58] Added Questions and Discussion Topics section + a few other updates --- rfcs/20200612-stream-executor-c-api.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 66cb0d71d..c262c016b 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -418,14 +418,6 @@ class CStream : public StreamInterface { }; ``` -## Platform registration - -This document just introduces the StreamExecutorInterface C API itself. RFC(s) -for how this API will be used inside TensorFlow will be sent separately. - -Specifically, external contributors from Intel are planning to send an RFC for a -CustomDevice registration that uses this API. - ## Stability / User Impact The C API will be placed under _tensorflow/c/experimental/_ directory. @@ -463,7 +455,8 @@ to the repository.) * The C API would increase the binary size and the build time, but not significantly so. We don’t expect it to affect startup time / test times. -* The TensorFlow DevInfra team will maintain this code. +* The TensorFlow DevInfra team will maintain this code. StreamExecutor C API + will be packaged along with other C APIs that TensorFlow currently has. ## Platforms and Environments @@ -492,3 +485,8 @@ How will this proposal interact with other parts of the TensorFlow Ecosystem? wouldn’t need this C API. Other GPU/devices can use this C API if the functionality coverage is sufficient for them. * **SavedModel: **The C API will not be serialized to a SavedModel. + +## Questions and Discussion Topics + +* Any comments on the API design? Any missing functionality? +* Please let us know if you plan to use this C API for device integration. From 2e299828c9081536554e0e2ecd94025940d8cb7a Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 12 Jun 2020 11:54:32 -0700 Subject: [PATCH 03/58] Update title --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index c262c016b..5f42e0935 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -1,4 +1,4 @@ -# Title of RFC +# StreamExecutor C API | Status | (Proposed / Accepted / Implemented / Obsolete) | | :------------ | :------------------------------------------------------ | From 13f3811a65180cdf357035944976b572b921b837 Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 12 Jun 2020 12:11:53 -0700 Subject: [PATCH 04/58] Fix table lines by not wrapping them --- rfcs/20200612-stream-executor-c-api.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 5f42e0935..db2391ef8 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -2,11 +2,8 @@ | Status | (Proposed / Accepted / Implemented / Obsolete) | | :------------ | :------------------------------------------------------ | -| **RFC #** | [NNN](https://github.com/tensorflow/community/pull/NNN) | -: : (update when you have community PR #) : -| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn | -: : Koanantakool (penporn@google.com), Russell Power : -: : (power@google.com), Yi Situ (yisitu@google.com) : +| **RFC #** | [NNN](https://github.com/tensorflow/community/pull/NNN) (update when you have community PR #) | +| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Russell Power (power@google.com), Yi Situ (yisitu@google.com) | | **Sponsor** | Gunhan Gulsoy (gunan@tensorflow.org) | | **Updated** | 2020-06-11 | From e4f7b0c4e8ae5b0d8998acf0b22b118483430485 Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 12 Jun 2020 12:17:29 -0700 Subject: [PATCH 05/58] Updated link and status --- rfcs/20200612-stream-executor-c-api.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index db2391ef8..f241ba25f 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -1,9 +1,11 @@ # StreamExecutor C API -| Status | (Proposed / Accepted / Implemented / Obsolete) | +| Status | Proposed | | :------------ | :------------------------------------------------------ | -| **RFC #** | [NNN](https://github.com/tensorflow/community/pull/NNN) (update when you have community PR #) | -| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Russell Power (power@google.com), Yi Situ (yisitu@google.com) | +| **RFC #** | [257](https://github.com/tensorflow/community/pull/257) | +| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn | +: : Koanantakool (penporn@google.com), Russell Power : +: : (power@google.com), Yi Situ (yisitu@google.com) : | **Sponsor** | Gunhan Gulsoy (gunan@tensorflow.org) | | **Updated** | 2020-06-11 | From 9cc21e51d614f03a29efac800fa7bd39c1bd9fd3 Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 12 Jun 2020 12:19:23 -0700 Subject: [PATCH 06/58] Fix table again --- rfcs/20200612-stream-executor-c-api.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index f241ba25f..b5af68e52 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -3,10 +3,8 @@ | Status | Proposed | | :------------ | :------------------------------------------------------ | | **RFC #** | [257](https://github.com/tensorflow/community/pull/257) | -| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn | -: : Koanantakool (penporn@google.com), Russell Power : -: : (power@google.com), Yi Situ (yisitu@google.com) : -| **Sponsor** | Gunhan Gulsoy (gunan@tensorflow.org) | +| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Russell Power (power@google.com), Yi Situ (yisitu@google.com) | +| **Sponsor** | Gunhan Gulsoy (gunan@google.com) | | **Updated** | 2020-06-11 | # Objective From a7840c8dc6c73e16b963919d0f8b03d3c26463e8 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Fri, 12 Jun 2020 18:14:29 -0700 Subject: [PATCH 07/58] Minor edits. --- rfcs/20200612-stream-executor-c-api.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index b5af68e52..7383f935f 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -10,18 +10,18 @@ # Objective Provide basic device management C API to allow new devices to modularly connect -to the legacy TensorFlow runtime. +to the current TensorFlow runtime. ## Goals * C API wrapper of a subset of methods in StreamExecutorInterface. -* Eventual API and ABI stability +* Best-effort API and ABI stability after an initial experimental phase. ## Non-goals * Compatibility with the [new TensorFlow stack](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html). -* Generic potpourri C API for all kinds of devices. +* APIs that will expose all device-specific capabilities. # Motivation @@ -36,7 +36,7 @@ device addition in a modular manner: contributors code outside of the TensorFlow repository and distribute a binary module which would connect to TensorFlow at runtime through a stable application binary interface (ABI). -The new TensorFlow stack, composed of +The new TensorFlow stack, based on [TFRT](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html) and [MLIR](https://www.tensorflow.org/mlir), is designed with this in mind. However, it is still in an active development phase and is not ready for third-party @@ -48,16 +48,14 @@ code reuse.) In the meantime, we plan to provide limited device integration support for the current TensorFlow stack through [Modular TensorFlow](https://github.com/tensorflow/community/blob/master/rfcs/20190305-modular-tensorflow.md). -We anticipate three main functionalities within a device plugin module: +We anticipate three major functionalities within a device plugin module: * Device registration: Will be addressed in a different RFC. * Device management: The focus of this RFC. * Kernel and op registration and implementation: [C API](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). -A key part of TensorFlow device implementation is the class called -[StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73), -which handles work execution and memory management. It provides a set of methods +[StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73) is TensorFlow's main device manager, responsible for work execution and memory management. It provides a set of methods (such as [Memcpy](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=240)) that can be customized for a particular device. @@ -74,6 +72,10 @@ A decoupled way to add a new device to TensorFlow. * Faster time-to-solution: Does not need code review from the TensorFlow team. * Lower maintenance efforts: Only C-API-related changes could break the integration. Unrelated TensorFlow changes would not break the code. + * The C APIs may be changed during the initial experimental phase based + on developer experience and feedback. When the APIs become more mature, + we will try to keep them stable (in a best-effort manner) until the new + TensorFlow stack is available. # Design Proposal @@ -423,7 +425,7 @@ make the best effort to perform any updates in a backwards compatible way. For e.g. we plan to keep track of struct sizes. After a bake-in period of 6 months we will revisit and consider moving the API -to _tensorflow/c/ _directory. +to _tensorflow/c/_ directory. ## Alternatives Considered @@ -467,7 +469,7 @@ to the repository.) ## Best Practices * Going forward, Modular TensorFlow will be the only way to integrate new - devices to the current TensorFlow stack. + third-party devices to the current TensorFlow stack. * For device integrations that can be done in 2021 or later, we strongly encourage waiting to integrate with the new TensorFlow stack instead. @@ -481,7 +483,7 @@ How will this proposal interact with other parts of the TensorFlow Ecosystem? * **GPU/TPU:** Certain GPUs and TPUs are already supported in TensorFlow and wouldn’t need this C API. Other GPU/devices can use this C API if the functionality coverage is sufficient for them. -* **SavedModel: **The C API will not be serialized to a SavedModel. +* **SavedModel:** The C API will not be serialized to a SavedModel. ## Questions and Discussion Topics From cba7123860b061b00c58bd68f93c066887a95ca9 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Mon, 15 Jun 2020 17:46:37 -0700 Subject: [PATCH 08/58] Minor fixes. --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 7383f935f..7bc6f0572 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -20,7 +20,7 @@ to the current TensorFlow runtime. ## Non-goals * Compatibility with the - [new TensorFlow stack](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html). + [new TensorFlow runtime stack](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html). * APIs that will expose all device-specific capabilities. # Motivation @@ -53,7 +53,7 @@ We anticipate three major functionalities within a device plugin module: * Device registration: Will be addressed in a different RFC. * Device management: The focus of this RFC. * Kernel and op registration and implementation: - [C API](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). + [RFC Accepted](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). [C API implemented](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/c/). [StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73) is TensorFlow's main device manager, responsible for work execution and memory management. It provides a set of methods (such as From 1da7dc4481f7e513ade8a6fc7930e1e7ea0a740d Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Mon, 15 Jun 2020 18:01:04 -0700 Subject: [PATCH 09/58] Change nanoseconds and microseconds values to unsigned, remove header --- rfcs/20200612-stream-executor-c-api.md | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 7bc6f0572..48a7bc1a2 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -88,23 +88,6 @@ plan to wrap only a subset of key StreamExecutorInterface functionality. See proposed C API below: ```cpp -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - -#ifdef __cplusplus -extern "C" { -#endif - typedef SE_Stream_st* SE_Stream; typedef SE_Event_st* SE_Event; typedef SE_Timer_st* SE_Timer; @@ -116,8 +99,8 @@ typedef struct SE_PlatformId { typedef struct SE_TimerFns { const size_t struct_size; - int64_t (*nanoseconds)(SE_Timer timer); - int64_t (*microseconds)(SE_Timer timer); + uint64_t (*nanoseconds)(SE_Timer timer); + uint64_t (*microseconds)(SE_Timer timer); } SE_Timer; typedef struct SE_AllocatorStats { From f9a332215d7a99c1fd8f7ce6f86fe3b62be04657 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Mon, 15 Jun 2020 18:03:08 -0700 Subject: [PATCH 10/58] Update sentence that talks about bake-in period. --- rfcs/20200612-stream-executor-c-api.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 48a7bc1a2..5323a1d01 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -407,8 +407,7 @@ Initially, we won’t have any compatibility guarantees. At the same time we wil make the best effort to perform any updates in a backwards compatible way. For e.g. we plan to keep track of struct sizes. -After a bake-in period of 6 months we will revisit and consider moving the API -to _tensorflow/c/_ directory. +We will have an initial bake-in period before we consider moving the API out of experimental directory. ## Alternatives Considered From 54459f36b7d755a72f6dba84ec69f1521390b439 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Tue, 16 Jun 2020 11:36:07 -0700 Subject: [PATCH 11/58] Change updated date --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 5323a1d01..58bbe2f0d 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -5,7 +5,7 @@ | **RFC #** | [257](https://github.com/tensorflow/community/pull/257) | | **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Russell Power (power@google.com), Yi Situ (yisitu@google.com) | | **Sponsor** | Gunhan Gulsoy (gunan@google.com) | -| **Updated** | 2020-06-11 | +| **Updated** | 2020-06-16 | # Objective From 015b13a9fa796f84085494e3cb4bed69cc4c691f Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Tue, 23 Jun 2020 11:38:35 -0700 Subject: [PATCH 12/58] Rename TF_StreamExecutor --> SE_Device, TF_StreamExecutorFns --> SE_StreamExecutor --- rfcs/20200612-stream-executor-c-api.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 58bbe2f0d..d4b19e356 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -369,8 +369,8 @@ For example, Stream customization might look as follows: ```cpp class CStream : public StreamInterface { public: - explicit CStream(TF_StreamExecutor* device, - TF_StreamExecutor* stream_executor) : + explicit CStream(SE_Device* device, + SE_StreamExecutor* stream_executor) : device_(device), stream_executor_(stream_executor), stream_handle_(nullptr) { } @@ -394,8 +394,8 @@ class CStream : public StreamInterface { } private: - TF_StreamExecutor* device_; // not owned - TF_StreamExecutorFns* stream_executor_; // not owned + SE_Device* device_; // not owned + SE_StreamExecutor* stream_executor_; // not owned SE_Stream stream_handle_; }; ``` From e0baefc92576ac6971e8a2005cbad52cda1d20da Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Fri, 26 Jun 2020 10:04:59 -0700 Subject: [PATCH 13/58] Update 20200612-stream-executor-c-api.md Defined a portable bool type TF_BOOL. Added structure size definitions and updated usage examples. Fixed typos where member functions are supposed to be member function pointers. Fixed missing extern "C". --- rfcs/20200612-stream-executor-c-api.md | 91 +++++++++++++++++--------- 1 file changed, 60 insertions(+), 31 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index d4b19e356..173cbf0b4 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -88,40 +88,62 @@ plan to wrap only a subset of key StreamExecutorInterface functionality. See proposed C API below: ```cpp +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + typedef SE_Stream_st* SE_Stream; typedef SE_Event_st* SE_Event; typedef SE_Timer_st* SE_Timer; typedef TF_Status* (*TF_StatusCallbackFn)(void*); +#ifndef TF_BOOL_DEFINED +#define TF_BOOL int8_t +#endif // TF_BOOL_DEFINED + +#ifndef TF_OFFSET_OF_END +#define TF_OFFSET_OF_END(TYPE, MEMBER) (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif // TF_OFFSET_OF_END + typedef struct SE_PlatformId { + size_t struct_size; void* id; // aka stream_executor::Platform::Id } SE_PlatformId; -typedef struct SE_TimerFns { - const size_t struct_size; +#define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) + +typedef struct SE_Timer { + size_t struct_size; uint64_t (*nanoseconds)(SE_Timer timer); uint64_t (*microseconds)(SE_Timer timer); } SE_Timer; +#define SE_TIMER_STRUCT_SIZE TF_OFFSET_OF_END(SE_Timer, microseconds) + typedef struct SE_AllocatorStats { - const size_t struct_size; + size_t struct_size; int64_t num_allocs; int64_t bytes_in_use; int64_t peak_bytes_in_use; int64_t largest_alloc_size; - bool has_bytes_limit; + int8_t has_bytes_limit; int64_t bytes_limit; int64_t bytes_reserved; int64_t peak_bytes_reserved; - bool has_bytes_reservable_limit; + int8_t has_bytes_reservable_limit; int64_t bytes_reservable_limit; int64_t largest_free_block_bytes; } SE_AllocatorStats; +#define SE_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SE_AllocatorStats, largest_free_block_bytes) + typedef enum SE_EventStatus { SE_EVENT_UNKNOWN, SE_EVENT_ERROR, @@ -130,12 +152,14 @@ typedef enum SE_EventStatus { } SE_EventStatus; typedef struct SE_Options { - const size_t struct_size; - int ordinal; + size_t struct_size; + int32_t ordinal; }; +#define SE_OPTIONS_STRUCT_SIZE TF_OFFSET_OF_END(SE_Options, ordinal) + typedef struct SE_Device { - const size_t struct_size; + size_t struct_size; char* name; size_t name_len; @@ -147,8 +171,10 @@ typedef struct SE_Device { void* data; }; +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) + typedef struct SE_StreamExecutor { - const size_t struct_size; + size_t struct_size; /*** ALLOCATION CALLBACKS ***/ // Synchronously allocates size bytes on the underlying platform and returns @@ -157,6 +183,7 @@ typedef struct SE_StreamExecutor { TF_DeviceMemoryBase* (*allocate)( SE_Device* se, uint64_t size, int64_t memory_space); + // Deallocate the DeviceMemory previously allocated via this interface. // Deallocation of a nullptr-representative value is permitted. void (*deallocate)( @@ -164,12 +191,12 @@ typedef struct SE_StreamExecutor { // Return allocator statistics. - bool (*get_allocator_stats)(SE_Device* executor, + TF_BOOL (*get_allocator_stats)(SE_Device* executor, SE_AllocatorStats* stats); // Returns the underlying device memory usage information, if it is available. // If it is not available (false is returned), free/total may not be // initialized. - bool (*device_memory_usage)( + TF_BOOL (*device_memory_usage)( SE_Device* executor, int64_t* free, int64_t* total); @@ -184,7 +211,7 @@ typedef struct SE_StreamExecutor { // Causes dependent to not begin execution until other has finished its // last-enqueued work. - bool (*create_stream_dependency)( + TF_BOOL (*create_stream_dependency)( SE_Device* executor, SE_Stream dependent, SE_Stream other); @@ -223,18 +250,18 @@ typedef struct SE_StreamExecutor { void (*delete_timer)(SE_Device* executor, SE_Timer timer); // Records a start event for an interval timer. - bool (*start_timer)( + TF_BOOL (*start_timer)( SE_Device* executor, SE_Stream stream, SE_Timer timer); // Records a stop event for an interval timer. - bool (*stop_timer)( + TF_BOOL (*stop_timer)( SE_Device* executor, SE_Stream stream, SE_Timer timer); /*** MEMCPY CALLBACKS ***/ // Entrains a memcpy operation onto stream, with a host destination location // host_dst and a device memory source, with target size size. - bool (*memcpy_to_host)( + TF_BOOL (*memcpy_to_host)( SE_Device* executor, SE_Stream stream, void* host_dst, const SE_DeviceMemoryBase* device_src, @@ -243,7 +270,7 @@ typedef struct SE_StreamExecutor { // Entrains a memcpy operation onto stream, with a device destination location // and a host memory source, with target size size - bool (*memcpy_from_host)( + TF_BOOL (*memcpy_from_host)( SE_Device* executor, SE_Stream stream, SE_DeviceMemoryBase* device_dst, const void* host_src, uint64_t size); @@ -257,35 +284,37 @@ typedef struct SE_StreamExecutor { // Synchronizes all activity occurring in the StreamExecutor's context (most // likely a whole device). - bool (*synchronize_all_activity)(SE_Device* executor); + TF_BOOL (*synchronize_all_activity)(SE_Device* executor); // Obtains metadata about the underlying device. - void fill_device_description(SE_Device* executor, + void (*fill_device_description)(SE_Device* executor, SE_DeviceDescription* description, TF_Status* status); // Entrains on a stream a user-specified function to be run on the host. - bool host_callback(SE_Device* executor, SE_Stream* stream, + TF_BOOL (*host_callback)(SE_Device* executor, SE_Stream* stream, TF_StatusCallbackFn callback_fn, void* ctx); -} SE_Device; +} SE_StreamExecutor; + +#define SE_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SE_StreamExecutor, host_callback) -TF_CAPI_EXPORT extern SE_Platform* SE_NewPlatform( - SE_PlatformId id, - int visible_device_count, +TF_CAPI_EXPORT SE_Platform* SE_NewPlatform( + SE_PlatformId* id, + int32_t visible_device_count, SE_Device* (*create_device)(SE_Options* options, TF_Status* status), SE_StreamExecutor* (*create_stream_executor)(TF_Status* status), void (*delete_device)(SE_Device* device), void (*delete_stream_executor)(SE_StreamExecutor* stream_executor); ); -TF_CAPI_EXPORT extern void SE_RegisterPlatform( +TF_CAPI_EXPORT void SE_RegisterPlatform( char* name, size_t name_len, SE_Platform* platform, TF_Status* status); #ifdef __cplusplus -} /* end extern "C" */ +} // extern "C" #endif ``` @@ -299,13 +328,13 @@ Define functions that create and delete `SE_Device` and `SE_StreamExecutor`: ```cpp SE_Device* create_device(SE_Options* options, TF_Status* status) { - SE_Device* se = new SE_Device(); + SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; se->device_handle = get_my_device_handle(); ... return se; } SE_StreamExecutor* create_stream_executor(TF_Status* status) { - SE_StreamExecutor* se_fns = new SE_StreamExecutor(); + SE_StreamExecutor* se_fns = new SE_StreamExecutor{ SE_STREAMEXECUTOR_STRUCT_SIZE }; se->memcpy_from_host = my_device_memcpy_from_host_function; ... return se; @@ -325,13 +354,13 @@ Create a new platform using `SE_NewPlatform` and register it using ```cpp void RegisterMyCustomPlatform() { - static plugin_id_value = 123; - SE_PlatformId id; + static const int32_t plugin_id_value = 123; + SE_PlatformId id{ SE_PLATFORMID_STRUCT_SIZE }; id.id = &plugin_id_value; - int visible_device_count = 2; + int32_t visible_device_count = 2; SE_Platform* custom_platform = SE_NewPlatform( - id, visible_device_count, + &id, visible_device_count, create_device, create_stream_executor, delete_device, delete_stream_executor); From 0bef9d10ef10e2f07244af1dff8340f521ecbf78 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Fri, 26 Jun 2020 10:33:45 -0700 Subject: [PATCH 14/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 173cbf0b4..625258c32 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -206,8 +206,8 @@ typedef struct SE_StreamExecutor { // internals. void (*create_stream)(SE_Device* executor, SE_Stream*, TF_Status*); - // Deletes SE_Stream and deallocates any underlying resources. - void (*delete_stream)(SE_Device* executor, SE_Stream stream); + // Destroys SE_Stream and deallocates any underlying resources. + void (*destroy_stream)(SE_Device* executor, SE_Stream stream); // Causes dependent to not begin execution until other has finished its // last-enqueued work. @@ -224,8 +224,8 @@ typedef struct SE_StreamExecutor { void (*create_event)( SE_Device* executor, TF_Event* event, TF_Status* status); - // Delete SE_Event and perform any platform-specific deallocation and cleanup of an event. - void (*delete_event)( + // Destroy SE_Event and perform any platform-specific deallocation and cleanup of an event. + void (*destroy_event)( SE_Device* executor, SE_Event event, TF_Status* status); // Requests the current status of the event from the underlying platform. @@ -246,8 +246,8 @@ typedef struct SE_StreamExecutor { // internals. void (*create_timer)(SE_Device* executor, SE_Timer* timer, TF_Status* status); - // Deletes timer and deallocates timer resources on the underlying platform. - void (*delete_timer)(SE_Device* executor, SE_Timer timer); + // Destroy timer and deallocates timer resources on the underlying platform. + void (*destroy_timer)(SE_Device* executor, SE_Timer timer); // Records a start event for an interval timer. TF_BOOL (*start_timer)( @@ -302,9 +302,9 @@ TF_CAPI_EXPORT SE_Platform* SE_NewPlatform( SE_PlatformId* id, int32_t visible_device_count, SE_Device* (*create_device)(SE_Options* options, TF_Status* status), + void (*destroy_device)(SE_Device* device), SE_StreamExecutor* (*create_stream_executor)(TF_Status* status), - void (*delete_device)(SE_Device* device), - void (*delete_stream_executor)(SE_StreamExecutor* stream_executor); + void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); ); TF_CAPI_EXPORT void SE_RegisterPlatform( @@ -324,7 +324,7 @@ TF_CAPI_EXPORT void SE_RegisterPlatform( ## Usage example -Define functions that create and delete `SE_Device` and `SE_StreamExecutor`: +Define functions that create and destroy `SE_Device` and `SE_StreamExecutor`: ```cpp SE_Device* create_device(SE_Options* options, TF_Status* status) { @@ -339,12 +339,12 @@ SE_StreamExecutor* create_stream_executor(TF_Status* status) { ... return se; } -void delete_device(SE_Device* device) { - -- delete device handle here -- +void destroy_device(SE_Device* device) { + -- destroy device handle here -- ... delete device; } -void delete_stream_executor(SE_StreamExecutor* stream_executor) { +void destroy_stream_executor(SE_StreamExecutor* stream_executor) { delete stream_executor; } ``` From aecb20ab0cb060d0ae331954d2d8c68a73240e41 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Fri, 26 Jun 2020 14:58:22 -0700 Subject: [PATCH 15/58] Link to PluggableDevice RFC. --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index d4b19e356..85c5cc721 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -48,9 +48,9 @@ code reuse.) In the meantime, we plan to provide limited device integration support for the current TensorFlow stack through [Modular TensorFlow](https://github.com/tensorflow/community/blob/master/rfcs/20190305-modular-tensorflow.md). -We anticipate three major functionalities within a device plugin module: +We anticipate three basic functionalities within a device plugin module: -* Device registration: Will be addressed in a different RFC. +* Device registration: Addressed in a different RFC, [Adding Pluggable Device for TensorFlow](https://github.com/tensorflow/community/pull/262). * Device management: The focus of this RFC. * Kernel and op registration and implementation: [RFC Accepted](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). [C API implemented](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/c/). From d64f1308bd77d2f5ca48d30e819c443527b43a82 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 29 Jun 2020 12:00:39 -0700 Subject: [PATCH 16/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 625258c32..7a4029edd 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -115,11 +115,11 @@ typedef struct SE_PlatformId { #define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) -typedef struct SE_Timer { +typedef struct SE_TimerFns { size_t struct_size; uint64_t (*nanoseconds)(SE_Timer timer); uint64_t (*microseconds)(SE_Timer timer); -} SE_Timer; +} SE_TimerFns; #define SE_TIMER_STRUCT_SIZE TF_OFFSET_OF_END(SE_Timer, microseconds) From b956b6846b29a1713933216a76ec9aa39a21b1dc Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Tue, 30 Jun 2020 11:35:23 -0700 Subject: [PATCH 17/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 7a4029edd..7b50fe65b 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -101,7 +101,7 @@ typedef SE_Timer_st* SE_Timer; typedef TF_Status* (*TF_StatusCallbackFn)(void*); #ifndef TF_BOOL_DEFINED -#define TF_BOOL int8_t +#define TF_BOOL unsigned char #endif // TF_BOOL_DEFINED #ifndef TF_OFFSET_OF_END From 28a3904714381d5a5c0f64794f35dfed3acbe5a7 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Tue, 30 Jun 2020 12:07:10 -0700 Subject: [PATCH 18/58] Update 20200612-stream-executor-c-api.md Small fix such that APIs pass const char* types. --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 9d4200196..197438129 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -160,7 +160,7 @@ typedef struct SE_Options { typedef struct SE_Device { size_t struct_size; - char* name; + const char* name; size_t name_len; // Device vendor can store handle to their device representation @@ -308,7 +308,7 @@ TF_CAPI_EXPORT SE_Platform* SE_NewPlatform( ); TF_CAPI_EXPORT void SE_RegisterPlatform( - char* name, + const char* name, size_t name_len, SE_Platform* platform, TF_Status* status); From 5473c553d43944f46acc24388c5c27fb800a50f3 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Wed, 1 Jul 2020 19:30:01 -0700 Subject: [PATCH 19/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 197438129..0930d27d2 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -220,9 +220,9 @@ typedef struct SE_StreamExecutor { TF_Status* status); /*** EVENT CALLBACKS ***/ - // Create TF_Event. Performs platform-specific allocation and initialization of an event. + // Create SE_Event. Performs platform-specific allocation and initialization of an event. void (*create_event)( - SE_Device* executor, TF_Event* event, TF_Status* status); + SE_Device* executor, SE_Event* event, TF_Status* status); // Destroy SE_Event and perform any platform-specific deallocation and cleanup of an event. void (*destroy_event)( From 340a4de7c094c06c6b2c4766e08529fc11e7ff2c Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Wed, 1 Jul 2020 19:31:25 -0700 Subject: [PATCH 20/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 0930d27d2..48637a184 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -121,7 +121,7 @@ typedef struct SE_TimerFns { uint64_t (*microseconds)(SE_Timer timer); } SE_TimerFns; -#define SE_TIMER_STRUCT_SIZE TF_OFFSET_OF_END(SE_Timer, microseconds) +#define SE_TIMER_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) typedef struct SE_AllocatorStats { size_t struct_size; From aca4ecb0549df479b4e52217cb963a6ac3098e6f Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 6 Jul 2020 15:23:57 -0700 Subject: [PATCH 21/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 48637a184..5cc29bc06 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -154,7 +154,7 @@ typedef enum SE_EventStatus { typedef struct SE_Options { size_t struct_size; int32_t ordinal; -}; +} SE_Options; #define SE_OPTIONS_STRUCT_SIZE TF_OFFSET_OF_END(SE_Options, ordinal) @@ -169,7 +169,7 @@ typedef struct SE_Device { // Any kind of data that plugin device might want to store. void* data; -}; +} SE_Device; #define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) From 49cde60767435a6cfe58da52e4d15c973bf5db63 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 6 Jul 2020 15:56:36 -0700 Subject: [PATCH 22/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 5cc29bc06..fd912241b 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -121,7 +121,7 @@ typedef struct SE_TimerFns { uint64_t (*microseconds)(SE_Timer timer); } SE_TimerFns; -#define SE_TIMER_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) +#define SE_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) typedef struct SE_AllocatorStats { size_t struct_size; From 22bfc8ff379b62f86a6ae69a18d3103690d8c645 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 6 Jul 2020 15:57:53 -0700 Subject: [PATCH 23/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index fd912241b..ecb645d84 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -121,7 +121,7 @@ typedef struct SE_TimerFns { uint64_t (*microseconds)(SE_Timer timer); } SE_TimerFns; -#define SE_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) +#define SE_TIMERFNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) typedef struct SE_AllocatorStats { size_t struct_size; From fd9573e5d1f1dda05bac24d417a074ad48434a02 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 6 Jul 2020 21:39:13 -0700 Subject: [PATCH 24/58] Update 20200612-stream-executor-c-api.md --- rfcs/20200612-stream-executor-c-api.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index ecb645d84..1b486830f 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -110,6 +110,7 @@ typedef TF_Status* (*TF_StatusCallbackFn)(void*); typedef struct SE_PlatformId { size_t struct_size; + void* ext; void* id; // aka stream_executor::Platform::Id } SE_PlatformId; @@ -117,6 +118,7 @@ typedef struct SE_PlatformId { typedef struct SE_TimerFns { size_t struct_size; + void* ext; uint64_t (*nanoseconds)(SE_Timer timer); uint64_t (*microseconds)(SE_Timer timer); } SE_TimerFns; @@ -125,6 +127,7 @@ typedef struct SE_TimerFns { typedef struct SE_AllocatorStats { size_t struct_size; + void* ext; int64_t num_allocs; int64_t bytes_in_use; int64_t peak_bytes_in_use; @@ -153,6 +156,7 @@ typedef enum SE_EventStatus { typedef struct SE_Options { size_t struct_size; + void* ext; int32_t ordinal; } SE_Options; @@ -160,6 +164,7 @@ typedef struct SE_Options { typedef struct SE_Device { size_t struct_size; + void* ext; const char* name; size_t name_len; @@ -175,6 +180,7 @@ typedef struct SE_Device { typedef struct SE_StreamExecutor { size_t struct_size; + void* ext; /*** ALLOCATION CALLBACKS ***/ // Synchronously allocates size bytes on the underlying platform and returns From a3e44b19216b2b179f0fd5e08324463a334b5220 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Mon, 6 Jul 2020 21:40:34 -0700 Subject: [PATCH 25/58] Update 20200612-stream-executor-c-api.md Removed timer_fns as it is not used. --- rfcs/20200612-stream-executor-c-api.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 1b486830f..3d50021d1 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -116,15 +116,6 @@ typedef struct SE_PlatformId { #define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) -typedef struct SE_TimerFns { - size_t struct_size; - void* ext; - uint64_t (*nanoseconds)(SE_Timer timer); - uint64_t (*microseconds)(SE_Timer timer); -} SE_TimerFns; - -#define SE_TIMERFNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) - typedef struct SE_AllocatorStats { size_t struct_size; void* ext; From 2d530f67f9b83e673a035da07263d0b9d08a3917 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Tue, 7 Jul 2020 13:15:09 -0700 Subject: [PATCH 26/58] Update 20200612-stream-executor-c-api.md Added SE_MAJOR, SE_MINOR, SE_REVISION. --- rfcs/20200612-stream-executor-c-api.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 3d50021d1..c747e64a8 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -91,6 +91,10 @@ See proposed C API below: #include #include +#define SE_MAJOR 0 +#define SE_MINOR 0 +#define SE_REVISION 1 + #ifdef __cplusplus extern "C" { #endif From 4e239ea6347aead9dfb57cb7039caf54dc9eee59 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Tue, 7 Jul 2020 13:48:30 -0700 Subject: [PATCH 27/58] Update 20200612-stream-executor-c-api.md Updated usage of SE_TimerFns --- rfcs/20200612-stream-executor-c-api.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index c747e64a8..78b6a0eb1 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -112,10 +112,19 @@ typedef TF_Status* (*TF_StatusCallbackFn)(void*); #define TF_OFFSET_OF_END(TYPE, MEMBER) (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) #endif // TF_OFFSET_OF_END +typedef struct SE_TimerFns { + size_t struct_size; + void* ext; + uint64_t (*nanoseconds)(SE_Timer timer); + uint64_t (*microseconds)(SE_Timer timer); +} SE_TimerFns; + +#define SE_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) + typedef struct SE_PlatformId { - size_t struct_size; - void* ext; - void* id; // aka stream_executor::Platform::Id + size_t struct_size; + void* ext; + void* id; // aka stream_executor::Platform::Id } SE_PlatformId; #define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) @@ -245,10 +254,10 @@ typedef struct SE_StreamExecutor { /*** TIMER CALLBACKS ***/ // Creates TF_Timer. Allocates timer resources on the underlying platform and initializes its // internals. - void (*create_timer)(SE_Device* executor, SE_Timer* timer, TF_Status* status); + void (*create_timer)(SE_Device* executor, SE_Timer* timer, SE_TimerFns** timer_fns, TF_Status* status); // Destroy timer and deallocates timer resources on the underlying platform. - void (*destroy_timer)(SE_Device* executor, SE_Timer timer); + void (*destroy_timer)(SE_Device* executor, SE_Timer timer, SE_TimerFns* timer_fns); // Records a start event for an interval timer. TF_BOOL (*start_timer)( From 078c7e774aca785677f535752b095d80abababbb Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Tue, 7 Jul 2020 13:49:53 -0700 Subject: [PATCH 28/58] Update 20200612-stream-executor-c-api.md Non-functional refactoring. --- rfcs/20200612-stream-executor-c-api.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 78b6a0eb1..9b028e164 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -112,6 +112,14 @@ typedef TF_Status* (*TF_StatusCallbackFn)(void*); #define TF_OFFSET_OF_END(TYPE, MEMBER) (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) #endif // TF_OFFSET_OF_END +typedef struct SE_PlatformId { + size_t struct_size; + void* ext; + void* id; // aka stream_executor::Platform::Id +} SE_PlatformId; + +#define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) + typedef struct SE_TimerFns { size_t struct_size; void* ext; @@ -121,14 +129,6 @@ typedef struct SE_TimerFns { #define SE_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) -typedef struct SE_PlatformId { - size_t struct_size; - void* ext; - void* id; // aka stream_executor::Platform::Id -} SE_PlatformId; - -#define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) - typedef struct SE_AllocatorStats { size_t struct_size; void* ext; From caf2d1f78678c286a546a6ab433b158e80d1268a Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Tue, 7 Jul 2020 18:39:22 -0700 Subject: [PATCH 29/58] Changed arguments to structs, added registration details --- rfcs/20200612-stream-executor-c-api.md | 67 +++++++++++++++++++++----- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 9b028e164..e6abb9f01 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -308,19 +308,41 @@ typedef struct SE_StreamExecutor { #define SE_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SE_StreamExecutor, host_callback) -TF_CAPI_EXPORT SE_Platform* SE_NewPlatform( - SE_PlatformId* id, - int32_t visible_device_count, - SE_Device* (*create_device)(SE_Options* options, TF_Status* status), - void (*destroy_device)(SE_Device* device), - SE_StreamExecutor* (*create_stream_executor)(TF_Status* status), - void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); -); +typedef struct SE_PlatformParams { + size_t struct_size; + SE_PlatformId* id; + + // Callbacks for creating/destroying SE_Device. + SE_Device* (*create_device)(SE_Options* options, TF_Status* status); + void (*destroy_device)(SE_Device* device); + + // Callbacks for creating/destroying SE_StreamExecutor. + SE_StreamExecutor* (*create_stream_executor)(TF_Status* status); + void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); +} SE_PlatformParams; + +#define SE_PLATFORM_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformParams, destroy_stream_executor) + +TF_CAPI_EXPORT SE_Platform* SE_NewPlatform(SE_PlatformParams params); + +typedef struct SE_PlatformRegistartionParams { + size_t struct_size; + + // Platform name + const char* name; + size_t name_len; + + // Device type name. Right now only GPU is supported. + char* type; + size_t type_len; + + SE_Platform* platform; +} SE_PlatformParams; + +#define SE_PLATFORM_REGISTRATION_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformRegistrationParams, platform) TF_CAPI_EXPORT void SE_RegisterPlatform( - const char* name, - size_t name_len, - SE_Platform* platform, + SE_PlatformRegistartionParams params, TF_Status* status); #ifdef __cplusplus @@ -328,6 +350,29 @@ TF_CAPI_EXPORT void SE_RegisterPlatform( #endif ``` +## Registration implementation + +Registration will be implemented by registering a new StreamExecutor platform as well as a new TensorFlow device with [DeviceFactory](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/core/common_runtime/device_factory.h;l=30?q=DeviceFactory). + +```cpp +void SE_RegisterPlatform( + SE_PlatformRegistrationParams params, TF_Status* status) { + // Register new platform + std::unique_ptr platform( + new stream_executor::internal::CPlatform(params)); + SE_CHECK_OK( + stream_executor::MultiPlatformManager::RegisterPlatform( + std::move(platform))); + + // Register PluggableDevice + std::string platform_name_str(params.name, params.name_len); + std::string type_str(params.type, params.type_len); + DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), priority); +} +``` + +`PluggableDevice` is covered in a separate RFC: [RFC: Adding Pluggable Device For TensorFlow](https://github.com/tensorflow/community/pull/262). + ## PlatformId `SE_PlatformId.id` should be set to a unique identifier. From b193e4e8d2afe7f418511e2e2fb1cce1be3ffa06 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 8 Jul 2020 18:24:14 -0700 Subject: [PATCH 30/58] Switch static initialization to fixed name Fixed name function called `SE_InitializePlugin` --- rfcs/20200612-stream-executor-c-api.md | 123 +++++++++++++++---------- 1 file changed, 72 insertions(+), 51 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index e6abb9f01..fbb6b8a35 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -178,6 +178,9 @@ typedef struct SE_Device { // Any kind of data that plugin device might want to store. void* data; + + // TensorFlow will set this field to `data` passed with SE_PlatformParams. + void* platform_data; } SE_Device; #define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) @@ -310,10 +313,22 @@ typedef struct SE_StreamExecutor { typedef struct SE_PlatformParams { size_t struct_size; + + // Free form data set by plugin. It will be pre-filled + // in SE_Device passed to create_device. + void* data; SE_PlatformId* id; - // Callbacks for creating/destroying SE_Device. - SE_Device* (*create_device)(SE_Options* options, TF_Status* status); + // Platform name + const char* name; + size_t name_len; + + // Device type name. Right now only GPU is supported. + char* type; + size_t type_len; + + // Callbacks for creating/destroying. + void (*create_device)(SE_Device* device, SE_Options* options, TF_Status* status); void (*destroy_device)(SE_Device* device); // Callbacks for creating/destroying SE_StreamExecutor. @@ -325,25 +340,22 @@ typedef struct SE_PlatformParams { TF_CAPI_EXPORT SE_Platform* SE_NewPlatform(SE_PlatformParams params); -typedef struct SE_PlatformRegistartionParams { +typedef struct SE_PlatformRegistrationParams { size_t struct_size; + void* ext; - // Platform name - const char* name; - size_t name_len; - - // Device type name. Right now only GPU is supported. - char* type; - size_t type_len; + // StreamExecutor C API version. + int32_t major_version; + int32_t minor_version; + int32_t revision_version; - SE_Platform* platform; -} SE_PlatformParams; + // Params must be filled by the plugin. + SE_PlatformParams params; +} SE_PlatformRegistrationParams; #define SE_PLATFORM_REGISTRATION_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformRegistrationParams, platform) -TF_CAPI_EXPORT void SE_RegisterPlatform( - SE_PlatformRegistartionParams params, - TF_Status* status); +void SE_InitializePlugin(SE_PlatformRegistrationParams* params, TF_Status* status); #ifdef __cplusplus } // extern "C" @@ -355,20 +367,35 @@ TF_CAPI_EXPORT void SE_RegisterPlatform( Registration will be implemented by registering a new StreamExecutor platform as well as a new TensorFlow device with [DeviceFactory](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/core/common_runtime/device_factory.h;l=30?q=DeviceFactory). ```cpp -void SE_RegisterPlatform( - SE_PlatformRegistrationParams params, TF_Status* status) { - // Register new platform - std::unique_ptr platform( - new stream_executor::internal::CPlatform(params)); - SE_CHECK_OK( - stream_executor::MultiPlatformManager::RegisterPlatform( - std::move(platform))); - - // Register PluggableDevice - std::string platform_name_str(params.name, params.name_len); - std::string type_str(params.type, params.type_len); - DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), priority); +typedef (*SEPluginInitFn)(SE_PlatformRegistrationParams*, TF_Status*); +... + +void* plugin = dlopen("myplugin.so", ...); +if (!plugin) { + ... output error and skip this plugin ... } +void* initialize_sym = dlsym(plugin, "SE_InitializePlugin"); +if (!initialize_sym) { + ... output error and skip this plugin ... +} +SEPluginInitFn initialize_fn = reinterpret_cast(initialize_sym); + +SE_PlatformRegistrationParams params; +TF_Status* status = TF_NewStatus(); + +initialize_fn(¶ms, status); + +// Register new platform +std::unique_ptr platform( + new stream_executor::internal::CPlatform(params)); +SE_CHECK_OK( + stream_executor::MultiPlatformManager::RegisterPlatform( + std::move(platform))); + +// Register PluggableDevice +std::string platform_name_str(params.params.name, params.params.name_len); +std::string type_str(params.params.type, params.params.type_len); +DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), priority); ``` `PluggableDevice` is covered in a separate RFC: [RFC: Adding Pluggable Device For TensorFlow](https://github.com/tensorflow/community/pull/262). @@ -404,38 +431,32 @@ void destroy_stream_executor(SE_StreamExecutor* stream_executor) { } ``` -Create a new platform using `SE_NewPlatform` and register it using -`SE_RegisterPlatform`: +Define `SE_InitializePlugin` that TensorFlow will call when registering the device plugin: ```cpp -void RegisterMyCustomPlatform() { +void SE_InitializePlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { static const int32_t plugin_id_value = 123; SE_PlatformId id{ SE_PLATFORMID_STRUCT_SIZE }; id.id = &plugin_id_value; int32_t visible_device_count = 2; - - SE_Platform* custom_platform = SE_NewPlatform( - &id, visible_device_count, - create_device, create_stream_executor, - delete_device, delete_stream_executor); - - TF_Status* status = TF_NewStatus(); - std::string name = "MyCustomDevice"; - SE_RegisterPlatform( - name.c_str(), name.size(), - custom_platform, - status); + + std::string name = "MyDevice"; + std::string type = "GPU"; + + params.params.id = id; + params.params.visible_device_count = visible_device_count; + params.params.create_device = create_device; + params.params.destroy_device = destroy_device; + params.params.create_stream_executor = create_stream_executor; + params.params.destroy_stream_executor = destroy_stream_executor; + params.params.name = name.c_str(); + params.params.name_len = name.size(); + params.params.type = type.c_str(); + params.params.type_len = type.size(); } ``` -Use static initialization to register the new platform: - -```cpp -static bool IsMyCustomPlatformRegistered = []() { - RegisterMyCustomPlatform(); - return true; -}(); -``` +TensorFlow will call `InitializeSEPlugin` when registering the plugin. ## Stream/Timer/Event representation From e7556905d534f72c93dab70dc50b0e9132431014 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 8 Jul 2020 18:30:32 -0700 Subject: [PATCH 31/58] Pass SE_StreamExecutor* to create function --- rfcs/20200612-stream-executor-c-api.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index fbb6b8a35..7cb5a3983 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -332,7 +332,7 @@ typedef struct SE_PlatformParams { void (*destroy_device)(SE_Device* device); // Callbacks for creating/destroying SE_StreamExecutor. - SE_StreamExecutor* (*create_stream_executor)(TF_Status* status); + void (*create_stream_executor)(SE_StreamExecutor*, TF_Status* status); void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); } SE_PlatformParams; @@ -409,25 +409,20 @@ DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), Define functions that create and destroy `SE_Device` and `SE_StreamExecutor`: ```cpp -SE_Device* create_device(SE_Options* options, TF_Status* status) { - SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; - se->device_handle = get_my_device_handle(); +void create_device(SE_Device* device, SE_Options* options, TF_Status* status) { + device->device_handle = get_my_device_handle(); ... - return se; } -SE_StreamExecutor* create_stream_executor(TF_Status* status) { - SE_StreamExecutor* se_fns = new SE_StreamExecutor{ SE_STREAMEXECUTOR_STRUCT_SIZE }; +void create_stream_executor(SE_StreamExecutor* se, TF_Status* status) { se->memcpy_from_host = my_device_memcpy_from_host_function; ... - return se; } void destroy_device(SE_Device* device) { -- destroy device handle here -- ... - delete device; } void destroy_stream_executor(SE_StreamExecutor* stream_executor) { - delete stream_executor; + -- perform any clean up needed for stream executor -- } ``` From dda41e3d04544217c1186c91436ace79f21e1dd0 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 8 Jul 2020 18:41:07 -0700 Subject: [PATCH 32/58] Entrains --> Enqueues --- rfcs/20200612-stream-executor-c-api.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 7cb5a3983..eccde1e4c 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -272,7 +272,7 @@ typedef struct SE_StreamExecutor { SE_Device* executor, SE_Stream stream, SE_Timer timer); /*** MEMCPY CALLBACKS ***/ - // Entrains a memcpy operation onto stream, with a host destination location + // Enqueues a memcpy operation onto stream, with a host destination location // host_dst and a device memory source, with target size size. TF_BOOL (*memcpy_to_host)( SE_Device* executor, SE_Stream stream, @@ -280,9 +280,8 @@ typedef struct SE_StreamExecutor { const SE_DeviceMemoryBase* device_src, uint64_t size); - // Entrains a memcpy operation onto stream, with a device destination location + // Enqueues a memcpy operation onto stream, with a device destination location // and a host memory source, with target size size - TF_BOOL (*memcpy_from_host)( SE_Device* executor, SE_Stream stream, SE_DeviceMemoryBase* device_dst, @@ -304,7 +303,7 @@ typedef struct SE_StreamExecutor { SE_DeviceDescription* description, TF_Status* status); - // Entrains on a stream a user-specified function to be run on the host. + // Enqueues on a stream a user-specified function to be run on the host. TF_BOOL (*host_callback)(SE_Device* executor, SE_Stream* stream, TF_StatusCallbackFn callback_fn, void* ctx); } SE_StreamExecutor; From d463d1fd85f01865c1dab79c0fcc96f452bdb303 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 8 Jul 2020 18:41:48 -0700 Subject: [PATCH 33/58] entrained --> enqueued --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index eccde1e4c..fc09e9b86 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -287,7 +287,7 @@ typedef struct SE_StreamExecutor { SE_DeviceMemoryBase* device_dst, const void* host_src, uint64_t size); - // Causes the host code to synchronously wait for operations entrained onto + // Causes the host code to synchronously wait for operations enqueued onto // stream to complete. Effectively a join on the asynchronous device // operations enqueued on the stream before this program point. void (*block_host_until_done)( From 7c947d7fe5319e20eeaf5aa0dc9b04acadd3d994 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 9 Jul 2020 12:15:39 -0700 Subject: [PATCH 34/58] Added memory_space comment to allocate --- rfcs/20200612-stream-executor-c-api.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index fc09e9b86..7e4d6fb1f 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -193,6 +193,8 @@ typedef struct SE_StreamExecutor { // Synchronously allocates size bytes on the underlying platform and returns // a DeviceMemoryBase representing that allocation. In the case of failure, // nullptr is returned. + // memory_space is reserved for a potential future usage and should be set + // to 0. TF_DeviceMemoryBase* (*allocate)( SE_Device* se, uint64_t size, int64_t memory_space); From a3b1518e7cf26fe3b133e93ce2f2adba091ede6c Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 9 Jul 2020 12:25:32 -0700 Subject: [PATCH 35/58] Clarify how StreamExecutor methods were selected --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 7e4d6fb1f..4290f86f0 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -83,7 +83,7 @@ A decoupled way to add a new device to TensorFlow. [StreamExecutorInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=166?q=StreamExecutorinterface) is quite large and some of its methods are only sporadically used. Therefore, we -plan to wrap only a subset of key StreamExecutorInterface functionality. +plan to wrap only a subset of key StreamExecutorInterface functionality. We decided on this subset based on the PluggableDevice usecase as well as potential future devices such as TPUs. See proposed C API below: From f824058aa7d92c5eaec1d0996375a001e9a6117e Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 9 Jul 2020 13:07:12 -0700 Subject: [PATCH 36/58] Remove extra * from SE_TimerFns** --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 4290f86f0..5f7b96938 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -258,8 +258,8 @@ typedef struct SE_StreamExecutor { /*** TIMER CALLBACKS ***/ // Creates TF_Timer. Allocates timer resources on the underlying platform and initializes its - // internals. - void (*create_timer)(SE_Device* executor, SE_Timer* timer, SE_TimerFns** timer_fns, TF_Status* status); + // internals, setting `timer` output variable. Sets values in `timer_fns` struct. + void (*create_timer)(SE_Device* executor, SE_Timer* timer, SE_TimerFns* timer_fns, TF_Status* status); // Destroy timer and deallocates timer resources on the underlying platform. void (*destroy_timer)(SE_Device* executor, SE_Timer timer, SE_TimerFns* timer_fns); From 3fde30274cee8fca92a3e7d9ffe332f61ee75ebd Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 9 Jul 2020 14:49:10 -0700 Subject: [PATCH 37/58] Annotated in/out arguments for create_device/create_stream_executor We should probably annotate all functions in the C API to clarify inputs and outputs. I will do that when implementing. --- rfcs/20200612-stream-executor-c-api.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 5f7b96938..10718ba67 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -329,11 +329,16 @@ typedef struct SE_PlatformParams { size_t type_len; // Callbacks for creating/destroying. - void (*create_device)(SE_Device* device, SE_Options* options, TF_Status* status); + void (*create_device)( + SE_Device* device, \\ out + SE_Options* options, \\ in + TF_Status* status); \\ out void (*destroy_device)(SE_Device* device); // Callbacks for creating/destroying SE_StreamExecutor. - void (*create_stream_executor)(SE_StreamExecutor*, TF_Status* status); + void (*create_stream_executor)( + SE_StreamExecutor*, \\ out + TF_Status* status); \\ out void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); } SE_PlatformParams; From 09e231324c3b2b1f653c3f385fd2120edd460815 Mon Sep 17 00:00:00 2001 From: Situ Yi <60493024+yisitu@users.noreply.github.com> Date: Thu, 9 Jul 2020 14:30:41 -0700 Subject: [PATCH 38/58] Create C_API_versioning_strategy.md Added details on how TensorFlow SE C API versioning works. --- .../C_API_versioning_strategy.md | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md diff --git a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md new file mode 100644 index 000000000..38ca64c07 --- /dev/null +++ b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md @@ -0,0 +1,226 @@ +**Authors**: yisitu@, penporn@, annarev@ + +**Date**: 7/9/20 + +In reply to a question on [PR #262](https://github.com/tensorflow/community/pull/262#issuecomment-653690654). + +# TensorFlow Versioning Strategy + +TensorFlow StreamExecutorInterface (SEI) uses struct_size for version checking. Struct members are not allowed to be removed or reordered. Following are concrete examples of how TensorFlow remains compatible with plug-ins when functionality is added to or removed from StreamExecutorInterface. We will be using a simplified SE_Device as an example. + +## When TensorFlow extends functionality +### Backwards compatibility +TensorFlow is compiled against a newer SEI header (v2), which has SE_Device extended with device_handle. + +**Future TensorFlow compiled against StreamExecutorInterface v2** +```cpp +// StreamExecutorInterface header version 2 +typedef struct SE_Device { + size_t struct_size; + void* next; // Always set to zero, reserved by TF for future use. + + const char* name; + size_t name_len; + void* device_handle; +} SE_Device; + +// Evaluates to 40 +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, device_handle) +``` + +The plugin was compiled against an older version of SEI header without device_handle. + +**Older Plugin compiled against StreamExecutorInterface v1** +```cpp +// StreamExecutorInterface header version 1 +typedef struct SE_Device { + size_t struct_size; + void* next; + + const char* name; + size_t name_len; +} SE_Device; + + +// Evaluates to 32 +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, name_len) + +// Plugin Implementation + +SE_Device* Plugin_CreateDevice() { + SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; + // Based on header v1, se->struct_size will be 32 + ... + return se; +} +``` + +TensorFlow checks that struct_size must be greater than the offset of device_handle before accessing it. + +```cpp +// TF Implementation + +void TF_Foo(const SE_Device* device) { + // TF checks for struct_size greater than 32. + if (device->struct_size > offsetof(SE_Device, device_handle)) { + // TF knows that device_handle can be safely read from. + DoSomething(device->device_handle); + } +} +``` + +### Forwards compatibility + +In the event that a plugin is up to date or newer, se->struct_size would have been initialized to 48. This would then pass the TF_Foo() check above and device_handle can be safely accessed. + +**Future Plugin compiled against StreamExecutorInterface v3** + +```cpp +// StreamExecutorInterface header version 3 +typedef struct SE_Device { + size_t struct_size; + void* next; + + const char* name; + size_t name_len; + void* device_handle; + void* data; // Added in v3 +} SE_Device; + +// Evaluates to 48 +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) + + +// Plugin Implementation + +SE_Device* Plugin_CreateDevice() { + SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; + // se->struct_size will be 48 + ... + return se; +} +``` + +Using the same TF_Foo() above, TF_Foo() was implemented before SE_Device::data was added after SE_Device::device_handle. Since TensorFlow only knows about the members that come before SE_Device::data, the newly added device->data will not be accessed. + +## When TensorFlow deprecates functionality +When functionality is being deprecated, there will be comments next to the member indicating so. The member is left in place to preserve the alignment and offset of the existing structure members. + +Since members are not allowed to be removed or reordered, refactors (e.g. renaming device_handle to dev_handle) or changing of member types (e.g. from int to float) are considered as deprecation. + +### Backwards compatibility +SE_Device::data has been deprecated in version 4, and a comment in the header indicated as such. + +**Future TensorFlow compiled against StreamExecutorInterface v4** + +```cpp +// StreamExecutorInterface header version 4 +typedef struct SE_Device { + size_t struct_size; + void* next; + + const char* name; + size_t name_len; + void* device_handle; + void* data; // Deprecated +} SE_Device; + +// Evaluates to 48 +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) + +// TF Implementation + +void TF_Foo(const SE_Device* device) { + // TF checks for struct_size greater than 32. + if (device->struct_size > offsetof(SE_Device, device_handle)) { + // TF knows that device_handle can be safely accessed. + DoSomething(device->device_handle); + } + + // TensorFlow removes implementation to stop using deprecated functionality. + /* + if (device->struct_size > offsetof(SE_Device, data)) { + // TF knows that device->data can be safely accessed. + DoSomething(device->data); + } + */ + +} +`` + +The plugin, being older, was initializing the recently deprecated SE_Device::data. Since TF_Foo() does not access it anymore, SE_Device::data will be safely ignored (even though it was initialized). + +### Forwards compatibility +Plugins may choose to support older TensorFlow releases that have deprecated functionality. + +In a simple case, TensorFlow is already performing input validation and capable of providing best effort forward compatibility with newer plugins. + +**Older TensorFlow compiled against StreamExecutorInterface v4 with data validation** + +```cpp +void TF_Foo(const SE_Device* device) { + ... + // TF checks for struct_size greater than offset of data, and also validates device->data. + if (device->struct_size > offsetof(SE_Device, data) && device->data != nullptr) { + // TF knows that data can be safely accessed. + DoSomething(device->data); + } +} +``` + + +This way, plugins can safely remove implementation of deprecated functionality. + +**Future Plugin compiled against StreamExecutorInterface v5** +```cpp +// StreamExecutorInterface header version 5 +typedef struct SE_Device { + size_t struct_size; + void* next; + + const char* name; + size_t name_len; + void* device_handle; + void* data; // Deprecated in v4 + void* data2; +} SE_Device; + +// Evaluates to 56 +#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data2) + + +// Plugin Implementation + +SE_Device* Plugin_CreateDevice() { + SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; + // se->struct_size will be 56 + se->device_handle = GetHandle(); + + // se->data was deprecated so ignore it. It was already zero initialized + // at “SE_Device{ SE_DEVICE_STRUCT_SIZE }” above. + + se->data2 = GetData2(); + return se; +} +``` + +In a more complex scenario, an older TensorFlow release might consume deprecated functionality for granted. + +**Older TensorFlow compiled against StreamExecutorInterface v4 without data validation** + +```cpp +void TF_Foo(const SE_Device* device) { + ... + // TF checks for struct_size greater than offset of data. + // No input validation. + if (device->struct_size > offsetof(SE_Device, data)) { + // Will crash on null pointer dereference. + Dereference(device->data); + } +} +``` + +In this case, it is recommended for plugins to continue to keep the deprecated implementation around. Once the plugin stops supporting the latest version of TensorFlow that uses the deprecated functionality, the implementation can be safely removed. This comes at the cost of maintenance of legacy deprecated code on the plugin side. + +## Limitations +* Maximum supported alignment is 8 bytes. From 9053ba611363cece317ba601f478dc98b00417c0 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Fri, 10 Jul 2020 17:19:17 -0700 Subject: [PATCH 39/58] Add a missing backtick. --- .../20200612-stream-executor-c-api/C_API_versioning_strategy.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md index 38ca64c07..1e394b718 100644 --- a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md +++ b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md @@ -146,7 +146,7 @@ void TF_Foo(const SE_Device* device) { */ } -`` +``` The plugin, being older, was initializing the recently deprecated SE_Device::data. Since TF_Foo() does not access it anymore, SE_Device::data will be safely ignored (even though it was initialized). From 539e7fa226c48fab5e75fa523142fc2de482b5c8 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 19:03:13 -0700 Subject: [PATCH 40/58] Prefix structs with SE_ or SP_ prefixes --- rfcs/20200612-stream-executor-c-api.md | 63 ++++++++++++++------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 10718ba67..519a93c0f 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -3,9 +3,9 @@ | Status | Proposed | | :------------ | :------------------------------------------------------ | | **RFC #** | [257](https://github.com/tensorflow/community/pull/257) | -| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Russell Power (power@google.com), Yi Situ (yisitu@google.com) | +| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Yi Situ (yisitu@google.com), Russell Power (power@google.com) | | **Sponsor** | Gunhan Gulsoy (gunan@google.com) | -| **Updated** | 2020-06-16 | +| **Updated** | 2020-07-15 | # Objective @@ -85,6 +85,13 @@ A decoupled way to add a new device to TensorFlow. is quite large and some of its methods are only sporadically used. Therefore, we plan to wrap only a subset of key StreamExecutorInterface functionality. We decided on this subset based on the PluggableDevice usecase as well as potential future devices such as TPUs. +Implementation conventions: + +* Structs include `struct_size` parameter. This parameter should be filled in both by TensorFlow and the plugin and can be checked to determine which struct fields are available for current version of TensorFlow. +* Struct name prefixes indicates which side of the API is responsible for populating the struct: + * `SE_` prefix: filled by TensorFlow. + * `SP_` prefix: filled by plugins, except `struct_size` which is also filled by TensorFlow when TensorFlow passes it to a callback. + See proposed C API below: ```cpp @@ -99,9 +106,9 @@ See proposed C API below: extern "C" { #endif -typedef SE_Stream_st* SE_Stream; -typedef SE_Event_st* SE_Event; -typedef SE_Timer_st* SE_Timer; +typedef SP_Stream_st* SP_Stream; +typedef SP_Event_st* SP_Event; +typedef SP_Timer_st* SP_Timer; typedef TF_Status* (*TF_StatusCallbackFn)(void*); #ifndef TF_BOOL_DEFINED @@ -120,16 +127,16 @@ typedef struct SE_PlatformId { #define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) -typedef struct SE_TimerFns { +typedef struct SP_TimerFns { size_t struct_size; void* ext; uint64_t (*nanoseconds)(SE_Timer timer); uint64_t (*microseconds)(SE_Timer timer); -} SE_TimerFns; +} SP_TimerFns; -#define SE_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) -typedef struct SE_AllocatorStats { +typedef struct SP_AllocatorStats { size_t struct_size; void* ext; int64_t num_allocs; @@ -147,9 +154,9 @@ typedef struct SE_AllocatorStats { int64_t bytes_reservable_limit; int64_t largest_free_block_bytes; -} SE_AllocatorStats; +} SP_AllocatorStats; -#define SE_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SE_AllocatorStats, largest_free_block_bytes) +#define SP_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SE_AllocatorStats, largest_free_block_bytes) typedef enum SE_EventStatus { SE_EVENT_UNKNOWN, @@ -166,7 +173,7 @@ typedef struct SE_Options { #define SE_OPTIONS_STRUCT_SIZE TF_OFFSET_OF_END(SE_Options, ordinal) -typedef struct SE_Device { +typedef struct SP_Device { size_t struct_size; void* ext; const char* name; @@ -178,14 +185,12 @@ typedef struct SE_Device { // Any kind of data that plugin device might want to store. void* data; - - // TensorFlow will set this field to `data` passed with SE_PlatformParams. - void* platform_data; -} SE_Device; -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) +} SP_Device; -typedef struct SE_StreamExecutor { +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) + +typedef struct SP_StreamExecutor { size_t struct_size; void* ext; @@ -308,16 +313,16 @@ typedef struct SE_StreamExecutor { // Enqueues on a stream a user-specified function to be run on the host. TF_BOOL (*host_callback)(SE_Device* executor, SE_Stream* stream, TF_StatusCallbackFn callback_fn, void* ctx); -} SE_StreamExecutor; +} SP_StreamExecutor; -#define SE_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SE_StreamExecutor, host_callback) +#define SP_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) -typedef struct SE_PlatformParams { +typedef struct SP_Platform { size_t struct_size; - // Free form data set by plugin. It will be pre-filled - // in SE_Device passed to create_device. + // Free form data set by plugin. void* data; + SE_PlatformId* id; // Platform name @@ -340,11 +345,9 @@ typedef struct SE_PlatformParams { SE_StreamExecutor*, \\ out TF_Status* status); \\ out void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); -} SE_PlatformParams; - -#define SE_PLATFORM_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformParams, destroy_stream_executor) +} SP_Platform; -TF_CAPI_EXPORT SE_Platform* SE_NewPlatform(SE_PlatformParams params); +#define SP_PLATFORM_SIZE TF_OFFSET_OF_END(SP_Platform, destroy_stream_executor) typedef struct SE_PlatformRegistrationParams { size_t struct_size; @@ -355,8 +358,8 @@ typedef struct SE_PlatformRegistrationParams { int32_t minor_version; int32_t revision_version; - // Params must be filled by the plugin. - SE_PlatformParams params; + // Must be filled by the plugin. + SP_Platform platform; // out } SE_PlatformRegistrationParams; #define SE_PLATFORM_REGISTRATION_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformRegistrationParams, platform) @@ -408,7 +411,7 @@ DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), ## PlatformId -`SE_PlatformId.id` should be set to a unique identifier. +Currently Platforms registered with StreamExecutor have an id parameter. ## Usage example From 36784e52ff538e9eba74a95dc3bf6c8410b731ff Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 21:03:00 -0700 Subject: [PATCH 41/58] Missed rename SE_TimerFns --> SP_TimerFns Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 519a93c0f..9e28f7415 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -134,7 +134,7 @@ typedef struct SP_TimerFns { uint64_t (*microseconds)(SE_Timer timer); } SP_TimerFns; -#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SE_TimerFns, microseconds) +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, microseconds) typedef struct SP_AllocatorStats { size_t struct_size; From e8165731cf3f3f19b6fb8645934b0e24e15a4712 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 21:03:50 -0700 Subject: [PATCH 42/58] Missed rename SE_AllocatorStats --> SP_AllocatorStats Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 9e28f7415..952904264 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -156,7 +156,7 @@ typedef struct SP_AllocatorStats { int64_t largest_free_block_bytes; } SP_AllocatorStats; -#define SP_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SE_AllocatorStats, largest_free_block_bytes) +#define SP_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) typedef enum SE_EventStatus { SE_EVENT_UNKNOWN, From e8291e1f27ac5b78a69d4316e4a8f50effd2aec6 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 21:04:17 -0700 Subject: [PATCH 43/58] Missed suggestion SE_Device --> SP_Device Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 952904264..ae4dc833a 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -188,7 +188,7 @@ typedef struct SP_Device { } SP_Device; -#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, data) typedef struct SP_StreamExecutor { size_t struct_size; From 3b9e27ee64ec6fed18d4c63a46f1ad4356589da2 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 21:04:56 -0700 Subject: [PATCH 44/58] data --> ext Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index ae4dc833a..62d0b3ebc 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -184,7 +184,7 @@ typedef struct SP_Device { void* device_handle; // Any kind of data that plugin device might want to store. - void* data; + void* ext; } SP_Device; From d9fc33a6c0a711454d23bf3da5eeb72cc28ad9fb Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 15 Jul 2020 21:25:14 -0700 Subject: [PATCH 45/58] Sync memcpy + more SE_/SP_ changes + support other types --- rfcs/20200612-stream-executor-c-api.md | 132 ++++++++++++++----------- 1 file changed, 77 insertions(+), 55 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 62d0b3ebc..d9b6e80ae 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -173,6 +173,16 @@ typedef struct SE_Options { #define SE_OPTIONS_STRUCT_SIZE TF_OFFSET_OF_END(SE_Options, ordinal) +typedef struct SE_DeviceMemoryBase { + size_t struct_size; + void* ext; + void* opaque; + uint64_t size; + uint64_t payload; +} SE_DeviceMemoryBase; + +#define SE_DEVICE_MEMORY_BASE_STRUCT_SIZE TF_OFFSET_OF_END(SE_DeviceMemoryBase, payload) + typedef struct SP_Device { size_t struct_size; void* ext; @@ -201,88 +211,88 @@ typedef struct SP_StreamExecutor { // memory_space is reserved for a potential future usage and should be set // to 0. TF_DeviceMemoryBase* (*allocate)( - SE_Device* se, uint64_t size, int64_t memory_space); + SP_Device* se, uint64_t size, int64_t memory_space); - // Deallocate the DeviceMemory previously allocated via this interface. + // Deallocate the device memory previously allocated via this interface. // Deallocation of a nullptr-representative value is permitted. void (*deallocate)( - SE_Device* se, TF_DeviceMemoryBase* memory); + SP_Device* se, SE_DeviceMemoryBase* memory); - // Return allocator statistics. - TF_BOOL (*get_allocator_stats)(SE_Device* executor, - SE_AllocatorStats* stats); + // Fill SP_AllocatorStats with allocator statistics. + TF_BOOL (*get_allocator_stats)(SP_Device* executor, + SP_AllocatorStats* stats); // Returns the underlying device memory usage information, if it is available. // If it is not available (false is returned), free/total may not be // initialized. TF_BOOL (*device_memory_usage)( - SE_Device* executor, int64_t* free, int64_t* total); + SP_Device* executor, int64_t* free, int64_t* total); /*** STREAM CALLBACKS ***/ // Creates SE_Stream. This call should also Allocate stream // resources on the underlying platform and initializes its // internals. - void (*create_stream)(SE_Device* executor, SE_Stream*, TF_Status*); + void (*create_stream)(SP_Device* executor, SP_Stream*, TF_Status*); // Destroys SE_Stream and deallocates any underlying resources. - void (*destroy_stream)(SE_Device* executor, SE_Stream stream); + void (*destroy_stream)(SP_Device* executor, SP_Stream stream); // Causes dependent to not begin execution until other has finished its // last-enqueued work. TF_BOOL (*create_stream_dependency)( - SE_Device* executor, SE_Stream dependent, - SE_Stream other); + SP_Device* executor, SP_Stream dependent, + SP_Stream other); // Without blocking the device, retrieve the current stream status. - void (*get_status)(SE_Device* executor, SE_Stream stream, + void (*get_status)(SP_Device* executor, SP_Stream stream, TF_Status* status); /*** EVENT CALLBACKS ***/ - // Create SE_Event. Performs platform-specific allocation and initialization of an event. + // Create SP_Event. Performs platform-specific allocation and initialization of an event. void (*create_event)( - SE_Device* executor, SE_Event* event, TF_Status* status); + SP_Device* executor, SP_Event* event, TF_Status* status); // Destroy SE_Event and perform any platform-specific deallocation and cleanup of an event. void (*destroy_event)( - SE_Device* executor, SE_Event event, TF_Status* status); + SP_Device* executor, SP_Event event, TF_Status* status); // Requests the current status of the event from the underlying platform. SE_EventStatus (*poll_for_event_status)( - SE_Device* executor, SE_Event event); + SP_Device* executor, SP_Event event); // Inserts the specified event at the end of the specified stream. void (*record_event)( - SE_Device* executor, SE_Stream stream, - SE_Event event, TF_Status* status); + SP_Device* executor, SP_Stream stream, + SP_Event event, TF_Status* status); // Wait for the specified event at the end of the specified stream. void (*wait_for_event)( - SE_Device* executor, SE_Stream stream, - SE_Event event, TF_Status* status); + SP_Device* executor, SP_Stream stream, + SP_Event event, TF_Status* status); /*** TIMER CALLBACKS ***/ // Creates TF_Timer. Allocates timer resources on the underlying platform and initializes its // internals, setting `timer` output variable. Sets values in `timer_fns` struct. - void (*create_timer)(SE_Device* executor, SE_Timer* timer, SE_TimerFns* timer_fns, TF_Status* status); + void (*create_timer)(SP_Device* executor, SP_Timer* timer, SP_TimerFns* timer_fns, TF_Status* status); // Destroy timer and deallocates timer resources on the underlying platform. - void (*destroy_timer)(SE_Device* executor, SE_Timer timer, SE_TimerFns* timer_fns); + void (*destroy_timer)(SP_Device* executor, SP_Timer timer, SP_TimerFns* timer_fns); // Records a start event for an interval timer. TF_BOOL (*start_timer)( - SE_Device* executor, SE_Stream stream, SE_Timer timer); + SP_Device* executor, SP_Stream stream, SP_Timer timer); // Records a stop event for an interval timer. TF_BOOL (*stop_timer)( - SE_Device* executor, SE_Stream stream, SE_Timer timer); + SP_Device* executor, SP_Stream stream, SP_Timer timer); /*** MEMCPY CALLBACKS ***/ // Enqueues a memcpy operation onto stream, with a host destination location // host_dst and a device memory source, with target size size. TF_BOOL (*memcpy_to_host)( - SE_Device* executor, SE_Stream stream, + SP_Device* executor, SP_Stream stream, void* host_dst, const SE_DeviceMemoryBase* device_src, uint64_t size); @@ -290,29 +300,41 @@ typedef struct SP_StreamExecutor { // Enqueues a memcpy operation onto stream, with a device destination location // and a host memory source, with target size size TF_BOOL (*memcpy_from_host)( - SE_Device* executor, SE_Stream stream, + SP_Device* executor, SP_Stream stream, + SE_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size); + + // Blocks the caller while a data segment of the given size is + // copied from the device source to the host destination. + TF_BOOL (*sync_memcpy_to_host)( + SP_Device* executor, + void* host_dst, + const SE_DeviceMemoryBase* device_src, + uint64_t size); + + // Blocks the caller while a data segment of the given size is + // copied from the host source to the device destination. + TF_BOOL (*sync_memcpy_from_host)( + SP_Device* executor, SE_DeviceMemoryBase* device_dst, const void* host_src, uint64_t size); - // Causes the host code to synchronously wait for operations enqueued onto - // stream to complete. Effectively a join on the asynchronous device - // operations enqueued on the stream before this program point. - void (*block_host_until_done)( - SE_Device* executor, SE_Stream stream, - TF_Status* status); + // Causes the host code to synchronously wait for the event to complete. + void (*block_host_for_event)( + SP_Device* executor, SP_Event event, TF_Status* status); // Synchronizes all activity occurring in the StreamExecutor's context (most // likely a whole device). - TF_BOOL (*synchronize_all_activity)(SE_Device* executor); + TF_BOOL (*synchronize_all_activity)(SP_Device* executor); // Obtains metadata about the underlying device. - void (*fill_device_description)(SE_Device* executor, - SE_DeviceDescription* description, - TF_Status* status); + void (*fill_device_description)(SP_Device* executor, + SP_DeviceDescription* description, + TF_Status* status); // Enqueues on a stream a user-specified function to be run on the host. - TF_BOOL (*host_callback)(SE_Device* executor, SE_Stream* stream, - TF_StatusCallbackFn callback_fn, void* ctx); + TF_BOOL (*host_callback)(SP_Device* executor, SP_Stream* stream, + TF_StatusCallbackFn callback_fn, void* ctx); } SP_StreamExecutor; #define SP_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) @@ -329,22 +351,22 @@ typedef struct SP_Platform { const char* name; size_t name_len; - // Device type name. Right now only GPU is supported. + // Device type name, for example GPU. char* type; size_t type_len; // Callbacks for creating/destroying. void (*create_device)( - SE_Device* device, \\ out + SP_Device* device, \\ out SE_Options* options, \\ in TF_Status* status); \\ out - void (*destroy_device)(SE_Device* device); + void (*destroy_device)(SP_Device* device); // Callbacks for creating/destroying SE_StreamExecutor. void (*create_stream_executor)( - SE_StreamExecutor*, \\ out + SP_StreamExecutor*, \\ out TF_Status* status); \\ out - void (*destroy_stream_executor)(SE_StreamExecutor* stream_executor); + void (*destroy_stream_executor)(SP_StreamExecutor* stream_executor); } SP_Platform; #define SP_PLATFORM_SIZE TF_OFFSET_OF_END(SP_Platform, destroy_stream_executor) @@ -418,19 +440,19 @@ Currently Platforms registered with StreamExecutor have an id parameter. Define functions that create and destroy `SE_Device` and `SE_StreamExecutor`: ```cpp -void create_device(SE_Device* device, SE_Options* options, TF_Status* status) { +void create_device(SP_Device* device, SE_Options* options, TF_Status* status) { device->device_handle = get_my_device_handle(); ... } -void create_stream_executor(SE_StreamExecutor* se, TF_Status* status) { +void create_stream_executor(SP_StreamExecutor* se, TF_Status* status) { se->memcpy_from_host = my_device_memcpy_from_host_function; ... } -void destroy_device(SE_Device* device) { +void destroy_device(SP_Device* device) { -- destroy device handle here -- ... } -void destroy_stream_executor(SE_StreamExecutor* stream_executor) { +void destroy_stream_executor(SP_StreamExecutor* stream_executor) { -- perform any clean up needed for stream executor -- } ``` @@ -464,8 +486,8 @@ TensorFlow will call `InitializeSEPlugin` when registering the plugin. ## Stream/Timer/Event representation -API extension would require defining SE\_Stream\_st, SE\_Event\_st and -SE\_Timer\_st structs. From the point of view of TensorFlow, we will treat their +API extension would require defining SP\_Stream\_st, SP\_Event\_st and +SP\_Timer\_st structs. From the point of view of TensorFlow, we will treat their pointers as opaque. Underneath, StreamExecutor will rely on customized implementations of @@ -478,8 +500,8 @@ For example, Stream customization might look as follows: ```cpp class CStream : public StreamInterface { public: - explicit CStream(SE_Device* device, - SE_StreamExecutor* stream_executor) : + explicit CStream(SP_Device* device, + SP_StreamExecutor* stream_executor) : device_(device), stream_executor_(stream_executor), stream_handle_(nullptr) { } @@ -498,14 +520,14 @@ class CStream : public StreamInterface { } } - SE_Stream Handle() { + SP_Stream Handle() { return stream_handle_; } private: - SE_Device* device_; // not owned - SE_StreamExecutor* stream_executor_; // not owned - SE_Stream stream_handle_; + SP_Device* device_; // not owned + SP_StreamExecutor* stream_executor_; // not owned + SP_Stream stream_handle_; }; ``` From 8acf6852ad995668371cfefbb453342b2c840818 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Fri, 17 Jul 2020 11:13:50 -0700 Subject: [PATCH 46/58] Remove PlatformId --- rfcs/20200612-stream-executor-c-api.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index d9b6e80ae..22b37ebab 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -119,14 +119,6 @@ typedef TF_Status* (*TF_StatusCallbackFn)(void*); #define TF_OFFSET_OF_END(TYPE, MEMBER) (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) #endif // TF_OFFSET_OF_END -typedef struct SE_PlatformId { - size_t struct_size; - void* ext; - void* id; // aka stream_executor::Platform::Id -} SE_PlatformId; - -#define SE_PLATFORMID_STRUCT_SIZE TF_OFFSET_OF_END(SE_PlatformId, id) - typedef struct SP_TimerFns { size_t struct_size; void* ext; @@ -343,9 +335,7 @@ typedef struct SP_Platform { size_t struct_size; // Free form data set by plugin. - void* data; - - SE_PlatformId* id; + void* ext; // Platform name const char* name; @@ -433,7 +423,7 @@ DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), ## PlatformId -Currently Platforms registered with StreamExecutor have an id parameter. +StreamExecutor [Platform](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/platform.h;l=114) has an id parameter. This parameter will be hidden from the C API and set internally by TensorFlow instead. ## Usage example @@ -461,9 +451,6 @@ Define `SE_InitializePlugin` that TensorFlow will call when registering the devi ```cpp void SE_InitializePlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { - static const int32_t plugin_id_value = 123; - SE_PlatformId id{ SE_PLATFORMID_STRUCT_SIZE }; - id.id = &plugin_id_value; int32_t visible_device_count = 2; std::string name = "MyDevice"; From 9d6383f41d46fb2afac3f4997153de7af71a4593 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 23 Jul 2020 14:10:06 -0700 Subject: [PATCH 47/58] Added host_memory_allocate/deallocate --- rfcs/20200612-stream-executor-c-api.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 22b37ebab..5e2b66aaa 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -220,6 +220,12 @@ typedef struct SP_StreamExecutor { // initialized. TF_BOOL (*device_memory_usage)( SP_Device* executor, int64_t* free, int64_t* total); + + // Allocate host memory. + void* (*host_memory_allocate)(uint64 size); + + // Deallocate host memory. + void (*host_memory_deallocate)(void *mem); /*** STREAM CALLBACKS ***/ From cf501f4827f6ea14e8857896c4acab64e2f23c10 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 23 Jul 2020 14:11:45 -0700 Subject: [PATCH 48/58] uint64 --> uint64_t --- rfcs/20200612-stream-executor-c-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 5e2b66aaa..93fb13f45 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -222,7 +222,7 @@ typedef struct SP_StreamExecutor { SP_Device* executor, int64_t* free, int64_t* total); // Allocate host memory. - void* (*host_memory_allocate)(uint64 size); + void* (*host_memory_allocate)(uint64_t size); // Deallocate host memory. void (*host_memory_deallocate)(void *mem); From 31de234a153eb36c84de3a9045ed57e3b8501f05 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Tue, 18 Aug 2020 21:43:45 -0700 Subject: [PATCH 49/58] Add device-to-device memcopies, removed extra void* ext --- rfcs/20200612-stream-executor-c-api.md | 30 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 93fb13f45..ba3da30ef 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -177,20 +177,16 @@ typedef struct SE_DeviceMemoryBase { typedef struct SP_Device { size_t struct_size; - void* ext; + void* ext; // free-form field filled by plugin const char* name; size_t name_len; // Device vendor can store handle to their device representation // here. void* device_handle; - - // Any kind of data that plugin device might want to store. - void* ext; - } SP_Device; -#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, data) +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, device_handle) typedef struct SP_StreamExecutor { size_t struct_size; @@ -289,7 +285,7 @@ typedef struct SP_StreamExecutor { /*** MEMCPY CALLBACKS ***/ // Enqueues a memcpy operation onto stream, with a host destination location // host_dst and a device memory source, with target size size. - TF_BOOL (*memcpy_to_host)( + TF_BOOL (*memcpy_dtoh)( SP_Device* executor, SP_Stream stream, void* host_dst, const SE_DeviceMemoryBase* device_src, @@ -297,14 +293,21 @@ typedef struct SP_StreamExecutor { // Enqueues a memcpy operation onto stream, with a device destination location // and a host memory source, with target size size - TF_BOOL (*memcpy_from_host)( + TF_BOOL (*memcpy_htod)( SP_Device* executor, SP_Stream stream, SE_DeviceMemoryBase* device_dst, const void* host_src, uint64_t size); + // Enqueues a memcpy operation onto stream, with a device destination + // location and a device memory source, with target size `size`. + void (*memcpy_dtod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + // Blocks the caller while a data segment of the given size is // copied from the device source to the host destination. - TF_BOOL (*sync_memcpy_to_host)( + TF_BOOL (*sync_memcpy_dtoh)( SP_Device* executor, void* host_dst, const SE_DeviceMemoryBase* device_src, @@ -312,10 +315,17 @@ typedef struct SP_StreamExecutor { // Blocks the caller while a data segment of the given size is // copied from the host source to the device destination. - TF_BOOL (*sync_memcpy_from_host)( + TF_BOOL (*sync_memcpy_htod)( SP_Device* executor, SE_DeviceMemoryBase* device_dst, const void* host_src, uint64_t size); + + // Blocks the caller while a data segment of the given size is copied from the + // device source to the device destination. + void (*sync_memcpy_dtod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); // Causes the host code to synchronously wait for the event to complete. void (*block_host_for_event)( From b42f06e689786262e96a932973732c1f4ec696c7 Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Thu, 20 Aug 2020 12:16:27 -0700 Subject: [PATCH 50/58] Added unified memory allocate/deallocate --- rfcs/20200612-stream-executor-c-api.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index ba3da30ef..ddccca54b 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -218,11 +218,19 @@ typedef struct SP_StreamExecutor { SP_Device* executor, int64_t* free, int64_t* total); // Allocate host memory. - void* (*host_memory_allocate)(uint64_t size); + void* (*host_memory_allocate)(TF_Device* device, uint64_t size); // Deallocate host memory. - void (*host_memory_deallocate)(void *mem); + void (*host_memory_deallocate)(TF_Device* device, void *mem); + // Allocates unified memory space of the given size, if supported. Support + // should be added by setting `supports_unified_memory` field in + // `DeviceDescription`. + void* (*unified_memory_allocate)(TF_Device* device, uint64_t bytes); + + // Deallocates unified memory space previously allocated with + // `unified_memory_allocate`. + void (*unified_memory_deallocate)(TF_Device* device, void* location); /*** STREAM CALLBACKS ***/ // Creates SE_Stream. This call should also Allocate stream From c8d9d0010f328a7a15c7fbd2fb6c00aaffaa73c3 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Mon, 31 Aug 2020 17:16:01 -0700 Subject: [PATCH 51/58] Change REVISION_VERSION to PATCH_VERSION To be to consistent with https://www.tensorflow.org/guide/versions --- rfcs/20200612-stream-executor-c-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index ddccca54b..fe0f93397 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -100,7 +100,7 @@ See proposed C API below: #define SE_MAJOR 0 #define SE_MINOR 0 -#define SE_REVISION 1 +#define SE_PATCH 1 #ifdef __cplusplus extern "C" { @@ -392,7 +392,7 @@ typedef struct SE_PlatformRegistrationParams { // StreamExecutor C API version. int32_t major_version; int32_t minor_version; - int32_t revision_version; + int32_t patch_version; // Must be filled by the plugin. SP_Platform platform; // out From 3bb5b4dca4ed67e79d42445b2b96d8b984bc064c Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 10:20:38 -0700 Subject: [PATCH 52/58] Update 20200612-stream-executor-c-api.md * Added the Implementation Conventions and Usage Overview section. * Replaced the Stability / User Impact with the Versioning Strategy and Stability section. Also moved the section higher. * Updated the API design according to the real implementation. * Updated code examples. --- rfcs/20200612-stream-executor-c-api.md | 621 +++++++++++++++---------- 1 file changed, 366 insertions(+), 255 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index fe0f93397..b915ef203 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -3,9 +3,9 @@ | Status | Proposed | | :------------ | :------------------------------------------------------ | | **RFC #** | [257](https://github.com/tensorflow/community/pull/257) | -| **Author(s)** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Yi Situ (yisitu@google.com), Russell Power (power@google.com) | +| **Authors** | Anna Revinskaya (annarev@google.com), Penporn Koanantakool (penporn@google.com), Yi Situ (yisitu@google.com), Russell Power (power@google.com) | | **Sponsor** | Gunhan Gulsoy (gunan@google.com) | -| **Updated** | 2020-07-15 | +| **Updated** | 2020-09-08 | # Objective @@ -21,7 +21,7 @@ to the current TensorFlow runtime. * Compatibility with the [new TensorFlow runtime stack](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html). -* APIs that will expose all device-specific capabilities. +* APIs that will expose all device-specific capabilities. # Motivation @@ -40,7 +40,7 @@ The new TensorFlow stack, based on [TFRT](https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html) and [MLIR](https://www.tensorflow.org/mlir), is designed with this in mind. However, it is still in an active development phase and is not ready for third-party -device integration until later this year. (For device support expecting to land +device integration yet. (For device support expecting to land in 2021 or later, we highly recommend waiting to integrate with the new stack, since it is fundamentally different from the current stack and cannot guarantee code reuse.) @@ -48,15 +48,15 @@ code reuse.) In the meantime, we plan to provide limited device integration support for the current TensorFlow stack through [Modular TensorFlow](https://github.com/tensorflow/community/blob/master/rfcs/20190305-modular-tensorflow.md). -We anticipate three basic functionalities within a device plugin module: +We anticipate three basic functionalities within a device plug-in module: * Device registration: Addressed in a different RFC, [Adding Pluggable Device for TensorFlow](https://github.com/tensorflow/community/pull/262). * Device management: The focus of this RFC. * Kernel and op registration and implementation: [RFC Accepted](https://github.com/tensorflow/community/blob/master/rfcs/20190814-kernel-and-op-registration.md). [C API implemented](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/c/). -[StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73) is TensorFlow's main device manager, responsible for work execution and memory management. It provides a set of methods -(such as +[StreamExecutor](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_pimpl.h;l=73) is TensorFlow's main device manager, responsible for work execution and memory +management. It provides a set of methods (such as [Memcpy](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=240)) that can be customized for a particular device. @@ -72,65 +72,123 @@ A decoupled way to add a new device to TensorFlow. * Faster time-to-solution: Does not need code review from the TensorFlow team. * Lower maintenance efforts: Only C-API-related changes could break the integration. Unrelated TensorFlow changes would not break the code. - * The C APIs may be changed during the initial experimental phase based - on developer experience and feedback. When the APIs become more mature, - we will try to keep them stable (in a best-effort manner) until the new + * The C APIs may be changed during the initial experimental phase based + on developer experience and feedback. When the APIs become more mature, + we will try to keep them stable (in a best-effort manner) until the new TensorFlow stack is available. # Design Proposal -## StreamExecutorInterface - [StreamExecutorInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=166?q=StreamExecutorinterface) -is quite large and some of its methods are only sporadically used. Therefore, we -plan to wrap only a subset of key StreamExecutorInterface functionality. We decided on this subset based on the PluggableDevice usecase as well as potential future devices such as TPUs. - -Implementation conventions: - -* Structs include `struct_size` parameter. This parameter should be filled in both by TensorFlow and the plugin and can be checked to determine which struct fields are available for current version of TensorFlow. -* Struct name prefixes indicates which side of the API is responsible for populating the struct: - * `SE_` prefix: filled by TensorFlow. - * `SP_` prefix: filled by plugins, except `struct_size` which is also filled by TensorFlow when TensorFlow passes it to a callback. - -See proposed C API below: - -```cpp -#include -#include - +has a large number of methods, some of which are only sporadically used. +Therefore, we plan to wrap only a subset of key `StreamExecutorInterface` +functionality. We decided on this subset based on the [PluggableDevice](https://github.com/tensorflow/community/pull/262) +usecase as well as potential future devices such as TPUs. + +## Versioning Strategy and Stability +StreamExecutor C API follows Semantic Versioning 2.0.0 ([semver](http://semver.org/)). +Each release version has a format `MAJOR.MINOR.PATCH`, as outlined in +[TensorFlow version compatibility](https://www.tensorflow.org/guide/versions#semantic_versioning_20). +We also use struct sizes to track compatibility. More details on functionality +extension and deprecation in [StreamExecutor C API Versioning Strategy](20200612-stream-executor-c-api/C_API_versioning_strategy.md). + +The C API will have an initial bake-in period, where we won’t have any +compatibility guarantees. However, we will make the best effort to perform any +updates in a backwards compatible way. For example, we plan to keep track of +struct sizes. + +The C API will be placed in [tensorflow/c/experimental](https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/c/experimental/). +We will consider moving the API out of the experimental directory once it is +more stable. + +## Implementation Conventions + +* Struct prefix indicates whether struct fields should be filled by the plug-in or core implementation: + * `SE_`: Set/filled by core, unless marked otherwise. + * `SP_`: Set/filled by plug-in, unless marked otherwise. + * This prefix rule only applies to structures. Enumerations and methods are all prefixed with `SE_`. +* Structs begin with two fields: + * `size_t struct_size`: Stores the unpadded size of the struct. + * `void* ext`: A free-form field that can be populated by a plugin in `SP_*` structs or potential future extension points in `SE_` structs. +* We use `struct_size` for version checking. + * It is exempt from the `SE/SP` rule above and should be set both by core and plug-in. + * It can be checked to determine which struct fields are available for current version of TensorFlow. + * For example, `create_device` function receives `SP_Device*` as input with `struct_size` populated by core. The plug-in is responsible for setting `struct_size` as well, along with all other fields. +* When a member is added to a struct, the struct size definition must be updated to use the new last member of the struct. + +## Usage Overview + +The table below summarizes all structures defined and the functionality they involve. +| Action | Function call(s) | Populated by Core TensorFlow | Populated by plug-in | +| :----- | :-------------- | :--------------------------- | :------------------- | +| Register platform | `SE_InitPlugin` | `SE_PlatformRegistrationParams` | `SP_Platform`, `SP_PlatformFns` | +| Create device | `SP_PlatformFns::create_device` | `SE_CreateDeviceParams` | `SP_Device` | +| Create stream executor | `SP_PlatformFns::create_stream_executor` | `SE_CreateStreamExecutorParams` | `SP_StreamExecutor` | +| Create timer functions | `SP_PlatformFns::create_timer_fns` | None | `SP_TimerFns` | +| Get allocator stats | `SP_StreamExecutor::get_allocator_stats` | None | `SP_AllocatorStats` | +| Memory management | `SP_StreamExecutor::*allocate*`, `SP_StreamExecutor::*memcpy*` | None | `SP_DeviceMemoryBase` | + +### Registration +Core TensorFlow will register a new StreamExecutor platform as well as a new TensorFlow device with [DeviceFactory](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/core/common_runtime/device_factory.h;l=30?q=DeviceFactory). +1. Core TensorFlow links to plug-in's dynamic library and loads the function `SE_InitPlugin`. +2. Core TensorFlow populates `SE_PlatformRegistrationParams` and passes it in a call to `SE_InitPlugin`. + * In `SE_InitPlugin`, plug-in populates `SE_PlatformRegistrationParams::SP_Platform` and `SE_PlatformRegistrationParams::SP_PlatformFns`. +3. Core TensorFlow can now create a device, a stream executor, and a timer through functions in `SP_PlatformFns`. + * Core TensorFlow populates `SE_CreateDeviceParams` and pass it in a call to `SP_PlatformFns::create_device`. + * Plug-in populates `SE_CreateDeviceParams::SP_Device`. + * Core TensorFlow populates `SE_CreateStreamExecutorParams` and pass it in a call to `SP_PlatformFns::create_stream_executor`. + * Plug-in populates `SE_CreateStreamExecutorParams::SP_StreamExecutor`. + * Core TensorFlow sets `struct_size` in `SP_Timer` and pass it in a call to `SP_PlatformFns::create_timer_fns`. + * Plug-in populates `SP_TimerFns`. +4. Core TensorFlow registers a new `PluggableDeviceFactory`. + +`PluggableDevice` is covered in a separate RFC: [Adding Pluggable Device For TensorFlow](https://github.com/tensorflow/community/pull/262). + + +### Definitions from Plug-in +Plug-in needs to provide: +* Methods: `SE_InitPlugin` and other methods declared in `SP_*` structs. +* Structures: `SP_Stream_st`, `SP_Event_st`, and `SP_Timer_st`. + +## Detailed API +```c++ #define SE_MAJOR 0 #define SE_MINOR 0 #define SE_PATCH 1 +// TF_Bool is the C API typedef for unsigned char, while TF_BOOL is +// the datatype for boolean tensors. +#ifndef TF_Bool +#define TF_Bool unsigned char +#endif // TF_Bool + +// Macro used to calculate struct size for maintaining ABI stability across +// different struct implementations. +#ifndef TF_OFFSET_OF_END +#define TF_OFFSET_OF_END(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif // TF_OFFSET_OF_END + #ifdef __cplusplus extern "C" { #endif -typedef SP_Stream_st* SP_Stream; -typedef SP_Event_st* SP_Event; -typedef SP_Timer_st* SP_Timer; -typedef TF_Status* (*TF_StatusCallbackFn)(void*); - -#ifndef TF_BOOL_DEFINED -#define TF_BOOL unsigned char -#endif // TF_BOOL_DEFINED - -#ifndef TF_OFFSET_OF_END -#define TF_OFFSET_OF_END(TYPE, MEMBER) (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) -#endif // TF_OFFSET_OF_END +typedef struct SP_Stream_st* SP_Stream; +typedef struct SP_Event_st* SP_Event; +typedef struct SP_Timer_st* SP_Timer; +// Takes `callback_arg` passed to `host_callback` as the first argument. +typedef void (*SE_StatusCallbackFn)(void* const, TF_Status* const); typedef struct SP_TimerFns { size_t struct_size; - void* ext; - uint64_t (*nanoseconds)(SE_Timer timer); - uint64_t (*microseconds)(SE_Timer timer); + void* ext; // reserved for future use + uint64_t (*nanoseconds)(SP_Timer timer); } SP_TimerFns; -#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, microseconds) +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, nanoseconds) typedef struct SP_AllocatorStats { size_t struct_size; - void* ext; int64_t num_allocs; int64_t bytes_in_use; int64_t peak_bytes_in_use; @@ -148,8 +206,11 @@ typedef struct SP_AllocatorStats { int64_t largest_free_block_bytes; } SP_AllocatorStats; -#define SP_ALLOCATORSTATS_STRUCT_SIZE TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) +#define SP_ALLOCATORSTATS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) +// Potential states for an SP_Event. If `poll_for_status` returns anything aside +// from kPending or kComplete, an error has occurred; kUnknown is a bad state. typedef enum SE_EventStatus { SE_EVENT_UNKNOWN, SE_EVENT_ERROR, @@ -157,29 +218,25 @@ typedef enum SE_EventStatus { SE_EVENT_COMPLETE, } SE_EventStatus; -typedef struct SE_Options { - size_t struct_size; - void* ext; - int32_t ordinal; -} SE_Options; - -#define SE_OPTIONS_STRUCT_SIZE TF_OFFSET_OF_END(SE_Options, ordinal) - -typedef struct SE_DeviceMemoryBase { +// Memory allocation information. +// This matches DeviceMemoryBase defined here: +// https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/device_memory.h;l=57 +typedef struct SP_DeviceMemoryBase { size_t struct_size; - void* ext; + void* ext; // free-form data set by plugin + // Platform-dependent value representing allocated memory. void* opaque; - uint64_t size; - uint64_t payload; -} SE_DeviceMemoryBase; + uint64_t size; // Size in bytes of this allocation. + uint64_t payload; // Value for plugin's use +} SP_DeviceMemoryBase; -#define SE_DEVICE_MEMORY_BASE_STRUCT_SIZE TF_OFFSET_OF_END(SE_DeviceMemoryBase, payload) +#define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_DeviceMemoryBase, size) typedef struct SP_Device { size_t struct_size; - void* ext; // free-form field filled by plugin - const char* name; - size_t name_len; + void* ext; // free-form data set by plugin + int32_t ordinal; // device index // Device vendor can store handle to their device representation // here. @@ -188,146 +245,154 @@ typedef struct SP_Device { #define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, device_handle) +typedef struct SE_CreateDeviceParams { + size_t struct_size; + void* ext; // reserved for future use + int32_t ordinal; // device index + + SP_Device* device; // Input/output, struct_size set by TF for plugin to read. + // Subsequently plugin fills the entire struct. +} SE_CreateDeviceParams; + +#define SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceParams, device) + typedef struct SP_StreamExecutor { size_t struct_size; - void* ext; + void* ext; // reserved for future use /*** ALLOCATION CALLBACKS ***/ - // Synchronously allocates size bytes on the underlying platform and returns - // a DeviceMemoryBase representing that allocation. In the case of failure, + // Synchronously allocates `size` bytes on the underlying platform and returns + // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, // nullptr is returned. - // memory_space is reserved for a potential future usage and should be set + // `memory_space` is reserved for a potential future usage and should be set // to 0. - TF_DeviceMemoryBase* (*allocate)( - SP_Device* se, uint64_t size, int64_t memory_space); - + void (*allocate)(const SP_Device* device, uint64_t size, int64_t memory_space, + SP_DeviceMemoryBase* mem); // Deallocate the device memory previously allocated via this interface. // Deallocation of a nullptr-representative value is permitted. - void (*deallocate)( - SP_Device* se, SE_DeviceMemoryBase* memory); + void (*deallocate)(const SP_Device* device, SP_DeviceMemoryBase* memory); + // Allocates a region of host memory and registers it with the platform API. + // Memory allocated in this manner is required for use in asynchronous memcpy + // operations, such as `memcpy_dtoh`. + void* (*host_memory_allocate)(const SP_Device* device, uint64_t size); + + // Deallocates a region of host memory allocated by `host_memory_allocate`. + void (*host_memory_deallocate)(const SP_Device* device, void* mem); + + // Allocates unified memory space of the given size, if supported. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void* (*unified_memory_allocate)(const SP_Device* device, uint64_t bytes); - // Fill SP_AllocatorStats with allocator statistics. - TF_BOOL (*get_allocator_stats)(SP_Device* executor, - SP_AllocatorStats* stats); - // Returns the underlying device memory usage information, if it is available. - // If it is not available (false is returned), free/total may not be - // initialized. - TF_BOOL (*device_memory_usage)( - SP_Device* executor, int64_t* free, int64_t* total); - - // Allocate host memory. - void* (*host_memory_allocate)(TF_Device* device, uint64_t size); - - // Deallocate host memory. - void (*host_memory_deallocate)(TF_Device* device, void *mem); - - // Allocates unified memory space of the given size, if supported. Support - // should be added by setting `supports_unified_memory` field in - // `DeviceDescription`. - void* (*unified_memory_allocate)(TF_Device* device, uint64_t bytes); - // Deallocates unified memory space previously allocated with - // `unified_memory_allocate`. - void (*unified_memory_deallocate)(TF_Device* device, void* location); + // `unified_memory_allocate`. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void (*unified_memory_deallocate)(const SP_Device* device, void* location); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + SP_AllocatorStats* stats); + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, int64_t* free, + int64_t* total); /*** STREAM CALLBACKS ***/ - // Creates SE_Stream. This call should also Allocate stream + // Creates SP_Stream. This call should also allocate stream // resources on the underlying platform and initializes its // internals. - void (*create_stream)(SP_Device* executor, SP_Stream*, TF_Status*); + void (*create_stream)(const SP_Device* device, SP_Stream* stream, + TF_Status* status); - // Destroys SE_Stream and deallocates any underlying resources. - void (*destroy_stream)(SP_Device* executor, SP_Stream stream); + // Destroys SP_Stream and deallocates any underlying resources. + void (*destroy_stream)(const SP_Device* device, SP_Stream stream); - // Causes dependent to not begin execution until other has finished its + // Causes `dependent` to not begin execution until `other` has finished its // last-enqueued work. - TF_BOOL (*create_stream_dependency)( - SP_Device* executor, SP_Stream dependent, - SP_Stream other); + void (*create_stream_dependency)(const SP_Device* device, SP_Stream dependent, + SP_Stream other, TF_Status* status); // Without blocking the device, retrieve the current stream status. - void (*get_status)(SP_Device* executor, SP_Stream stream, - TF_Status* status); + void (*get_stream_status)(const SP_Device* device, SP_Stream stream, + TF_Status* status); /*** EVENT CALLBACKS ***/ - // Create SP_Event. Performs platform-specific allocation and initialization of an event. - void (*create_event)( - SP_Device* executor, SP_Event* event, TF_Status* status); + // Create SP_Event. Performs platform-specific allocation and initialization + // of an event. + void (*create_event)(const SP_Device* device, SP_Event* event, + TF_Status* status); - // Destroy SE_Event and perform any platform-specific deallocation and cleanup of an event. - void (*destroy_event)( - SP_Device* executor, SP_Event event, TF_Status* status); + // Destroy SE_Event and perform any platform-specific deallocation and + // cleanup of an event. + void (*destroy_event)(const SP_Device* device, SP_Event event); // Requests the current status of the event from the underlying platform. - SE_EventStatus (*poll_for_event_status)( - SP_Device* executor, SP_Event event); + SE_EventStatus (*get_event_status)(const SP_Device* device, SP_Event event); // Inserts the specified event at the end of the specified stream. - void (*record_event)( - SP_Device* executor, SP_Stream stream, - SP_Event event, TF_Status* status); + void (*record_event)(const SP_Device* device, SP_Stream stream, + SP_Event event, TF_Status* status); // Wait for the specified event at the end of the specified stream. - void (*wait_for_event)( - SP_Device* executor, SP_Stream stream, - SP_Event event, TF_Status* status); + void (*wait_for_event)(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status); /*** TIMER CALLBACKS ***/ - // Creates TF_Timer. Allocates timer resources on the underlying platform and initializes its - // internals, setting `timer` output variable. Sets values in `timer_fns` struct. - void (*create_timer)(SP_Device* executor, SP_Timer* timer, SP_TimerFns* timer_fns, TF_Status* status); + // Creates SP_Timer. Allocates timer resources on the underlying platform + // and initializes its internals, setting `timer` output variable. Sets + // values in `timer_fns` struct. + void (*create_timer)(const SP_Device* device, SP_Timer* timer, + TF_Status* status); // Destroy timer and deallocates timer resources on the underlying platform. - void (*destroy_timer)(SP_Device* executor, SP_Timer timer, SP_TimerFns* timer_fns); + void (*destroy_timer)(const SP_Device* device, SP_Timer timer); // Records a start event for an interval timer. - TF_BOOL (*start_timer)( - SP_Device* executor, SP_Stream stream, SP_Timer timer); - + void (*start_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); // Records a stop event for an interval timer. - TF_BOOL (*stop_timer)( - SP_Device* executor, SP_Stream stream, SP_Timer timer); + void (*stop_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); /*** MEMCPY CALLBACKS ***/ // Enqueues a memcpy operation onto stream, with a host destination location - // host_dst and a device memory source, with target size size. - TF_BOOL (*memcpy_dtoh)( - SP_Device* executor, SP_Stream stream, - void* host_dst, - const SE_DeviceMemoryBase* device_src, - uint64_t size); - - // Enqueues a memcpy operation onto stream, with a device destination location - // and a host memory source, with target size size - TF_BOOL (*memcpy_htod)( - SP_Device* executor, SP_Stream stream, - SE_DeviceMemoryBase* device_dst, - const void* host_src, uint64_t size); - + // `host_dst` and a device memory source, with target size `size`. + void (*memcpy_dtoh)(const SP_Device* device, SP_Stream stream, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a host memory source, with target size `size`. + void (*memcpy_htod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, const void* host_src, + uint64_t size, TF_Status* status); + // Enqueues a memcpy operation onto stream, with a device destination // location and a device memory source, with target size `size`. void (*memcpy_dtod)(const SP_Device* device, SP_Stream stream, SP_DeviceMemoryBase* device_dst, const SP_DeviceMemoryBase* device_src, uint64_t size, TF_Status* status); - + // Blocks the caller while a data segment of the given size is // copied from the device source to the host destination. - TF_BOOL (*sync_memcpy_dtoh)( - SP_Device* executor, - void* host_dst, - const SE_DeviceMemoryBase* device_src, - uint64_t size); + void (*sync_memcpy_dtoh)(const SP_Device* device, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); // Blocks the caller while a data segment of the given size is // copied from the host source to the device destination. - TF_BOOL (*sync_memcpy_htod)( - SP_Device* executor, - SE_DeviceMemoryBase* device_dst, - const void* host_src, uint64_t size); - + void (*sync_memcpy_htod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size, + TF_Status* status); + // Blocks the caller while a data segment of the given size is copied from the // device source to the device destination. void (*sync_memcpy_dtod)(const SP_Device* device, @@ -336,94 +401,145 @@ typedef struct SP_StreamExecutor { TF_Status* status); // Causes the host code to synchronously wait for the event to complete. - void (*block_host_for_event)( - SP_Device* executor, SP_Event event, TF_Status* status); + void (*block_host_for_event)(const SP_Device* device, SP_Event event, + TF_Status* status); + + // [Optional] + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous device + // operations enqueued on the stream before this program point. + // If not set, then corresponding functionality will be implemented + // by registering an event on the `stream` and waiting for it using + // `block_host_for_event`. + void (*block_host_until_done)(const SP_Device* device, SP_Stream stream, + TF_Status* status); // Synchronizes all activity occurring in the StreamExecutor's context (most // likely a whole device). - TF_BOOL (*synchronize_all_activity)(SP_Device* executor); - - // Obtains metadata about the underlying device. - void (*fill_device_description)(SP_Device* executor, - SP_DeviceDescription* description, - TF_Status* status); + void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); // Enqueues on a stream a user-specified function to be run on the host. - TF_BOOL (*host_callback)(SP_Device* executor, SP_Stream* stream, - TF_StatusCallbackFn callback_fn, void* ctx); + // `callback_arg` should be passed as the first argument to `callback_fn`. + TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, + SE_StatusCallbackFn callback_fn, void* callback_arg); } SP_StreamExecutor; -#define SP_STREAMEXECUTOR_STRUCT_SIZE TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) +#define SP_STREAMEXECUTOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) + +typedef struct SE_CreateStreamExecutorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_StreamExecutor* stream_executor; // output, to be filled by plugin +} SE_CreateStreamExecutorParams; + +#define SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateStreamExecutorParams, stream_executor) typedef struct SP_Platform { size_t struct_size; - - // Free form data set by plugin. - void* ext; - - // Platform name + + void* ext; // free-form data set by plugin + + // Platform name. Must be null-terminated. const char* name; - size_t name_len; - - // Device type name, for example GPU. - char* type; - size_t type_len; - - // Callbacks for creating/destroying. - void (*create_device)( - SP_Device* device, \\ out - SE_Options* options, \\ in - TF_Status* status); \\ out - void (*destroy_device)(SP_Device* device); - - // Callbacks for creating/destroying SE_StreamExecutor. - void (*create_stream_executor)( - SP_StreamExecutor*, \\ out - TF_Status* status); \\ out - void (*destroy_stream_executor)(SP_StreamExecutor* stream_executor); + + // Device type name, for example GPU. Must be null-terminated. + const char* type; + + // Number of visible devices + size_t visible_device_count; + + // Whether this platform supports unified memory. + // Unified memory is a single memory address space accessible from any device. + TF_Bool supports_unified_memory; } SP_Platform; -#define SP_PLATFORM_SIZE TF_OFFSET_OF_END(SP_Platform, destroy_stream_executor) +#define SP_PLATFORM_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_Platform, supports_unified_memory) + +typedef struct SP_PlatformFns { + size_t struct_size; + + void* ext; // reserved for future use + + // Callbacks for creating/destroying SP_Device. + void (*create_device)(const SP_Platform* platform, + SE_CreateDeviceParams* params, TF_Status* status); + + // Clean up fields inside SP_Device that were allocated + // by the plugin. `device` itself should not be deleted here. + void (*destroy_device)(const SP_Platform* platform, SP_Device* device); + + // Callbacks for creating/destroying SP_StreamExecutor. + void (*create_stream_executor)(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, + TF_Status* status); + // Clean up fields inside SP_StreamExecutor that were allocated + // by the plugin. `stream_executor` itself should not be deleted here. + void (*destroy_stream_executor)(const SP_Platform* platform, + SP_StreamExecutor* stream_executor); + + // Callbacks for creating/destroying SP_TimerFns. + void (*create_timer_fns)(const SP_Platform* platform, SP_TimerFns* timer, + TF_Status* status); + + void (*destroy_timer_fns)(const SP_Platform* platform, + SP_TimerFns* timer_fns); +} SP_PlatformFns; + +#define SP_PLATFORM_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_PlatformFns, destroy_timer_fns) typedef struct SE_PlatformRegistrationParams { size_t struct_size; - void* ext; - + void* ext; // reserved for future use + // StreamExecutor C API version. int32_t major_version; int32_t minor_version; int32_t patch_version; - - // Must be filled by the plugin. - SP_Platform platform; // out + + SP_Platform* platform; // output, set by plugin + SP_PlatformFns* platform_fns; // output, set by plugin + // Clean up fields inside SP_Platform that were allocated + // by the plugin. `platform` itself should not be deleted here. + void (*destroy_platform)(SP_Platform* platform); // out, set by plugin + void (*destroy_platform_fns)( + SP_PlatformFns* platform_fns); // out, set by plugin } SE_PlatformRegistrationParams; -#define SE_PLATFORM_REGISTRATION_PARAMS_SIZE TF_OFFSET_OF_END(SE_PlatformRegistrationParams, platform) +#define SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform_fns) -void SE_InitializePlugin(SE_PlatformRegistrationParams* params, TF_Status* status); +void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif ``` -## Registration implementation +### PlatformId -Registration will be implemented by registering a new StreamExecutor platform as well as a new TensorFlow device with [DeviceFactory](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/core/common_runtime/device_factory.h;l=30?q=DeviceFactory). +StreamExecutor [Platform](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/platform.h;l=114) has an id parameter. This parameter will be hidden from the C API and set +internally by TensorFlow instead. + +## Usage Example +Code example for [PluggableDevice](https://github.com/tensorflow/community/pull/262) +registration outlined in the [Usage Overview](#Usage overview) section. + +### Core TensorFlow ```cpp -typedef (*SEPluginInitFn)(SE_PlatformRegistrationParams*, TF_Status*); +typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams*, TF_Status*); ... -void* plugin = dlopen("myplugin.so", ...); -if (!plugin) { - ... output error and skip this plugin ... -} -void* initialize_sym = dlsym(plugin, "SE_InitializePlugin"); +void* initialize_sym = dlsym(plugin_dso_handle, "SE_InitPlugin"); if (!initialize_sym) { - ... output error and skip this plugin ... + // Output error and skip this plug-in. } -SEPluginInitFn initialize_fn = reinterpret_cast(initialize_sym); +SEInitPluginFn initialize_fn = reinterpret_cast(initialize_sym); SE_PlatformRegistrationParams params; TF_Status* status = TF_NewStatus(); @@ -438,74 +554,78 @@ SE_CHECK_OK( std::move(platform))); // Register PluggableDevice -std::string platform_name_str(params.params.name, params.params.name_len); -std::string type_str(params.params.type, params.params.type_len); -DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), priority); +std::string platform_name_str(params.platform->name); +std::string type_str(params.platform->type); +DeviceFactory::Register(type_str, new PluggableDeviceFactory(platform_name_str), + priority); +... ``` -`PluggableDevice` is covered in a separate RFC: [RFC: Adding Pluggable Device For TensorFlow](https://github.com/tensorflow/community/pull/262). - -## PlatformId - -StreamExecutor [Platform](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/platform.h;l=114) has an id parameter. This parameter will be hidden from the C API and set internally by TensorFlow instead. - -## Usage example - -Define functions that create and destroy `SE_Device` and `SE_StreamExecutor`: +### Plug-in +Define functions that create and destroy `SP_Device`, `SP_StreamExecutor`, and +`SP_TimerFns`: ```cpp -void create_device(SP_Device* device, SE_Options* options, TF_Status* status) { - device->device_handle = get_my_device_handle(); +void create_device(const SP_Platform* platform, SE_CreateDeviceParams* params, + TF_Status* status) { + params->device->device_handle = get_my_device_handle(); ... } -void create_stream_executor(SP_StreamExecutor* se, TF_Status* status) { - se->memcpy_from_host = my_device_memcpy_from_host_function; +void create_stream_executor(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, + TF_Status* status) { + params->stream_executor->memcpy_htod = my_device_memcpy_from_host_function; ... } -void destroy_device(SP_Device* device) { - -- destroy device handle here -- +void create_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns, + TF_Status* status) { + timer_fns->nanoseconds = nanoseconds; ... } -void destroy_stream_executor(SP_StreamExecutor* stream_executor) { - -- perform any clean up needed for stream executor -- +void destroy_device(const SP_Platform* platform, SP_Device* device) { + // Destroy device handle here. +} +void destroy_stream_executor(const SP_Platform* platform, + SP_StreamExecutor* se) { + // Perform any clean up needed for stream executor. +} +void destroy_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns) { + // Destroy timer functions here. } ``` -Define `SE_InitializePlugin` that TensorFlow will call when registering the device plugin: +Define `SE_InitPlugin` that TensorFlow will call when registering the device +plug-in: ```cpp -void SE_InitializePlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { +void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { int32_t visible_device_count = 2; - std::string name = "MyDevice"; std::string type = "GPU"; - params.params.id = id; - params.params.visible_device_count = visible_device_count; - params.params.create_device = create_device; - params.params.destroy_device = destroy_device; - params.params.create_stream_executor = create_stream_executor; - params.params.destroy_stream_executor = destroy_stream_executor; - params.params.name = name.c_str(); - params.params.name_len = name.size(); - params.params.type = type.c_str(); - params.params.type_len = type.size(); + params->platform->name = name.c_str(); + params->platform->type = type.c_str(); + params->platform->visible_device_count = visible_device_count; + params->platform_fns->create_device = create_device; + params->platform_fns->destroy_device = destroy_device; + params->platform_fns->create_stream_executor = create_stream_executor; + params->platform_fns->destroy_stream_executor = destroy_stream_executor; + params->platform_fns->create_timer_fns = create_timer_fns; + params->platform_fns->destroy_timer_fns = destroy_timer_fns; } ``` -TensorFlow will call `InitializeSEPlugin` when registering the plugin. +## Stream / Timer / Event Representation -## Stream/Timer/Event representation - -API extension would require defining SP\_Stream\_st, SP\_Event\_st and -SP\_Timer\_st structs. From the point of view of TensorFlow, we will treat their +API extension would require defining `SP_Stream_st`, `SP_Event_st`, and +`SP_Timer_st` structs. From the point of view of TensorFlow, we will treat their pointers as opaque. Underneath, StreamExecutor will rely on customized implementations of -[StreamInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=114?q=TimerInterface&ss=tensorflow%2Ftensorflow), -[TimerInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=145?q=TimerInterface&ss=tensorflow%2Ftensorflow) +[StreamInterface](https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/stream_executor_internal.h;l=114), +[TimerInterface](https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/stream_executor_internal.h;l=145) and -[EventInterface](https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/stream_executor/stream_executor_internal.h;l=76?q=TimerInterface&ss=tensorflow%2Ftensorflow). +[EventInterface](https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/stream_executor_internal.h;l=76). For example, Stream customization might look as follows: ```cpp @@ -542,15 +662,6 @@ class CStream : public StreamInterface { }; ``` -## Stability / User Impact - -The C API will be placed under _tensorflow/c/experimental/_ directory. -Initially, we won’t have any compatibility guarantees. At the same time we will -make the best effort to perform any updates in a backwards compatible way. For -e.g. we plan to keep track of struct sizes. - -We will have an initial bake-in period before we consider moving the API out of experimental directory. - ## Alternatives Considered * **Forking:** Contributors could always fork the TensorFlow repository, From 9787cb128f421bf18b5bc29673c77f8f802c2265 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 10:34:45 -0700 Subject: [PATCH 53/58] Added/updated versioning contents based on the discussion on the PluggableDevice RFC (PR #262). * Referenced semver and TensorFlow's Version Compatibility page. * Added the Updating Guidelines, Convention, and Detecting Incompatibility sections. * Merged the current code examples with @yisitu's newer ones on PR #262 and simplified them a bit. --- .../C_API_versioning_strategy.md | 514 ++++++++++++------ 1 file changed, 335 insertions(+), 179 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md index 1e394b718..723ac8794 100644 --- a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md +++ b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md @@ -1,226 +1,382 @@ -**Authors**: yisitu@, penporn@, annarev@ - -**Date**: 7/9/20 +# StreamExecutor C API Versioning Strategy +| Status | Proposed | +| :------------ | :------------------------------------------------------ | +| **RFC #** | Extension of #[257](https://github.com/tensorflow/community/pull/257) | +| **Authors** | Yi Situ (yisitu@google.com), Penporn Koanantakool (penporn@google.com), Anna Revinskaya (annarev@google.com) | +| **Sponsor** | Gunhan Gulsoy (gunan@google.com) | +| **Updated** | 2020-09-08 | In reply to a question on [PR #262](https://github.com/tensorflow/community/pull/262#issuecomment-653690654). -# TensorFlow Versioning Strategy - -TensorFlow StreamExecutorInterface (SEI) uses struct_size for version checking. Struct members are not allowed to be removed or reordered. Following are concrete examples of how TensorFlow remains compatible with plug-ins when functionality is added to or removed from StreamExecutorInterface. We will be using a simplified SE_Device as an example. - -## When TensorFlow extends functionality -### Backwards compatibility -TensorFlow is compiled against a newer SEI header (v2), which has SE_Device extended with device_handle. - -**Future TensorFlow compiled against StreamExecutorInterface v2** -```cpp -// StreamExecutorInterface header version 2 -typedef struct SE_Device { - size_t struct_size; - void* next; // Always set to zero, reserved by TF for future use. +StreamExecutor C API (SE C API) follows Semantic Versioning 2.0.0 +([semver](http://semver.org/)). Each release version has a format +`MAJOR.MINOR.PATCH`, as outlined in [TensorFlow version compatibility](https://www.tensorflow.org/guide/versions#semantic_versioning_20). +We also use `struct_size` to track compatibility. + +## Updating Guidelines +This section outlines when to update version numbers specific to SE C API +(`SE_MAJOR`, `SE_MINOR`, and `SE_PATCH`). + +### SE_MAJOR +* Potentially backwards incompatible changes. +* If a change is backwards incompatible, it requires an RFC because it will + break all current plug-ins. This should be rare. +* An `SE_MAJOR` update should be planned in a way that bundles as many pending + backwards incompatible changes together as possible to avoid breaking plug-ins + multiple times. +* There will be an announcement giving a grace period before the update happens. + +### SE_MINOR +* Backwards compatible changes. + * Adding a new variable, struct, method, enumeration, etc. + * Trivial deprecation of a variable, etc. by setting it to a no-op values, + e.g., 0 or `NULL`. + +### SE_PATCH +* Backwards compatible bug fixes. + +## Conventions +* Once a member is added to a struct, it cannot be removed, reordered, renamed, + or repurposed (i.e., assigned a different functionality). +* "Renaming" a member is equivalent to adding a new member with a new name and + eventually deprecating the member with the old name. +* Fields that cannot be 0 or `NULL` can be deprecated in a backwards compatible + manner by zero-initialization. + * Plug-ins must perform input validation on these fields for 0 and `NULL` + before consuming them. + * Plug-ins know the fields are deprecated when they find 0 or `NULL` in + these fields. + * Such fields must be explicitly marked by comments, to ensure all plug-ins + have consistent behavior (e.g., non of the plug-ins is using 0 or `NULL` as + a special case). See `// 0 is no-op` and `// NULL is no-op` in the + [By value inspection](#By-value-inspection) section for example. + + +## Detecting Incompatibility + +### By Comparing SE_MAJOR at Registration Time +At load time, both plug-in and core TensorFlow should check for version +compatibility. If the versions are not compatible, plug-in should output an +error and core TensorFlow should unload the plug-in. See code example below. + +Core TensorFlow passes its SE C API version number when calling plug-in's +initialization routine (`SE_InitPlugin`): +```c++ +typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams*, TF_Status*); +SE_PlatformRegistrationParams params{SE_PLATFORM_REGISTRATION_PARAMS_SIZE}; +params.major_version = SE_MAJOR; +params.minor_version = SE_MINOR; +params.patch_version = SE_PATCH; +TF_Status status; + +// Core TensorFlow sends its view of version numbers to plugin. +void* initialize_sym = dlsym(plugin, "SE_InitPlugin"); +if (!initialize_sym) { + // Output error and skip this plug-in. +} +SEInitPluginFn initialize_plugin_fn = reinterpret_cast(initialize_sym); +initialize_plugin_fn(¶ms, &status); +if(!tensorflow::StatusFromTF_Status(status).ok()) { + // Output error and skip this plug-in. +} +``` - const char* name; - size_t name_len; - void* device_handle; -} SE_Device; +Plug-in checks the `SE_MAJOR` version numbers and outputs error if they don't +match: +```c++ +void SE_InitPlugin(SE_PlatformRegistrationParams* params, + TF_Status* status) { + if (params->struct_size == 0) { + // *status = ... + LOG(ERROR) << "Invalid argument."; + return; + } + if (SE_MAJOR != params->major) { + // *status = ... + LOG(ERROR) << "Unsupported major version. Given: " << params->major + << " Expected: " << SE_MAJOR; + return; + } + ... +} +``` -// Evaluates to 40 -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, device_handle) +### By Value Inspection +Deprecation of an attribute can sometimes be done in a backwards compatible +manner by leaving the attribute zero initialized. + +* The plugin performs input validation on each field for `NULL` or 0 value + before consuming it, preventing it from entering a bad state. +* If deprecation by zero-initialization is not possible (e.g., because default + value of zero may be a valid input), then the change is API incompatible; + TensorFlow has to bump the major version when the attribute is deprecated. + +For example, + +```c++ +struct example { + int cannot_be_zero; // 0 is no-op. + void* cannot_be_null; // NULL is no-op. + int can_be_zero; + void* can_be_null; + int optional_zero_default; // Optional. 0 by default. + void* optional_null_default; // Optional. NULL by default. +}; ``` - -The plugin was compiled against an older version of SEI header without device_handle. - -**Older Plugin compiled against StreamExecutorInterface v1** -```cpp -// StreamExecutorInterface header version 1 -typedef struct SE_Device { +* `cannot_be_zero` and `cannot_be_null` here can be deprecated by + zero-initializing. +* `can_be_zero` and `can_be_null` need a MAJOR version bump for deprecation, + since 0 and `NULL` are valid values for them. +* `optional_zero_default` and `optional_null_default` are optional fields that + use 0 / `NULL` to indicate that the field is not provided. This needs an + `SE_MAJOR` version bump for deprecation as well, since 0 and `NULL` are valid + here. + +For other unintentional changes which are caused by bugs (e.g., data was +forgotten to be initialized by mistake), file a Github issue. + +### By Checking Struct Size +Backwards compatible changes within the same `SE_MINOR` version can only add new +members to a struct and cannot modify any existing member. Because of this, we +can check the byte offset of the variable we want to consume against the struct +size to see if the struct has the variable or not. + +# Usage Example + +Following are concrete examples of how TensorFlow remains compatible with +plug-ins when functionality is added to or removed from StreamExecutorInterface. + +## Extending Functionality +The following snippet shows `void* new_field1` and `int new_field2` being added +to a `Toy` struct. + +```diff +#define SE_MAJOR 1 +- #define SE_MINOR 0 ++ #define SE_MINOR 1 // Increment minor version. +#define SE_PATCH 0 + +typedef struct Toy { size_t struct_size; - void* next; - - const char* name; - size_t name_len; -} SE_Device; - + void* ext; // Free-form data set by plugin. + int32_t old_field; // Device index. ++ void* new_field1; // NULL is no-op. ++ int new_field2; // 0 is no-op. +} Toy; + +- // Evaluates to 20 +- #define TOY_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, old_field) ++ // Evaluates to 36 ++ #define TOY_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, new_field2) +``` -// Evaluates to 32 -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, name_len) +To concisely cover compatibility of cases where structs are created by core +TensorFlow and by plug-ins, we will call the side that creates the struct +`producer`, and the side that takes the struct `consumer`. -// Plugin Implementation +### Producer Has Older Header Files -SE_Device* Plugin_CreateDevice() { - SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; - // Based on header v1, se->struct_size will be 32 +```cpp +// Producer implementation has v1.0.0 headers. +Toy* create_toy() { + Toy* toy = new Toy{TOY_STRUCT_SIZE}; + // Based on header v1.0.0, toy->struct_size is 20. ... - return se; + old_field = set_old_field(); + return toy; } -``` - -TensorFlow checks that struct_size must be greater than the offset of device_handle before accessing it. - -```cpp -// TF Implementation - -void TF_Foo(const SE_Device* device) { - // TF checks for struct_size greater than 32. - if (device->struct_size > offsetof(SE_Device, device_handle)) { - // TF knows that device_handle can be safely read from. - DoSomething(device->device_handle); +// Consumer implementation has v1.1.0 headers. +void take_toy(const Toy* toy) { + // Consumer checks for `struct_size` greater than 24 (offset of `new_field1`). + // In this case, `toy->struct_size` = 20 so this `if` is not entered. + if (toy->struct_size > offsetof(Toy, new_field1) && new_field1 != NULL) { + // Safe to access `new_field1`. + } + // Consumer checks for `struct_size` greater than 32 (offset of `new_field2`). + // In this case, `toy->struct_size` = 20 so this `if` is not entered. + if (toy->struct_size > offsetof(Toy, new_field2) && new_field2 != 0) { + // Safe to access `new_field2`. } } ``` -### Forwards compatibility - -In the event that a plugin is up to date or newer, se->struct_size would have been initialized to 48. This would then pass the TF_Foo() check above and device_handle can be safely accessed. - -**Future Plugin compiled against StreamExecutorInterface v3** +### Producer Has Newer Header Files ```cpp -// StreamExecutorInterface header version 3 -typedef struct SE_Device { - size_t struct_size; - void* next; - - const char* name; - size_t name_len; - void* device_handle; - void* data; // Added in v3 -} SE_Device; - -// Evaluates to 48 -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) - - -// Plugin Implementation - -SE_Device* Plugin_CreateDevice() { - SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; - // se->struct_size will be 48 +// Producer implementation has v1.1.0 headers. +Toy* create_toy() { + Toy* toy = new Toy{TOY_STRUCT_SIZE}; + // Based on header v1.1.0, toy->struct_size is 36. ... - return se; + old_field = set_old_field(); + new_field1 = set_new_field1(); + new_field2 = set_new_field2(); + return toy; +} +// Consumer implementation has v1.0.0 headers. +void take_toy(const Toy* toy) { + // `new_field1` and `new_field2` are safely ignored + // because consumer doesn't know about them. } ``` + +If `producer` depends on `consumer` knowing about `new_field1` and `new_field2`, +adding `new_field1` and `new_field2` would be a backwards incompatible change +and `SE_MAJOR` should be bumped instead. -Using the same TF_Foo() above, TF_Foo() was implemented before SE_Device::data was added after SE_Device::device_handle. Since TensorFlow only knows about the members that come before SE_Device::data, the newly added device->data will not be accessed. - -## When TensorFlow deprecates functionality -When functionality is being deprecated, there will be comments next to the member indicating so. The member is left in place to preserve the alignment and offset of the existing structure members. - -Since members are not allowed to be removed or reordered, refactors (e.g. renaming device_handle to dev_handle) or changing of member types (e.g. from int to float) are considered as deprecation. - -### Backwards compatibility -SE_Device::data has been deprecated in version 4, and a comment in the header indicated as such. +## Deprecating Functionality + +When functionality is being deprecated, there will be comments next to the +member indicating so. The member is left in place to preserve the alignment and +offset of the existing structure members. General guidelines: +* Add comments saying which field will be deprecated. +* The minor update will still support `deprecating_feature` to allow time for + transition. This would be a good time to raise concerns on Github. +* After the transition time has passed, `deprecating_feature` can be removed in + a major update. -**Future TensorFlow compiled against StreamExecutorInterface v4** +Since members are not allowed to be removed or reordered, refactors (e.g., +renaming device_handle to dev_handle) or changing of member types (e.g., from +`int` to `float`) are considered as +[deprecation with extension](#Deprecation-with-extension). -```cpp -// StreamExecutorInterface header version 4 -typedef struct SE_Device { - size_t struct_size; - void* next; - const char* name; - size_t name_len; - void* device_handle; - void* data; // Deprecated -} SE_Device; +```diff +#define SE_MAJOR 1 +- #define SE_MINOR 1 ++ #define SE_MINOR 2 // Increment minor version. +#define SE_PATCH 0 -// Evaluates to 48 -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data) +typedef struct Toy { + size_t struct_size; + void* ext; // Free-form data set by plugin. + int32_t old_field; // Device index. +- void* new_field1; // NULL is no-op. ++ void* new_field1; // Deprecated. // NULL is no-op. + int new_field2; // 0 is no-op. +} Toy; + +// Evaluates to 36 +#define TOY_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, new_field2) +``` -// TF Implementation +To concisely cover compatibility of cases where structs are created by core +TensorFlow and by plug-ins, we will call the side that creates the struct +`producer`, and the side that takes the struct `consumer`. -void TF_Foo(const SE_Device* device) { - // TF checks for struct_size greater than 32. - if (device->struct_size > offsetof(SE_Device, device_handle)) { - // TF knows that device_handle can be safely accessed. - DoSomething(device->device_handle); - } +### Producer Has Older Header Files - // TensorFlow removes implementation to stop using deprecated functionality. - /* - if (device->struct_size > offsetof(SE_Device, data)) { - // TF knows that device->data can be safely accessed. - DoSomething(device->data); +```diff +// Producer implementation has v1.1.0 headers. +Toy* create_toy() { + Toy* toy = new Toy{TOY_STRUCT_SIZE}; + ... + old_field = set_old_field(); + new_field1 = set_new_field1(); + new_field2 = set_new_field2(); + return toy; +} +// Consumer implementation has v1.2.0 headers. +void take_toy(const Toy* toy) { +- // Consumer removes the code using `new_field1`. +- if (toy->struct_size > offsetof(Toy, new_field1) && new_field1 != NULL) { +- // Safe to access `new_field1`. +- } + if (toy->struct_size > offsetof(Toy, new_field2) && new_field2 != 0) { + // Safe to access `new_field2`. } - */ - } ``` -The plugin, being older, was initializing the recently deprecated SE_Device::data. Since TF_Foo() does not access it anymore, SE_Device::data will be safely ignored (even though it was initialized). - -### Forwards compatibility -Plugins may choose to support older TensorFlow releases that have deprecated functionality. - -In a simple case, TensorFlow is already performing input validation and capable of providing best effort forward compatibility with newer plugins. - -**Older TensorFlow compiled against StreamExecutorInterface v4 with data validation** +The producer, being older, initializes the recently deprecated `new_field1`. +Since `take_toy` does not access it anymore, `new_field1` will be safely ignored +(even though it was initialized). -```cpp -void TF_Foo(const SE_Device* device) { - ... - // TF checks for struct_size greater than offset of data, and also validates device->data. - if (device->struct_size > offsetof(SE_Device, data) && device->data != nullptr) { - // TF knows that data can be safely accessed. - DoSomething(device->data); +### Producer Has Newer Header Files + +```diff +// Producer implementation has v1.2.0 headers. +Toy* create_toy() { + Toy* toy = new Toy{TOY_STRUCT_SIZE}; ++ // `new_field1` is zero-initialized with the line above. + ... + old_field = set_old_field(); +- new_field1 = set_new_field1(); // Stops setting the deprecated `new_field1`. + new_field2 = set_new_field2(); + return toy; +} +// Consumer implementation has v1.1.0 headers. +void take_toy(const Toy* toy) { ++ // `new_field1` is `NULL` so it is safely ignored. ++ // Can also add code raise an error here when `NULL` is detected. + if (toy->struct_size > offsetof(Toy, new_field1) && new_field1 != NULL) { + // Safe to access `new_field1`. + } + if (toy->struct_size > offsetof(Toy, new_field2) && new_field2 != 0) { + // Safe to access `new_field2`. } } ``` - -This way, plugins can safely remove implementation of deprecated functionality. - -**Future Plugin compiled against StreamExecutorInterface v5** -```cpp -// StreamExecutorInterface header version 5 -typedef struct SE_Device { - size_t struct_size; - void* next; - - const char* name; - size_t name_len; - void* device_handle; - void* data; // Deprecated in v4 - void* data2; -} SE_Device; +This way, plug-ins can safely remove implementation of deprecated functionality. -// Evaluates to 56 -#define SE_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SE_Device, data2) +## Deprecation with Extension +This is the more common form of deprecation where the struct is extended with a +new attribute that replaces an existing one. The analysis is the same as +[Adding functionality](#Adding-functionality) and +[Deprecating functionality](#Deprecating-functionality) combined. +General guidelines: +* Add comments saying which field will be deprecated and which one will replace + it. +* Increment the minor version. +* The minor update will support both `name` and `better_name` to allow time for + transition. This would be a good time to raise concerns on Github. +* After the transition time has passed, `name` can be removed in a major update. -// Plugin Implementation +Below are some examples. -SE_Device* Plugin_CreateDevice() { - SE_Device* se = new SE_Device{ SE_DEVICE_STRUCT_SIZE }; - // se->struct_size will be 56 - se->device_handle = GetHandle(); +```diff +#define SE_MAJOR 5 +- #define SE_MINOR 0 ++ #define SE_MINOR 1 // Increment minor version +#define SE_PATCH 0 - // se->data was deprecated so ignore it. It was already zero initialized - // at “SE_Device{ SE_DEVICE_STRUCT_SIZE }” above. +// Case 1 - Renaming an attribute +typedef struct Device { + size_t struct_size; + void* ext; + int32_t ordinal; - se->data2 = GetData2(); - return se; -} +- const char* name; ++ const char* name; // Deprecating soon. Use `better_name`. + void* device_handle; + const char* better_name; // Replaces `name`. +} Device; + + +// Case 2 - Deprecation of an entire struct can be done without a replacement... ++ // `Device` struct will be deprecated soon. +typedef struct Device { +... +} Device; + +// ...or with a replacement ++ // Replaces `Device`. ++ typedef struct BetterDevice { ++ ... ++ } Device; + +// Case 3 - Renaming a function. +typedef struct ExportFunctions { +... ++ // create_device will be deprecated soon. + void (*create_device)(Device* device); + ++ // Replaces `create_device`. ++ void (*create_better_device)(BetterDevice* device); +} ExportFunctions; ``` - -In a more complex scenario, an older TensorFlow release might consume deprecated functionality for granted. - -**Older TensorFlow compiled against StreamExecutorInterface v4 without data validation** -```cpp -void TF_Foo(const SE_Device* device) { - ... - // TF checks for struct_size greater than offset of data. - // No input validation. - if (device->struct_size > offsetof(SE_Device, data)) { - // Will crash on null pointer dereference. - Dereference(device->data); - } -} -``` - -In this case, it is recommended for plugins to continue to keep the deprecated implementation around. Once the plugin stops supporting the latest version of TensorFlow that uses the deprecated functionality, the implementation can be safely removed. This comes at the cost of maintenance of legacy deprecated code on the plugin side. - -## Limitations +# Limitations * Maximum supported alignment is 8 bytes. From d1ec35114f43ba66b04d4afb494f106d932f7bfe Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 13:23:51 -0700 Subject: [PATCH 54/58] Fixes according to annarev@'s comments. Also some more minor edits. --- .../C_API_versioning_strategy.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md index 723ac8794..bf6b2308f 100644 --- a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md +++ b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md @@ -42,12 +42,14 @@ This section outlines when to update version numbers specific to SE C API eventually deprecating the member with the old name. * Fields that cannot be 0 or `NULL` can be deprecated in a backwards compatible manner by zero-initialization. - * Plug-ins must perform input validation on these fields for 0 and `NULL` - before consuming them. + * If the field is set by core TensorFlow, plug-ins must perform input validation + on these fields for 0 and `NULL` before accessing them. * Plug-ins know the fields are deprecated when they find 0 or `NULL` in these fields. + * If the field is set by plug-in, TF can check if the field is non-zero (or not + `NULL`) and print a warning if so. * Such fields must be explicitly marked by comments, to ensure all plug-ins - have consistent behavior (e.g., non of the plug-ins is using 0 or `NULL` as + have consistent behavior (e.g., none of the plug-ins is using 0 or `NULL` as a special case). See `// 0 is no-op` and `// NULL is no-op` in the [By value inspection](#By-value-inspection) section for example. @@ -240,7 +242,7 @@ renaming device_handle to dev_handle) or changing of member types (e.g., from `int` to `float`) are considered as [deprecation with extension](#Deprecation-with-extension). - +The following code snippet shows deprecation of `new_field1`. ```diff #define SE_MAJOR 1 - #define SE_MINOR 1 @@ -289,8 +291,8 @@ void take_toy(const Toy* toy) { ``` The producer, being older, initializes the recently deprecated `new_field1`. -Since `take_toy` does not access it anymore, `new_field1` will be safely ignored -(even though it was initialized). +Since consumer's `take_toy` does not access it anymore, `new_field1` will be +safely ignored (even though it was initialized). ### Producer Has Newer Header Files @@ -308,7 +310,7 @@ Toy* create_toy() { // Consumer implementation has v1.1.0 headers. void take_toy(const Toy* toy) { + // `new_field1` is `NULL` so it is safely ignored. -+ // Can also add code raise an error here when `NULL` is detected. ++ // Can also add code to raise an error here when `NULL` is detected. if (toy->struct_size > offsetof(Toy, new_field1) && new_field1 != NULL) { // Safe to access `new_field1`. } @@ -324,7 +326,7 @@ This way, plug-ins can safely remove implementation of deprecated functionality. This is the more common form of deprecation where the struct is extended with a new attribute that replaces an existing one. The analysis is the same as -[Adding functionality](#Adding-functionality) and +[Extending functionality](#Extending-functionality) and [Deprecating functionality](#Deprecating-functionality) combined. General guidelines: * Add comments saying which field will be deprecated and which one will replace From 1a6f2cf9a85e218caec00ec1ca5dd5789c07cced Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 18:03:06 -0700 Subject: [PATCH 55/58] Apply suggestions from code review Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index b915ef203..b0e748128 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -90,12 +90,12 @@ StreamExecutor C API follows Semantic Versioning 2.0.0 ([semver](http://semver.o Each release version has a format `MAJOR.MINOR.PATCH`, as outlined in [TensorFlow version compatibility](https://www.tensorflow.org/guide/versions#semantic_versioning_20). We also use struct sizes to track compatibility. More details on functionality -extension and deprecation in [StreamExecutor C API Versioning Strategy](20200612-stream-executor-c-api/C_API_versioning_strategy.md). +extension and deprecation can be found in [StreamExecutor C API Versioning Strategy](20200612-stream-executor-c-api/C_API_versioning_strategy.md). The C API will have an initial bake-in period, where we won’t have any compatibility guarantees. However, we will make the best effort to perform any updates in a backwards compatible way. For example, we plan to keep track of -struct sizes. +struct sizes. During this period, the API will be kept at `MAJOR` version 0. The C API will be placed in [tensorflow/c/experimental](https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/c/experimental/). We will consider moving the API out of the experimental directory once it is @@ -103,16 +103,16 @@ more stable. ## Implementation Conventions -* Struct prefix indicates whether struct fields should be filled by the plug-in or core implementation: +* Struct prefix indicates whether struct fields should be filled by the plug-in or core TensorFlow implementation: * `SE_`: Set/filled by core, unless marked otherwise. * `SP_`: Set/filled by plug-in, unless marked otherwise. * This prefix rule only applies to structures. Enumerations and methods are all prefixed with `SE_`. * Structs begin with two fields: * `size_t struct_size`: Stores the unpadded size of the struct. - * `void* ext`: A free-form field that can be populated by a plugin in `SP_*` structs or potential future extension points in `SE_` structs. -* We use `struct_size` for version checking. - * It is exempt from the `SE/SP` rule above and should be set both by core and plug-in. - * It can be checked to determine which struct fields are available for current version of TensorFlow. + * `void* ext`: A reserved field that may be populated by a plugin in `SP_*` structs or potential future extension points in `SE_` structs. Must be set to zero by default if it unused. +* We use `struct_size` for version checking by both core and plug-in. + * It is exempt from the `SE/SP` rule above and must be set both by core and plug-in. + * It can be checked programmatically to determine which struct fields are available in the structure. * For example, `create_device` function receives `SP_Device*` as input with `struct_size` populated by core. The plug-in is responsible for setting `struct_size` as well, along with all other fields. * When a member is added to a struct, the struct size definition must be updated to use the new last member of the struct. @@ -134,9 +134,9 @@ Core TensorFlow will register a new StreamExecutor platform as well as a new Ten 2. Core TensorFlow populates `SE_PlatformRegistrationParams` and passes it in a call to `SE_InitPlugin`. * In `SE_InitPlugin`, plug-in populates `SE_PlatformRegistrationParams::SP_Platform` and `SE_PlatformRegistrationParams::SP_PlatformFns`. 3. Core TensorFlow can now create a device, a stream executor, and a timer through functions in `SP_PlatformFns`. - * Core TensorFlow populates `SE_CreateDeviceParams` and pass it in a call to `SP_PlatformFns::create_device`. + * Core TensorFlow populates `SE_CreateDeviceParams` and pass it as a parameter to `SP_PlatformFns::create_device()`. * Plug-in populates `SE_CreateDeviceParams::SP_Device`. - * Core TensorFlow populates `SE_CreateStreamExecutorParams` and pass it in a call to `SP_PlatformFns::create_stream_executor`. + * Core TensorFlow populates `SE_CreateStreamExecutorParams` and pass it to `SP_PlatformFns::create_stream_executor()`. * Plug-in populates `SE_CreateStreamExecutorParams::SP_StreamExecutor`. * Core TensorFlow sets `struct_size` in `SP_Timer` and pass it in a call to `SP_PlatformFns::create_timer_fns`. * Plug-in populates `SP_TimerFns`. @@ -264,14 +264,14 @@ typedef struct SP_StreamExecutor { /*** ALLOCATION CALLBACKS ***/ // Synchronously allocates `size` bytes on the underlying platform and returns // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, - // nullptr is returned. + // NULL is returned. // `memory_space` is reserved for a potential future usage and should be set // to 0. void (*allocate)(const SP_Device* device, uint64_t size, int64_t memory_space, SP_DeviceMemoryBase* mem); // Deallocate the device memory previously allocated via this interface. - // Deallocation of a nullptr-representative value is permitted. + // Deallocation of a NULL representative value is permitted. void (*deallocate)(const SP_Device* device, SP_DeviceMemoryBase* memory); // Allocates a region of host memory and registers it with the platform API. From 0218043d6019af519adc7acd87b10a571089e4d0 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 21:42:15 -0700 Subject: [PATCH 56/58] Apply suggestions from code review Co-authored-by: Situ Yi <60493024+yisitu@users.noreply.github.com> --- rfcs/20200612-stream-executor-c-api.md | 15 ++++++++++----- .../C_API_versioning_strategy.md | 14 +++++++------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index b0e748128..e918f225e 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -231,7 +231,7 @@ typedef struct SP_DeviceMemoryBase { } SP_DeviceMemoryBase; #define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ - TF_OFFSET_OF_END(SP_DeviceMemoryBase, size) + TF_OFFSET_OF_END(SP_DeviceMemoryBase, payload) typedef struct SP_Device { size_t struct_size; @@ -285,7 +285,7 @@ typedef struct SP_StreamExecutor { // Allocates unified memory space of the given size, if supported. Unified // memory support should be added by setting `supports_unified_memory` field // in `SP_Platform`. - void* (*unified_memory_allocate)(const SP_Device* device, uint64_t bytes); + void* (*unified_memory_allocate)(const SP_Device* device, uint64_t size); // Deallocates unified memory space previously allocated with // `unified_memory_allocate`. Unified @@ -419,7 +419,7 @@ typedef struct SP_StreamExecutor { void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); // Enqueues on a stream a user-specified function to be run on the host. - // `callback_arg` should be passed as the first argument to `callback_fn`. + // `callback_arg` shall be passed as the first argument to `callback_fn`. TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, SE_StatusCallbackFn callback_fn, void* callback_arg); } SP_StreamExecutor; @@ -448,7 +448,7 @@ typedef struct SP_Platform { // Device type name, for example GPU. Must be null-terminated. const char* type; - // Number of visible devices + // Number of visible devices. size_t visible_device_count; // Whether this platform supports unified memory. @@ -535,6 +535,7 @@ registration outlined in the [Usage Overview](#Usage overview) section. typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams*, TF_Status*); ... +// On Windows, use `GetProcAddress` instead of `dlsym`. void* initialize_sym = dlsym(plugin_dso_handle, "SE_InitPlugin"); if (!initialize_sym) { // Output error and skip this plug-in. @@ -542,7 +543,9 @@ if (!initialize_sym) { SEInitPluginFn initialize_fn = reinterpret_cast(initialize_sym); SE_PlatformRegistrationParams params; -TF_Status* status = TF_NewStatus(); +TF_Status status; + +initialize_fn(¶ms, &status); initialize_fn(¶ms, status); @@ -603,6 +606,8 @@ void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { std::string name = "MyDevice"; std::string type = "GPU"; + // Sets struct_size to a valid value, and zero initializes other attributes. + *params = { SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE }; params->platform->name = name.c_str(); params->platform->type = type.c_str(); params->platform->visible_device_count = visible_device_count; diff --git a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md index bf6b2308f..de922bdca 100644 --- a/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md +++ b/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md @@ -51,7 +51,7 @@ This section outlines when to update version numbers specific to SE C API * Such fields must be explicitly marked by comments, to ensure all plug-ins have consistent behavior (e.g., none of the plug-ins is using 0 or `NULL` as a special case). See `// 0 is no-op` and `// NULL is no-op` in the - [By value inspection](#By-value-inspection) section for example. + [By value inspection](#by-value-inspection) section for example. ## Detecting Incompatibility @@ -116,13 +116,13 @@ manner by leaving the attribute zero initialized. For example, ```c++ -struct example { - int cannot_be_zero; // 0 is no-op. - void* cannot_be_null; // NULL is no-op. - int can_be_zero; +struct Example { + int32_t cannot_be_zero; // 0 is no-op. + void* cannot_be_null; // NULL is no-op. + int32_t can_be_zero; void* can_be_null; - int optional_zero_default; // Optional. 0 by default. - void* optional_null_default; // Optional. NULL by default. + int32_t optional_zero_default; // Optional. 0 by default. + void* optional_null_default; // Optional. NULL by default. }; ``` * `cannot_be_zero` and `cannot_be_null` here can be deprecated by From cc0f721764de733951438f9664ea05b61bf62e47 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> Date: Wed, 9 Sep 2020 22:03:48 -0700 Subject: [PATCH 57/58] More clean-ups. Changed "shall" to "must" to make sure people would not misinterpret "shall". --- rfcs/20200612-stream-executor-c-api.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index e918f225e..88f95b8af 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -419,7 +419,7 @@ typedef struct SP_StreamExecutor { void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); // Enqueues on a stream a user-specified function to be run on the host. - // `callback_arg` shall be passed as the first argument to `callback_fn`. + // `callback_arg` must be passed as the first argument to `callback_fn`. TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, SE_StatusCallbackFn callback_fn, void* callback_arg); } SP_StreamExecutor; @@ -452,7 +452,8 @@ typedef struct SP_Platform { size_t visible_device_count; // Whether this platform supports unified memory. - // Unified memory is a single memory address space accessible from any device. + // Unified memory is a single memory address space that virtualizes device and + // host memory addresses. It is accessible to both the device and host. TF_Bool supports_unified_memory; } SP_Platform; @@ -546,8 +547,6 @@ SE_PlatformRegistrationParams params; TF_Status status; initialize_fn(¶ms, &status); - -initialize_fn(¶ms, status); // Register new platform std::unique_ptr platform( From 21be6e9065d5c3d8bc811be3f8e887e87276931f Mon Sep 17 00:00:00 2001 From: Anna Revinskaya Date: Wed, 16 Sep 2020 17:19:54 -0700 Subject: [PATCH 58/58] Custom and default allocator added --- rfcs/20200612-stream-executor-c-api.md | 188 ++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 6 deletions(-) diff --git a/rfcs/20200612-stream-executor-c-api.md b/rfcs/20200612-stream-executor-c-api.md index 88f95b8af..97bec78b7 100644 --- a/rfcs/20200612-stream-executor-c-api.md +++ b/rfcs/20200612-stream-executor-c-api.md @@ -437,6 +437,151 @@ typedef struct SE_CreateStreamExecutorParams { #define SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE \ TF_OFFSET_OF_END(SE_CreateStreamExecutorParams, stream_executor) +typedef struct SP_Allocator { + size_t struct_size; + void* ext; // free-form field set by plugin. + + // Whether this platform supports unified memory. + // Unified memory is a single memory address space accessible from any device. + TF_Bool supports_unified_memory; +} SP_Allocator; + +#define SP_ALLOCATOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_Allocator, supports_unified_memory) + +typedef struct SP_AllocatorFns { + size_t struct_size; + void* ext; // reserved for future use. + + // Synchronously allocates `size` bytes on the underlying platform and returns + // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, + // nullptr is returned. + // `memory_space` is reserved for a potential future usage and should be set + // to 0. + void (*allocate)(const SP_Device* device, const SP_Allocator* allocator, + uint64_t size, int64_t memory_space, + SP_DeviceMemoryBase* mem); + + // Deallocate the device memory previously allocated via this interface. + // Deallocation of a nullptr-representative value is permitted. + void (*deallocate)(const SP_Device* device, const SP_Allocator* allocator, + SP_DeviceMemoryBase* memory); + + // Allocates a region of host memory and registers it with the platform API. + // Memory allocated in this manner is required for use in asynchronous memcpy + // operations, such as `memcpy_dtoh`. + void* (*host_memory_allocate)(const SP_Device* device, + const SP_Allocator* allocator, uint64_t size); + + // Deallocates a region of host memory allocated by `host_memory_allocate`. + void (*host_memory_deallocate)(const SP_Device* device, + const SP_Allocator* allocator, void* mem); + + // Allocates unified memory space of the given size, if supported. Unified + + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void* (*unified_memory_allocate)(const SP_Device* device, + const SP_Allocator* allocator, + uint64_t bytes); + + // Deallocates unified memory space previously allocated with + // `unified_memory_allocate`. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void (*unified_memory_deallocate)(const SP_Device* device, + const SP_Allocator* allocator, + void* location); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + const SP_Allocator* allocator, + SP_AllocatorStats* stats); + + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, + const SP_Allocator* allocator, int64_t* free, + int64_t* total); +} SP_AllocatorFns; + +#define SP_ALLOCATOR_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_AllocatorFns, device_memory_usage) + +typedef struct SP_CustomAllocator { + size_t struct_size; + void* ext; // free-form data set by plugin +} SP_CustomAllocator; + +#define SP_CUSTOM_ALLOCATOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_CustomAllocator, ext) + +typedef struct SP_CustomAllocatorFns { + size_t struct_size; + void* ext; // reserved for future use + + // Synchronously allocates `size` bytes on the underlying platform and returns + // a pointer to that allocation. In the case of failure, + // nullptr is returned. + void* (*allocate_raw)(const SP_Device* device, + const SP_CustomAllocator* allocator, size_t size, + size_t alignment); + + // Deallocate the device memory previously allocated via `allocate_raw`. + // Deallocation of a nullptr-representative value is permitted. + void (*deallocate_raw)(const SP_Device* device, + const SP_CustomAllocator* allocator, void* ptr); + + // Allocates a region of host memory. + void* (*host_allocate_raw)(const SP_Device* device, + const SP_CustomAllocator* allocator, + uint64_t size); + + // Deallocates a region of host memory allocated by `host_allocate_raw`. + void (*host_deallocate_raw)(const SP_Device* device, + const SP_CustomAllocator* allocator, void* mem); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + const SP_CustomAllocator* allocator, + SP_AllocatorStats* stats); + + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, + const SP_CustomAllocator* allocator, + int64_t* free, int64_t* total); +} SP_CustomAllocatorFns; + +#define SP_CUSTOM_ALLOCATOR_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_CustomAllocatorFns, device_memory_usage) + +typedef struct SE_CreateAllocatorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_Allocator* allocator; + SP_AllocatorFns* allocator_fns; +} SE_CreateAllocatorParams; + +#define SE_CREATE_ALLOCATOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateAllocatorParams, allocator_fns) + +typedef struct SE_CreateCustomAllocatorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_CustomAllocator* custom_allocator; + SP_CustomAllocatorFns* custom_allocator_fns; +} SE_CreateCustomAllocatorParams; + +#define SE_CREATE_CUSTOM_ALLOCATOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateCustomAllocatorParams, custom_allocator_fns) + typedef struct SP_Platform { size_t struct_size; @@ -450,15 +595,10 @@ typedef struct SP_Platform { // Number of visible devices. size_t visible_device_count; - - // Whether this platform supports unified memory. - // Unified memory is a single memory address space that virtualizes device and - // host memory addresses. It is accessible to both the device and host. - TF_Bool supports_unified_memory; } SP_Platform; #define SP_PLATFORM_STRUCT_SIZE \ - TF_OFFSET_OF_END(SP_Platform, supports_unified_memory) + TF_OFFSET_OF_END(SP_Platform, visible_device_count) typedef struct SP_PlatformFns { size_t struct_size; @@ -488,11 +628,38 @@ typedef struct SP_PlatformFns { void (*destroy_timer_fns)(const SP_Platform* platform, SP_TimerFns* timer_fns); + + // Set only one of `create_allocator` or `create_custom_allocator` functions + // below. + + // Callback for creating an allocator that uses default TensorFlow allocation + // strategy (BFC: best-fit with coalescing). For more details, see + // https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/core/common_runtime/bfc_allocator.h. + // If `create_allocator` is set, then `create_custom_allocator` should *not* + // be set. + void (*create_allocator)(const SP_Platform* platform, + SE_CreateAllocatorParams* params, TF_Status* status); + void (*destroy_allocator)(const SP_Platform* platform, + SP_Allocator* allocator, + SP_AllocatorFns* allocator_fns); + + // Callback for creating a custom allocator. Allows using a custom allocation + // strategy. + // If `create_custom_allocator` is set, then `create_allocator` should *not* + // be set. + // Note: deallocator functions must be set in params. + void (*create_custom_allocator)(const SP_Platform* platform, + SE_CreateCustomAllocatorParams* params, + TF_Status* status); + void (*destroy_custom_allocator)(const SP_Platform* platform, + SP_CustomAllocator* allocator, + SP_CustomAllocatorFns* allocator_fns); } SP_PlatformFns; #define SP_PLATFORM_FNS_STRUCT_SIZE \ TF_OFFSET_OF_END(SP_PlatformFns, destroy_timer_fns) + typedef struct SE_PlatformRegistrationParams { size_t struct_size; void* ext; // reserved for future use @@ -584,6 +751,10 @@ void create_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns, timer_fns->nanoseconds = nanoseconds; ... } +void create_allocator(const SP_Platform* platform, SP_CreateAllocatorParams* params, + TF_Status* status) { + ... +} void destroy_device(const SP_Platform* platform, SP_Device* device) { // Destroy device handle here. } @@ -594,6 +765,9 @@ void destroy_stream_executor(const SP_Platform* platform, void destroy_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns) { // Destroy timer functions here. } +void destroy_allocator(const SP_Platform* platform, SP_Allocator* allocator, SP_AllocatorFns* allocator_fns) { + // Clean up allocator here. +} ``` Define `SE_InitPlugin` that TensorFlow will call when registering the device @@ -616,6 +790,8 @@ void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status) { params->platform_fns->destroy_stream_executor = destroy_stream_executor; params->platform_fns->create_timer_fns = create_timer_fns; params->platform_fns->destroy_timer_fns = destroy_timer_fns; + params->platform_fns->create_allocator = create_allocator; + params->platform_fns->destroy_allocator = destroy_allocator; } ```