diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md old mode 100644 new mode 100755 index 8211fd174e5e1..380fc18932289 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -238,6 +238,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` | Integer | When set to a positive value enables use of Level Zero immediate commandlists, which means there is no batching and all commands are immediately submitted for execution. Default is 0. Note: When immediate commandlist usage is enabled it is necessary to also set SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS to either 0 or 1. | | `SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS` | Integer | When set to a positive value enables use of multiple Level Zero commandlists when submitting barriers. Default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL` | Integer | When set to a positive value enables use of a copy engine for memory fill operations. Default is 0. | +| `SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION` | Integer | When set to 0, a single root-device allocation is used for all devices in a context where all devices have the same root-device. Otherwise regular buffer migration is performed. Default is 1. | ## Debugging variables for CUDA Plugin diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index d70c77ca39796..659675af7c6db 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -8812,10 +8812,29 @@ pi_result _pi_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, LastDeviceWithValidAllocation = Device; return PI_SUCCESS; } + // Reads user setting on how to deal with buffers in contexts where + // all devices have the same root-device. Returns "true" if the + // preference is to allocate on each [sub-]device and migrate + // normally (copy) to other sub-devices as needed. 
Returns "false" + // if the preference is to have single root-device allocations + // serve the needs of all [sub-]devices, meaning potentially more + // cross-tile traffic. + // + static const bool SingleRootDeviceBufferMigration = [] { + const char *EnvStr = + std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); + if (EnvStr) + return (std::stoi(EnvStr) != 0); + // The default is to migrate normally, which may not always be the + // best option (depends on buffer access patterns), but is an + // overall win on the set of the available benchmarks. + return true; + }(); // Peform actual device allocation as needed. if (!Allocation.ZeHandle) { - if (Context->SingleRootDevice && Context->SingleRootDevice != Device) { + if (!SingleRootDeviceBufferMigration && Context->SingleRootDevice && + Context->SingleRootDevice != Device) { // If all devices in the context are sub-devices of the same device // then we reuse root-device allocation by all sub-devices in the // context.