[refactor] Add ArrayMetadata to store the array runtime size #4950

Merged
8 commits merged on May 11, 2022
Changes from 3 commits
25 changes: 16 additions & 9 deletions taichi/backends/cuda/codegen_cuda.cpp
@@ -65,13 +65,16 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
bool transferred = false;
for (int i = 0; i < (int)args.size(); i++) {
if (args[i].is_array) {
if (args[i].size == 0)
const auto &arr_meta = context.array_metadata[i];
const auto arr_runtime_sz = arr_meta.runtime_size;
if (arr_runtime_sz == 0) {
continue;
}
arg_buffers[i] = context.get_arg<void *>(i);
if (!context.is_device_allocation[i]) {
if (!arr_meta.is_device_allocation) {
// Note: both numpy and PyTorch support arrays/tensors with zeros
// in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes
// args[i].size = 0.
// `arr_runtime_sz` zero.
unsigned int attr_val = 0;
uint32_t ret_code =
CUDADriver::get_instance().mem_get_attribute.call(
@@ -87,18 +90,18 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
// See CUDA driver API `cuPointerGetAttribute` for more details.
transferred = true;
CUDADriver::get_instance().malloc(&device_buffers[i],
args[i].size);
arr_runtime_sz);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)device_buffers[i], arg_buffers[i], args[i].size);
(void *)device_buffers[i], arg_buffers[i], arr_runtime_sz);
} else {
device_buffers[i] = arg_buffers[i];
}
// device_buffers[i] saves a raw ptr on CUDA device.
ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
args[i].size,
arr_runtime_sz,
/*is_device_allocation=*/false);

} else if (args[i].size > 0) {
} else if (arr_runtime_sz > 0) {
// arg_buffers[i] is a DeviceAllocation*
// TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since
// it's shared by cpu and cuda.
@@ -110,11 +113,13 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
// if transfer happened.
// TODO: this logic can be improved but I'll leave it to a followup
// PR.
// FIXME: What if after the kernel launch, we still need the old
// `arg_buffers[i]`?
arg_buffers[i] = device_buffers[i];

// device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i]
ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
args[i].size,
arr_runtime_sz,
/*is_device_allocation=*/false);
}
}
@@ -134,8 +139,10 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
CUDADriver::get_instance().stream_synchronize(nullptr);
for (int i = 0; i < (int)args.size(); i++) {
if (device_buffers[i] != arg_buffers[i]) {
TI_ASSERT(args[i].is_array);
CUDADriver::get_instance().memcpy_device_to_host(
arg_buffers[i], (void *)device_buffers[i], args[i].size);
arg_buffers[i], (void *)device_buffers[i],
context.array_metadata[i].runtime_size);
CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
}
}
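For readers following the CUDA changes above, here is a minimal, self-contained sketch of the per-argument decision the launch path now makes from ArrayMetadata. The CUDA driver calls are replaced by hypothetical host-backed stubs so the sketch compiles on its own, and the cuPointerGetAttribute check from the real code is omitted; only the branching on runtime_size and is_device_allocation mirrors the diff.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical stand-ins for CUDADriver::malloc / memcpy_host_to_device,
// backed by host memory so this sketch builds without CUDA.
static void *fake_cuda_malloc(std::uint64_t size) {
  return std::malloc(size);
}
static void fake_cuda_memcpy_h2d(void *dst, const void *src,
                                 std::uint64_t size) {
  std::memcpy(dst, src, size);
}

struct ArrayMetadata {
  std::uint64_t runtime_size{0};
  bool is_device_allocation{false};
};

// Returns the pointer the kernel should see for one array argument, copying
// host data to a (fake) device buffer when needed. `transferred` reports
// whether a copy-back will be required after the launch.
void *prepare_array_arg(const ArrayMetadata &meta, void *arg_buffer,
                        bool &transferred) {
  if (meta.runtime_size == 0) {
    return nullptr;  // zero-sized shapes such as (100, 0, 200) are skipped
  }
  if (meta.is_device_allocation) {
    return arg_buffer;  // already a DeviceAllocation*, unwrapped elsewhere
  }
  transferred = true;
  void *device_buffer = fake_cuda_malloc(meta.runtime_size);
  fake_cuda_memcpy_h2d(device_buffer, arg_buffer, meta.runtime_size);
  return device_buffer;
}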
8 changes: 5 additions & 3 deletions taichi/codegen/codegen_llvm.cpp
@@ -2296,14 +2296,16 @@ FunctionType CodeGenLLVM::compile_module_to_executable() {
// For taichi ndarrays, context.args saves a pointer to the
// |DeviceAllocation|; the CPU backend actually wants to use the raw ptr here.
for (int i = 0; i < (int)args.size(); i++) {
if (args[i].is_array && context.is_device_allocation[i] &&
args[i].size > 0) {
const auto &arr_meta = context.array_metadata[i];
if (args[i].is_array && arr_meta.is_device_allocation &&
arr_meta.runtime_size > 0) {
DeviceAllocation *ptr =
static_cast<DeviceAllocation *>(context.get_arg<void *>(i));
uint64 host_ptr = (uint64)kernel->program->get_llvm_program_impl()
->get_ndarray_alloc_info_ptr(*ptr);
context.set_arg(i, host_ptr);
context.set_device_allocation(i, false);
context.set_array_is_device_allocation(i,
/*is_device_allocation=*/false);
}
}
for (auto task : offloaded_tasks_local) {
1 change: 0 additions & 1 deletion taichi/program/callable.h
@@ -30,7 +30,6 @@ class TI_DLL_EXPORT Callable {
std::vector<int> element_shape = {})
: dt(dt),
is_array(is_array),
size(size),
total_dim(total_dim),
element_shape(std::move(element_shape)) {
}
20 changes: 14 additions & 6 deletions taichi/program/context.h
@@ -23,7 +23,11 @@ struct RuntimeContext {
int32 extra_args[taichi_max_num_args_extra][taichi_max_num_indices];
int32 cpu_thread_id;
// |is_device_allocation| is true iff args[i] is a DeviceAllocation*.
bool is_device_allocation[taichi_max_num_args_total]{false};
struct ArrayMetadata {
uint64 runtime_size{0};
bool is_device_allocation{false};
};
ArrayMetadata array_metadata[taichi_max_num_args_total];
// We move the pointer of result buffer from LLVMRuntime to RuntimeContext
// because each real function needs a place to store its result, but
// LLVMRuntime is shared among functions. So we moved the pointer to
@@ -45,11 +49,15 @@ struct RuntimeContext {
template <typename T>
void set_arg(int i, T v) {
args[i] = taichi_union_cast_with_different_sizes<uint64>(v);
set_device_allocation(i, false);
set_array_is_device_allocation(i, /*is_device_allocation=*/false);
}

void set_device_allocation(int i, bool is_device_allocation_) {
is_device_allocation[i] = is_device_allocation_;
void set_array_runtime_size(int i, uint64 size) {
array_metadata[i].runtime_size = size;
}

void set_array_is_device_allocation(int i, bool is_device_allocation) {
array_metadata[i].is_device_allocation = is_device_allocation;
}

template <typename T>
Expand All @@ -61,7 +69,7 @@ struct RuntimeContext {
DeviceAllocation &alloc,
const std::vector<int> &shape) {
args[arg_id] = taichi_union_cast_with_different_sizes<uint64>(&alloc);
set_device_allocation(arg_id, true);
set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true);
TI_ASSERT(shape.size() <= taichi_max_num_indices);
for (int i = 0; i < shape.size(); i++) {
extra_args[arg_id][i] = shape[i];
Expand All @@ -73,7 +81,7 @@ struct RuntimeContext {
const std::vector<int> &shape,
const std::vector<int> &element_shape) {
args[arg_id] = taichi_union_cast_with_different_sizes<uint64>(&alloc);
set_device_allocation(arg_id, true);
set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true);
TI_ASSERT(shape.size() + element_shape.size() <= taichi_max_num_indices);
for (int i = 0; i < shape.size(); i++) {
extra_args[arg_id][i] = shape[i];
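As a quick reference, the snippet below reduces the new RuntimeContext pieces to a self-contained sketch: the per-argument ArrayMetadata slot plus its two setters. The argument-count bound and the uint64 alias are stand-ins for Taichi's real definitions.

#include <cstdint>

namespace sketch {

using uint64 = std::uint64_t;
constexpr int kMaxNumArgs = 64;  // stand-in for taichi_max_num_args_total

struct RuntimeContext {
  uint64 args[kMaxNumArgs]{};

  // Runtime facts about an array argument that are only known at launch time.
  struct ArrayMetadata {
    uint64 runtime_size{0};            // byte size of the array payload
    bool is_device_allocation{false};  // true iff args[i] is a DeviceAllocation*
  };
  ArrayMetadata array_metadata[kMaxNumArgs];

  void set_array_runtime_size(int i, uint64 size) {
    array_metadata[i].runtime_size = size;
  }
  void set_array_is_device_allocation(int i, bool is_device_allocation) {
    array_metadata[i].is_device_allocation = is_device_allocation;
  }
};

}  // namespace sketch

int main() {
  sketch::RuntimeContext ctx;
  // A host-side ndarray of 100 floats passed as argument 0:
  ctx.set_array_runtime_size(0, 100 * sizeof(float));
  ctx.set_array_is_device_allocation(0, /*is_device_allocation=*/false);
  return ctx.array_metadata[0].runtime_size == 400 ? 0 : 1;
}

Grouping runtime_size and is_device_allocation in one struct means the size and the allocation kind of an argument are set and read together, instead of living in a separate is_device_allocation array and an args[i].size field.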
11 changes: 8 additions & 3 deletions taichi/program/kernel.cpp
@@ -238,17 +238,21 @@ void Kernel::LaunchContextBuilder::set_arg_external_array(
ActionArg("address", fmt::format("0x{:x}", ptr)),
ActionArg("array_size_in_bytes", (int64)size)});

// FIXME(https://github.com/taichi-dev/taichi/issues/4949): Make the Metal
// backend support Ndarray, then remove this line below.
kernel_->args[arg_id].size = size;
ctx_->set_arg(arg_id, ptr);
ctx_->set_device_allocation(arg_id, is_device_allocation);
ctx_->set_array_runtime_size(arg_id, size);
ctx_->set_array_is_device_allocation(arg_id, is_device_allocation);
}

void Kernel::LaunchContextBuilder::set_arg_external_array_with_shape(
int arg_id,
uintptr_t ptr,
uint64 size,
const std::vector<int64> &shape) {
this->set_arg_external_array(arg_id, ptr, size, false);
this->set_arg_external_array(arg_id, ptr, size,
/*is_device_allocation=*/false);
TI_ASSERT_INFO(shape.size() <= taichi_max_num_indices,
"External array cannot have > {max_num_indices} indices");
for (uint64 i = 0; i < shape.size(); ++i) {
@@ -260,7 +264,8 @@ void Kernel::LaunchContextBuilder::set_arg_ndarray(int arg_id,
const Ndarray &arr) {
intptr_t ptr = arr.get_device_allocation_ptr_as_int();
uint64 arr_size = arr.get_element_size() * arr.get_nelement();
this->set_arg_external_array(arg_id, ptr, arr_size, true);
this->set_arg_external_array(arg_id, ptr, arr_size,
/*is_device_allocation=*/true);
TI_ASSERT_INFO(arr.shape.size() <= taichi_max_num_indices,
"External array cannot have > {max_num_indices} indices");
for (uint64 i = 0; i < arr.shape.size(); ++i) {
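The byte size recorded for an ndarray is simply element size times element count. The self-contained illustration below uses stand-in names (not the Taichi API); the resulting value is what set_arg_ndarray passes to set_arg_external_array and what ultimately lands in set_array_runtime_size.

#include <cstdint>
#include <numeric>
#include <vector>

// Stand-in for the pieces of Ndarray used in the size computation above.
struct NdarraySketch {
  std::vector<int> shape;
  std::size_t element_size;  // bytes per element, e.g. sizeof(float)

  std::size_t nelement() const {
    return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                           [](std::size_t acc, int dim) {
                             return acc * static_cast<std::size_t>(dim);
                           });
  }
};

int main() {
  NdarraySketch arr{{128, 3}, sizeof(float)};
  // Mirrors `arr.get_element_size() * arr.get_nelement()` in the diff above.
  std::uint64_t arr_size =
      static_cast<std::uint64_t>(arr.element_size) * arr.nelement();
  return arr_size == 128u * 3u * sizeof(float) ? 0 : 1;
}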
23 changes: 14 additions & 9 deletions taichi/runtime/opengl/opengl_api.cpp
@@ -232,6 +232,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) {
for (int i = 0; i < arg_count; i++) {
const auto dtype_name = kernel->args[i].dt.to_string();
if (kernel->args[i].is_array) {
constexpr uint64 kUnkownRuntimeSize = 0;
arr_args[i] = CompiledArrayArg(
{/*dtype_enum=*/to_gl_dtype_enum(kernel->args[i].dt), dtype_name,
/*field_dim=*/kernel->args[i].total_dim -
@@ -240,7 +241,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) {
/*element_shape=*/kernel->args[i].element_shape,
/*shape_offset_in_bytes_in_args_buf=*/taichi_opengl_extra_args_base +
i * taichi_max_num_indices * sizeof(int),
/*total_size=*/kernel->args[i].size});
kUnkownRuntimeSize});
} else {
scalar_args[i] = ScalarArg(
{dtype_name, /*offset_in_bytes_in_args_buf=*/i * sizeof(uint64_t)});
@@ -400,23 +401,26 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
for (auto &item : program_.arr_args) {
int i = item.first;
TI_ASSERT(args[i].is_array);
if (args[i].size == 0 || ctx.is_device_allocation[i])
const auto &arr_meta = ctx.array_metadata[i];
const auto arr_runtime_sz = arr_meta.runtime_size;
if ((arr_runtime_sz == 0) || arr_meta.is_device_allocation) {
continue;
}
has_ext_arr = true;
if (args[i].size != item.second.total_size ||
if (arr_runtime_sz != item.second.runtime_size ||
ext_arr_bufs_[i] == kDeviceNullAllocation) {
if (ext_arr_bufs_[i] != kDeviceNullAllocation) {
device_->dealloc_memory(ext_arr_bufs_[i]);
}
ext_arr_bufs_[i] = device_->allocate_memory(
{args[i].size, /*host_write=*/true, /*host_read=*/true,
{arr_runtime_sz, /*host_write=*/true, /*host_read=*/true,
/*export_sharing=*/false});
item.second.total_size = args[i].size;
item.second.runtime_size = arr_runtime_sz;
}
void *host_ptr = (void *)ctx.args[i];
void *baseptr = device_->map(ext_arr_bufs_[i]);
if (program_.check_ext_arr_read(i)) {
std::memcpy((char *)baseptr, host_ptr, args[i].size);
std::memcpy((char *)baseptr, host_ptr, arr_runtime_sz);
}
device_->unmap(ext_arr_bufs_[i]);
}
@@ -468,7 +472,7 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
// On most devices this number is 8. But I need to look up how
// to query this information, so currently this is thrown from OpenGL.
for (const auto [arg_id, bind_id] : program_.used.arr_arg_to_bind_idx) {
if (ctx.is_device_allocation[arg_id]) {
if (ctx.array_metadata[arg_id].is_device_allocation) {
DeviceAllocation *ptr =
static_cast<DeviceAllocation *>((void *)ctx.args[arg_id]);

@@ -503,9 +507,10 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
if (has_ext_arr) {
for (auto &item : program_.arr_args) {
int i = item.first;
if (args[i].size != 0 && !ctx.is_device_allocation[i]) {
const auto &arr_meta = ctx.array_metadata[i];
if (arr_meta.runtime_size != 0 && !arr_meta.is_device_allocation) {
uint8_t *baseptr = (uint8_t *)device_->map(ext_arr_bufs_[i]);
memcpy((void *)ctx.args[i], baseptr, args[i].size);
memcpy((void *)ctx.args[i], baseptr, arr_meta.runtime_size);
device_->unmap(ext_arr_bufs_[i]);
}
}
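The OpenGL path above keeps one staging buffer per external array argument and reallocates it only when the runtime size changes. A simplified, self-contained sketch of that reuse policy follows; allocation is stubbed with malloc/free, whereas the real code goes through Device::allocate_memory, dealloc_memory, map, and unmap.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Simplified model of the per-argument external-array staging buffer.
struct ExtArrStagingBuffer {
  void *buffer{nullptr};
  std::uint64_t cached_size{0};

  // Reallocate only when the runtime size changed, then copy the host data in.
  void upload(const void *host_ptr, std::uint64_t runtime_size) {
    if (runtime_size != cached_size || buffer == nullptr) {
      std::free(buffer);
      buffer = std::malloc(runtime_size);
      cached_size = runtime_size;
    }
    std::memcpy(buffer, host_ptr, runtime_size);
  }

  // Copy the (possibly kernel-modified) contents back to the host pointer.
  void download(void *host_ptr) const {
    std::memcpy(host_ptr, buffer, cached_size);
  }

  ~ExtArrStagingBuffer() { std::free(buffer); }
};

int main() {
  float host[4] = {1, 2, 3, 4};
  ExtArrStagingBuffer staging;
  staging.upload(host, sizeof(host));  // first launch: allocates
  staging.upload(host, sizeof(host));  // same size: buffer is reused
  staging.download(host);
  return 0;
}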
2 changes: 1 addition & 1 deletion taichi/runtime/opengl/opengl_api.h
@@ -66,7 +66,7 @@ struct CompiledArrayArg {
bool is_scalar{false};
std::vector<int> element_shape;
size_t shape_offset_in_bytes_in_args_buf{0};
size_t total_size{0}; // Runtime information
size_t runtime_size{0}; // Runtime information

TI_IO_DEF(field_dim,
is_scalar,
15 changes: 10 additions & 5 deletions taichi/runtime/vulkan/runtime.cpp
@@ -79,7 +79,8 @@ class HostDeviceContextBlitter {
char *device_ptr = device_base + arg.offset_in_mem;
do {
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
// Only need to blit ext arrs (host array)
DeviceAllocation buffer = ext_arrays.at(i);
char *const device_arr_ptr =
@@ -150,7 +151,8 @@ class HostDeviceContextBlitter {
for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
const auto &arg = ctx_attribs_->args()[i];
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
require_sync = true;
}
}
@@ -166,7 +168,8 @@ class HostDeviceContextBlitter {
for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
const auto &arg = ctx_attribs_->args()[i];
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
// Only need to blit ext arrs (host array)
DeviceAllocation buffer = ext_arrays.at(i);
char *const device_arr_ptr =
@@ -455,7 +458,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
const auto &args = ti_kernel->ti_kernel_attribs().ctx_attribs.args();
for (auto &arg : args) {
if (arg.is_array) {
if (host_ctx->is_device_allocation[i]) {
if (host_ctx->array_metadata[i].is_device_allocation) {
// NDArray
if (host_ctx->args[i]) {
any_arrays[i] = *(DeviceAllocation *)(host_ctx->args[i]);
Expand All @@ -464,6 +467,8 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
}
} else {
// Compute ext arr sizes
// TODO: Consider using `arr_metadata.runtime_size` instead of
// computing on our own?
size_t size = arg.stride;
bool has_zero_axis = false;

@@ -546,7 +551,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
// Dealloc external arrays
for (auto pair : any_arrays) {
if (pair.second != kDeviceNullAllocation) {
if (!host_ctx->is_device_allocation[pair.first]) {
if (!host_ctx->array_metadata[pair.first].is_device_allocation) {
device_->dealloc_memory(pair.second);
}
}
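The "Compute ext arr sizes" branch above multiplies the element stride by each axis length and treats any zero axis as an empty array. A minimal sketch of that computation, with stand-in names, is shown below; per the TODO in the diff, it may eventually be replaced by reading array_metadata[i].runtime_size directly.

#include <cstddef>
#include <vector>

// Sketch of the external-array size computation: stride times every axis,
// with any zero axis (e.g. shape=(100, 0, 200)) collapsing the size to zero.
std::size_t compute_ext_arr_size(std::size_t stride,
                                 const std::vector<int> &shape) {
  std::size_t size = stride;
  for (int axis : shape) {
    if (axis == 0) {
      return 0;  // nothing to allocate or blit for this argument
    }
    size *= static_cast<std::size_t>(axis);
  }
  return size;
}

int main() {
  bool ok = compute_ext_arr_size(sizeof(float), {100, 0, 200}) == 0 &&
            compute_ext_arr_size(sizeof(float), {128, 3}) ==
                128 * 3 * sizeof(float);
  return ok ? 0 : 1;
}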