Amdgpu backend #10

Merged Dec 30, 2022 (13 commits)
5 changes: 3 additions & 2 deletions .github/workflows/testing.yml
@@ -291,7 +291,7 @@ jobs:
. .github/workflows/scripts/common-utils.sh

ci-docker-run-amdgpu --name taichi-build \
- registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+ registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
/home/dev/taichi/.github/workflows/scripts/build.py

env:
@@ -302,6 +302,7 @@ jobs:
-DTI_WITH_VULKAN:BOOL=OFF
-DTI_WITH_OPENGL:BOOL=OFF
-DTI_BUILD_TESTS:BOOL=ON
-DTI_WITH_AMDGPU:BOOL=ON

- name: Test
id: test
@@ -310,7 +311,7 @@
. .github/workflows/scripts/common-utils.sh

ci-docker-run-amdgpu --name taichi-test \
- registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+ registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
/home/dev/taichi/.github/workflows/scripts/unix_test.sh
env:
PY: '3.8'
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -181,6 +181,10 @@ if (TI_WITH_CUDA)
set(CUDA_ARCH "cuda")
endif()

if (TI_WITH_AMDGPU)
set(AMDGPU_ARCH "amdgpu")
endif()

if (TI_WITH_DX12)
set(DX12_ARCH "dx12")
endif()
25 changes: 25 additions & 0 deletions cmake/TaichiCore.cmake
@@ -3,6 +3,7 @@ option(TI_WITH_LLVM "Build with LLVM backends" ON)
option(TI_WITH_METAL "Build with the Metal backend" ON)
option(TI_WITH_CUDA "Build with the CUDA backend" ON)
option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF)
option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF)
option(TI_WITH_OPENGL "Build with the OpenGL backend" ON)
option(TI_WITH_CC "Build with the C backend" ON)
option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF)
@@ -34,6 +35,10 @@ if(ANDROID)
set(TI_WITH_DX12 OFF)
endif()

if (TI_WITH_AMDGPU AND TI_WITH_CUDA)
message(WARNING "Compiling CUDA and AMDGPU backends simultaneously")
endif()

if(UNIX AND NOT APPLE)
# Handy helper for Linux
# https://stackoverflow.com/a/32259072/12003165
@@ -53,13 +58,21 @@ if (APPLE)
set(TI_WITH_CC OFF)
message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.")
endif()
if (TI_WITH_AMDGPU)
set(TI_WITH_AMDGPU OFF)
message(WARNING "AMDGPU backend not supported on OS X. Setting TI_WITH_AMDGPU to OFF.")
endif()
endif()

if (WIN32)
if (TI_WITH_CC)
set(TI_WITH_CC OFF)
message(WARNING "C backend not supported on Windows. Setting TI_WITH_CC to OFF.")
endif()
if (TI_WITH_AMDGPU)
set(TI_WITH_AMDGPU OFF)
message(WARNING "AMDGPU backend not supported on Windows. Setting TI_WITH_AMDGPU to OFF.")
endif()
endif()

if(TI_WITH_VULKAN)
@@ -108,6 +121,12 @@ if (TI_WITH_CUDA)
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CUDA_RUNTIME_SOURCE})
endif()

if (TI_WITH_AMDGPU)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
endif()

if (TI_WITH_DX12)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_DX12")
endif()
@@ -215,6 +234,12 @@ if(TI_WITH_LLVM)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cuda_rhi)
endif()

if (TI_WITH_AMDGPU)
llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU)
add_subdirectory(taichi/rhi/amdgpu)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi)
endif()

if (TI_WITH_DX12)
llvm_map_components_to_libnames(llvm_directx_libs DirectX)

21 changes: 21 additions & 0 deletions taichi/rhi/amdgpu/CMakeLists.txt
@@ -0,0 +1,21 @@
# ./taichi/rhi/amdgpu/CMakeLists.txt

set(AMDGPU_RHI amdgpu_rhi)
add_library(${AMDGPU_RHI})
target_sources(${AMDGPU_RHI}
PRIVATE
amdgpu_device.cpp
amdgpu_caching_allocator.cpp
amdgpu_context.cpp
amdgpu_driver.cpp
)

target_include_directories(${AMDGPU_RHI}
PRIVATE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/external/eigen
${PROJECT_SOURCE_DIR}/external/spdlog/include
${LLVM_INCLUDE_DIRS}
)

target_link_libraries(${AMDGPU_RHI} PRIVATE interop_rhi)
40 changes: 40 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp
@@ -0,0 +1,40 @@
#include "taichi/rhi/amdgpu/amdgpu_caching_allocator.h"

namespace taichi {
namespace lang {
namespace amdgpu {

AmdgpuCachingAllocator::AmdgpuCachingAllocator(LlvmDevice *device)
: device_(device) {
}

uint64_t *AmdgpuCachingAllocator::allocate(
const LlvmDevice::LlvmRuntimeAllocParams &params) {
uint64_t *ret{nullptr};
auto size_aligned = taichi::iroundup(params.size, taichi_page_size);
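// Best-fit lookup: find the smallest cached block that can hold the
// page-aligned request.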
auto it_blk = mem_blocks_.lower_bound(size_aligned);

if (it_blk != mem_blocks_.end()) {
size_t remaining_sz = it_blk->first - size_aligned;
if (remaining_sz > 0) {
TI_ASSERT(remaining_sz % taichi_page_size == 0);
auto remaining_head =
reinterpret_cast<uint8_t *>(it_blk->second) + size_aligned;
mem_blocks_.insert(
{remaining_sz, reinterpret_cast<uint64_t *>(remaining_head)});
}
ret = it_blk->second;
mem_blocks_.erase(it_blk);
} else {
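// No cached block is large enough; fall back to a fresh allocation from
// the LLVM runtime.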
ret = device_->allocate_llvm_runtime_memory_jit(params);
}
return ret;
}

void AmdgpuCachingAllocator::release(size_t sz, uint64_t *ptr) {
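// Cache the freed block for later requests; memory is not returned to the
// device driver here.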
mem_blocks_.insert({sz, ptr});
}

} // namespace amdgpu
} // namespace lang
} // namespace taichi
28 changes: 28 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_caching_allocator.h
@@ -0,0 +1,28 @@
#pragma once

#include "taichi/common/core.h"
#include "taichi/math/arithmetic.h"
#include "taichi/rhi/llvm/llvm_device.h"
#include "taichi/inc/constants.h"
#include <stdint.h>
#include <map>

namespace taichi {
namespace lang {
namespace amdgpu {

class AmdgpuCachingAllocator {
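// Best-fit caching allocator: freed LLVM-runtime buffers are kept in a
// size-ordered pool and reused for subsequent allocations instead of being
// returned to the device.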
public:
AmdgpuCachingAllocator(LlvmDevice *device);

uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams &params);
void release(size_t sz, uint64_t *ptr);

private:
std::multimap<size_t, uint64_t *> mem_blocks_;
LlvmDevice *device_{nullptr};
};

} // namespace amdgpu
} // namespace lang
} // namespace taichi
93 changes: 93 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
@@ -0,0 +1,93 @@
#define TI_RUNTIME_HOST
#include "amdgpu_context.h"

#include <unordered_map>
#include <mutex>
#include <cmath>    // std::pow
#include <cstdlib>  // std::malloc / std::free

#include "taichi/util/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/threading.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/analysis/offline_cache_util.h"

namespace taichi {
namespace lang {

AMDGPUContext::AMDGPUContext()
: driver_(AMDGPUDriver::get_instance_without_context()) {
dev_count_ = 0;
driver_.init(0);
driver_.device_get_count(&dev_count_);
driver_.device_get(&device_, 0);

char name[128];
driver_.device_get_name(name, 128, device_);

TI_TRACE("Using AMDGPU device [id=0]: {}", name);

driver_.context_create(&context_, 0, device_);

const auto GB = std::pow(1024.0, 3.0);
TI_TRACE("Total memory {:.2f} GB; free memory {:.2f} GB",
get_total_memory() / GB, get_free_memory() / GB);

void *hip_device_prop = std::malloc(HIP_DEVICE_PROPERTIES_STRUCT_SIZE);
driver_.device_get_prop(hip_device_prop, device_);
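// The GCN architecture number (e.g. 906) is read from the device-property
// struct at the int offset HIP_DEVICE_GCN_ARCH and formatted below as the
// LLVM target string "gfx<arch>".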
compute_capability_ = *((int *)hip_device_prop + HIP_DEVICE_GCN_ARCH);
std::free(hip_device_prop);

mcpu_ = fmt::format("gfx{}", compute_capability_);

TI_TRACE("Emitting AMDGPU code for {}", mcpu_);
}

std::size_t AMDGPUContext::get_total_memory() {
std::size_t ret, _;
driver_.mem_get_info(&_, &ret);
return ret;
}

std::size_t AMDGPUContext::get_free_memory() {
std::size_t ret, _;
driver_.mem_get_info(&ret, &_);
return ret;
}

std::string AMDGPUContext::get_device_name() {
constexpr uint32_t kMaxNameStringLength = 128;
char name[kMaxNameStringLength];
driver_.device_get_name(name, kMaxNameStringLength /*=128*/, device_);
std::string str(name);
return str;
}

void AMDGPUContext::launch(void *func,
const std::string &task_name,
void *arg_pointers,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes,
int arg_bytes) {
if (grid_dim > 0) {
std::lock_guard<std::mutex> _(lock_);
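// Kernel arguments are passed through HIP's "extra" launch-parameter list;
// the 0x01 / 0x02 / 0x03 markers correspond to HIP_LAUNCH_PARAM_BUFFER_POINTER,
// HIP_LAUNCH_PARAM_BUFFER_SIZE, and HIP_LAUNCH_PARAM_END.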
void *config[] = {(void *)0x01, const_cast<void *>(arg_pointers),
(void *)0x02, &arg_bytes, (void *)0x03};
driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1,
dynamic_shared_mem_bytes, nullptr, nullptr,
reinterpret_cast<void **>(&config));
}
if (debug_) {
driver_.stream_synchronize(nullptr);
}
}

AMDGPUContext::~AMDGPUContext() {
}

AMDGPUContext &AMDGPUContext::get_instance() {
static auto context = new AMDGPUContext();
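// Heap-allocated and never deleted, so the context stays valid until
// process exit.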
return *context;
}

} // namespace lang
} // namespace taichi
99 changes: 99 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.h
@@ -0,0 +1,99 @@
#pragma once

#include <mutex>
#include <unordered_map>
#include <thread>

#include "taichi/program/kernel_profiler.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"

namespace taichi {
namespace lang {

class AMDGPUDriver;

class AMDGPUContext {
private:
void *device_;
void *context_;
int dev_count_;
int compute_capability_;
std::string mcpu_;
std::mutex lock_;
AMDGPUDriver &driver_;
bool debug_{false};

public:
AMDGPUContext();

std::size_t get_total_memory();
std::size_t get_free_memory();
std::string get_device_name();

bool detected() const {
return dev_count_ != 0;
}

void launch(void *func,
const std::string &task_name,
void *arg_pointers,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes,
int arg_bytes);

void set_debug(bool debug) {
debug_ = debug;
}

std::string get_mcpu() const {
return mcpu_;
}

void *get_context() {
return context_;
}

void make_current() {
driver_.context_set_current(context_);
}

int get_compute_capability() const {
return compute_capability_;
}

~AMDGPUContext();

class ContextGuard {
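// RAII guard: records the currently active HIP context, makes the given
// context current, and restores the previous one on destruction.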
private:
void *old_ctx_;
void *new_ctx_;

public:
ContextGuard(AMDGPUContext *new_ctx)
: old_ctx_(nullptr), new_ctx_(new_ctx) {
AMDGPUDriver::get_instance().context_get_current(&old_ctx_);
if (old_ctx_ != new_ctx)
new_ctx->make_current();
}

~ContextGuard() {
if (old_ctx_ != new_ctx_) {
AMDGPUDriver::get_instance().context_set_current(old_ctx_);
}
}
};

ContextGuard get_guard() {
return ContextGuard(this);
}

std::unique_lock<std::mutex> get_lock_guard() {
return std::unique_lock<std::mutex>(lock_);
}

static AMDGPUContext &get_instance();
};

} // namespace lang
} // namespace taichi