Skip to content

Commit

Permalink
Refactor CPU module (#1268)
Browse files Browse the repository at this point in the history
* Move threads spawning from cpu_module to runtime

* Add thread affinity

* Disable macos thread affinity

* Fix macos thread affinity

* Change nncase_cpu_runtime to OBJECT

* Link pthread

* Init cpu_mt

* Fix win

* Add cid

* Use pthread key on Mac

* Fix win&linux

* Apply code-format changes

---------

Co-authored-by: sunnycase <sunnycase@users.noreply.github.com>
  • Loading branch information
sunnycase and sunnycase authored Nov 15, 2024
1 parent daf719c commit ccf4387
Show file tree
Hide file tree
Showing 30 changed files with 462 additions and 323 deletions.
4 changes: 2 additions & 2 deletions modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ public static string TopoAwareRuntimeDef(CpuTargetOptions options, ulong dataAli

public static string CMakeDef(string name)
{
var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "cpu_runtime.cmake"));
var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake"));
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result;
return content;
}

public static string MakeMain(TIR.PrimFunction primFunction, ulong dataAlign, ulong dataUsage, ulong rdataPoolSize, IEnumerable<TIR.Buffer> rdataBuffers, CpuTargetOptions options)
{
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result;
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/thread_main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result;
return content;
}

Expand Down
4 changes: 1 addition & 3 deletions modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,7 @@ private void ArchSpecific()
private string ArgumentsSpecific(string sourcePath, string outPath)
{
var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" :
RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? "-DBUILD_SHARED=ON" :
string.Empty;
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;

#if DEBUG
var config = "Debug";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ public static string[] ToSlicing(this IEnumerable<string> dims, string[] begins,
dimi = dimi[(s + 1)..e].Trim();
}

begins[i] += " + " + sp.Skip(1).Aggregate($"{placement.Name[sp[0].H]}id", (acc, p) => $"({acc} + {TensorUtilities.GetProduct(splitHierarchy[i].ToArray().AsSpan()[(p.H + 1)..])} * {placement.Name[p.H]}id)") + $" * {dimi}";
begins[i] += " + " + sp.Skip(1).Aggregate($"{placement.Name[sp[0].H]}id()", (acc, p) => $"({acc} + {TensorUtilities.GetProduct(splitHierarchy[i].ToArray().AsSpan()[(p.H + 1)..])} * {placement.Name[p.H]}id())") + $" * {dimi}";
}
}

Expand Down
19 changes: 13 additions & 6 deletions modules/Nncase.Modules.CPU/CodeGen/CPU/FunctionBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ public unsafe ILinkableFunction Build(TIR.PrimFunction function)
using (var writer = _sectionManager.GetWriter(KernelHeaderSectionName))
{
var header = default(DescHeader);
header.DataPoolSize = function.SchedResult.DataUsage;
header.DataAlign = function.SchedResult.DataAlign;
header.ThreadDim = (uint)TargetOptions.Hierarchies[0][^1];
header.BlockDim = TargetOptions.Hierarchies[0].Length < 2 ? 1 : (uint)TargetOptions.Hierarchies[0][^2];
header.ChipDim = TargetOptions.Hierarchies[0].Length < 3 ? 1 : (uint)TargetOptions.Hierarchies[0][^3];
writer.Write(ref header);
}

Expand Down Expand Up @@ -82,10 +83,16 @@ public unsafe ILinkableFunction Build(TIR.PrimFunction function)
[StructLayout(LayoutKind.Sequential)]
private unsafe struct DescHeader
{
[MarshalAs(UnmanagedType.U8)]
public ulong DataPoolSize;
[MarshalAs(UnmanagedType.U4)]
public uint ThreadDim;

[MarshalAs(UnmanagedType.U8)]
public ulong DataAlign;
[MarshalAs(UnmanagedType.U4)]
public uint BlockDim;

[MarshalAs(UnmanagedType.U4)]
public uint ChipDim;

[MarshalAs(UnmanagedType.U4)]
public uint Reserved0;
}
}
6 changes: 2 additions & 4 deletions modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public ILinkedModule Link(ILinkContext linkContext)
Directory.CreateDirectory(dumpPath);
}

using (var fs = File.Open(Path.Join(dumpPath, "main.cpp"), FileMode.Create))
using (var fs = File.Open(Path.Join(dumpPath, "thread_main.cpp"), FileMode.Create))
{
using (var writer = new StreamWriter(fs))
{
Expand Down Expand Up @@ -112,9 +112,7 @@ public ILinkedModule Link(ILinkContext linkContext)
private string CompileCSource(string sourcePath)
{
var compiler = new CSourceCompiler();
var binDir = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
? Path.Join(sourcePath, "build", "nncase_cpu_module.exe")
: Path.Join(sourcePath, "build", "nncase_cpu_module");
var binDir = Path.Join(sourcePath, "build", "nncase_ntt_module");
return compiler.Compile(sourcePath, binDir);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,4 @@ endif()

include(@Html.Raw(Model.CMakePath))

if(NOT MSVC AND NOT APPLE AND BUILD_SHARED)
add_library(nncase_cpu_module SHARED main.cpp)
set_target_properties(nncase_cpu_module PROPERTIES PREFIX "" SUFFIX "")
set_target_properties(nncase_cpu_runtime PROPERTIES POSITION_INDEPENDENT_CODE ON)
else()
add_executable(nncase_cpu_module main.cpp)
endif()
target_compile_features(nncase_cpu_module PUBLIC cxx_std_20)
target_link_libraries(nncase_cpu_module PRIVATE nncase_cpu_runtime)
target_compile_definitions(nncase_cpu_module PUBLIC -DNNCASE_CPU_MODULE=1)

if (MSVC)
set_target_properties(nncase_cpu_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE)
target_link_options(nncase_cpu_module PRIVATE /ENTRY:kernel_entry /NODEFAULTLIB)
target_link_libraries(nncase_cpu_module PRIVATE libvcruntime msvcrt ucrt libcpmt)
set_property(TARGET nncase_cpu_module PROPERTY
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
else()
if (APPLE)
if(BUILD_STANDALONE)
target_link_options(nncase_cpu_module PRIVATE -ld_classic -lc)
else()
target_link_options(nncase_cpu_module PRIVATE -static)
target_link_options(nncase_cpu_module PRIVATE -e _kernel_entry -bundle -ld_classic -lc)
target_compile_options(nncase_cpu_module PRIVATE "$<$<CONFIG:Debug>:-O1>")
endif(BUILD_STANDALONE)
else()
if (BUILD_SHARED)
target_link_options(nncase_cpu_module PRIVATE -e kernel_entry)
else()
if(NOT BUILD_STANDALONE)
target_link_options(nncase_cpu_module PRIVATE -static)
target_link_options(nncase_cpu_module PRIVATE -e kernel_entry -nostdlib)
endif(NOT BUILD_STANDALONE)
endif()
target_link_libraries(nncase_cpu_module PRIVATE gcc)
endif()
endif()
target_sources(nncase_ntt_module PRIVATE thread_main.cpp)
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,25 @@
var inputCount = Model.PrimFunction.Parameters.Length;
}

#include <nncase/ntt/cpu_runtime.h>
#include <nncase/ntt/runtime.h>
#include "topo_aware_runtime.h"
#include "../device.h"
@foreach(var (s,i) in Model.Options.MemoryCapacities.Select((s,i) => (s,i)).Skip(1).SkipLast(1)){
@:uint8_t L@(i)Data[@(s)];
}
#include "kernel.h"

extern "C" void kernel_entry(nncase_runtime_cpu_mt_t *cpu_mt, uint8_t **inputs, uint8_t *rdata) {
g_cpu_mt = cpu_mt;
//alignas(@(Model.Alignment)) static thread_local uint8_t local_data[@(Model.DataSize)];

extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata) {
/* prepare inputs */
@{
var names = new List<string>();
}
@foreach(var (b,i) in Model.PrimFunction.Parameters.ToArray().OfType<Nncase.TIR.Buffer>().Select((b,i)=>(Model.GetInfo(b),i)))
{
names.Add(b.Name);
@:std::span<@Html.Raw(b.ElemType), @b.Size> p@(b.Name)((@Html.Raw(b.ElemType) *)inputs[@i], @b.Size);
@:std::span<@Html.Raw(b.ElemType), @b.Size> p@(b.Name)((@Html.Raw(b.ElemType) *)inouts[@i], @b.Size);
@:tensor_view<@Html.Raw(b.ElemType), @Html.Raw(b.Dimensions), @Html.Raw(b.Strides)> @(b.Name)(p@(b.Name));
@:
}
Expand All @@ -39,34 +40,10 @@ extern "C" void kernel_entry(nncase_runtime_cpu_mt_t *cpu_mt, uint8_t **inputs,
@if (Model.Options.Hierarchies.Length > 1) {
throw new NotSupportedException($"not support multi form topology!");
}
@if (Model.Options.Hierarchies[0].Any(h => h != 1)) {
var hierarchy = Model.Options.Hierarchies[0];

@:/* prepare wrapped kernel */
@:auto wrapped_kernel = [&](@(string.Join(", ", Model.Options.HierarchyNames.Select(c => "size_t cur_" + c + "id")))) {
foreach (var c in Model.Options.HierarchyNames) {
@:@(c)id = cur_@(c)id;
}
@:uint8_t *data = (uint8_t *)cpu_mt->local_alloc(@(Model.DataSize), @(Model.Alignment));
@:@(Model.PrimFunction.Name)(@(string.Join(", ", names)), data);
@:cpu_mt->local_free(data);
@:};

@:/* invoke kernels */
int count = 0;
@foreach(var index in hierarchy.Select(i => Enumerable.Range(0, i)).CartesianProduct().Select(arr => arr.ToArray()))
{
@:std::thread t@(count++)(wrapped_kernel, @(string.Join(",", index)));
}
for (int i = 0; i < count; i++) {
@:t@(i).join();
}
} else {
@:/* invoke kernel */
@:uint8_t *data = (uint8_t *)cpu_mt->local_alloc(@(Model.DataSize), @(Model.Alignment));
@:@(Model.PrimFunction.Name)(@(string.Join(", ", names)), data);
@:cpu_mt->local_free(data);
}
auto local_data = (uint8_t *)nncase::ntt::runtime::thread_alloc(@Model.DataSize, @Model.Alignment);
@(Model.PrimFunction.Name)(@(string.Join(", ", names)), local_data);
nncase::ntt::runtime::thread_free(local_data);
}

#ifdef NNCASE_STANDALONE
Expand All @@ -78,7 +55,7 @@ static void *local_alloc(size_t bytes, size_t alignment) {
#else
size_t mask = alignment - 1;
size_t aligned_bytes = bytes + (-bytes & mask);
return aligned_alloc(alignment, bytes);
return aligned_alloc(alignment, aligned_bytes);
#endif
}

Expand Down Expand Up @@ -110,8 +87,6 @@ static nncase_runtime_cpu_mt_t nncase_cpu_mt_ = {
.sqrtf = sqrtf,
.tanhf = tanhf,
.sram_address = nullptr,
.local_alloc = local_alloc,
.local_free = local_free,
.failfast = nullptr,

#ifndef WIN32
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@
#include <thread>
#include <barrier>

@foreach (var c in Model.Options.HierarchyNames)
{
@:thread_local size_t @(c)id = 0;
}

/**
* @@brief topology aware runtime
*
Expand Down Expand Up @@ -112,7 +107,7 @@ class tensor_reduce_sync_impl {
public:
void reduce_group_sync() const noexcept {
@foreach(var comb in combinations) {
var reduce_group_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ? "0" : hierarchyNames[i] + "id"));
var reduce_group_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ? "0" : "ntt::" + hierarchyNames[i] + "id()"));
@:if constexpr (Kind == tar::reduce_kind::@(GetName(comb, string.Empty))) {
@: tar::@(GetName(comb))(@(reduce_group_index)).arrive_and_wait();
@:}
Expand Down Expand Up @@ -156,7 +151,7 @@ class tensor_reduce_sync_impl {
}

@{
var cur_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => hierarchyNames[i] + "id"));
var cur_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => "ntt::" + hierarchyNames[i] + "id()"));
}

template <class TIn, class TOut> void operator()(TIn &src, TOut &&dest) {
Expand Down
2 changes: 1 addition & 1 deletion modules/Nncase.Modules.CPU/Targets/CPUTargetOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public class CpuTargetOptions : ICpuTargetOptions
[DisplayName("--hierarchy-names")]
[Description("the name identify of hierarchies.")]
[DefaultValue("b")]
public string HierarchyNames { get; set; } = "b";
public string HierarchyNames { get; set; } = "t";

[DisplayName("--hierarchy-sizes")]
[Description("the memory capacity of hierarchies.")]
Expand Down
12 changes: 0 additions & 12 deletions ntt/cmake/cpu_runtime.cmake

This file was deleted.

34 changes: 34 additions & 0 deletions ntt/cmake/ntt_module.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Build description for the `nncase_ntt_module` target.
#
# This file is meant to be pulled in with include(); it creates the target
# from the runtime sources that live next to this directory. The including
# project is expected to attach its own sources to the target afterwards
# (for example with target_sources()).
#
# Inputs:
#   BUILD_STANDALONE - when true the module is built as an executable and
#                      NNCASE_STANDALONE=1 is defined; otherwise it is built
#                      as a shared library.
cmake_minimum_required(VERSION 3.15)

include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake)

# Create the target. Both flavours are built from the same runtime sources;
# only the kind of artifact differs.
if(BUILD_STANDALONE)
  add_executable(nncase_ntt_module
    ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp
    ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp)
else()
  add_library(nncase_ntt_module SHARED
    ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp
    ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp)
endif()

# Usage requirements shared by every configuration.
target_include_directories(nncase_ntt_module
  PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include)
target_compile_features(nncase_ntt_module PUBLIC cxx_std_20)
target_compile_definitions(nncase_ntt_module
  PUBLIC
    NNCASE_CPU_MODULE=1
    # Only defined when BUILD_STANDALONE evaluates to true.
    $<$<BOOL:${BUILD_STANDALONE}>:NNCASE_STANDALONE=1>)

# Empty PREFIX/SUFFIX make the output file name exactly "nncase_ntt_module"
# (no "lib" prefix and no platform extension).
set_target_properties(nncase_ntt_module PROPERTIES
  PREFIX ""
  SUFFIX ""
  POSITION_INDEPENDENT_CODE ON)

# Platform specific link configuration.
if(MSVC)
  # Select the MultiThreaded / MultiThreadedDebug runtime and request the
  # console subsystem.
  set_target_properties(nncase_ntt_module PROPERTIES
    MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
    LINK_FLAGS /SUBSYSTEM:CONSOLE)
  # Default libraries are disabled, so the runtime libraries are listed
  # explicitly; Debug builds use the "d"-suffixed variants.
  target_link_options(nncase_ntt_module PRIVATE /NODEFAULTLIB)
  target_link_libraries(nncase_ntt_module
    PRIVATE
      "libvcruntime$<$<CONFIG:Debug>:d>"
      "msvcrt$<$<CONFIG:Debug>:d>"
      "ucrt$<$<CONFIG:Debug>:d>"
      "libcpmt$<$<CONFIG:Debug>:d>")
elseif(APPLE)
  # NOTE(review): -ld_classic presumably selects Apple's classic linker;
  # confirm it is still required with the toolchains in use.
  target_link_options(nncase_ntt_module PRIVATE -ld_classic -lc)
endif()
57 changes: 0 additions & 57 deletions ntt/include/nncase/ntt/cpu_runtime.h

This file was deleted.

Loading

0 comments on commit ccf4387

Please sign in to comment.