From ccf4387ecc37bfc55c7d3d1f1ac890c1937debc7 Mon Sep 17 00:00:00 2001 From: sunnycase Date: Thu, 14 Nov 2024 17:11:21 -0800 Subject: [PATCH] Refactor CPU module (#1268) * Move threads spawning from cpu_module to runtime * Add thread affinity * Disable macos thread affinity * Fix macos thread affinity * Change nncase_cpu_runtime to OBJECT * Link pthread * Init cpu_mt * Fix win * Add cid * Use pthread key on Mac * Fix win&linux * Apply code-format changes --------- Co-authored-by: sunnycase --- .../CodeGen/CPU/CSourceBuiltn.cs | 4 +- .../CodeGen/CPU/CSourceCompiler.cs | 4 +- .../CodeGen/CPU/CSourceExtensions.cs | 2 +- .../CodeGen/CPU/FunctionBuilder.cs | 19 ++- .../CodeGen/CPU/LinkableModule.cs | 6 +- .../CPU/Templates/CMakeLists.txt.cshtml | 39 +---- ...main.cpp.cshtml => thread_main.cpp.cshtml} | 43 +---- .../CPU/Templates/topo_aware_runtime.cshtml | 9 +- .../Targets/CPUTargetOptions.cs | 2 +- ntt/cmake/cpu_runtime.cmake | 12 -- ntt/cmake/ntt_module.cmake | 34 ++++ ntt/include/nncase/ntt/cpu_runtime.h | 57 ------- ntt/include/nncase/ntt/distributed.h | 39 +++++ ntt/include/nncase/ntt/ntt.h | 3 + ntt/include/nncase/ntt/runtime.h | 30 ++++ ntt/include/nncase/ntt/runtime/cpu_runtime.h | 77 +++++++++ ntt/src/cpu_runtime.cpp | 155 +++++++++++------- ntt/src/dummy.cpp | 0 src/Native/include/nncase/io_utils.h | 8 +- src/Native/src/runtime/CMakeLists.txt | 2 +- .../runtime/cpu/loaders/elf/elf_loader.cpp | 2 +- .../cpu/loaders/macho/macho_loader.cpp | 45 ++++- .../runtime/cpu/loaders/macho/macho_loader.h | 11 +- .../src/runtime/cpu/loaders/pe/pe_loader.cpp | 17 +- .../src/runtime/cpu/loaders/pe/pe_loader.h | 5 +- .../src/runtime/cpu/runtime_function.cpp | 17 +- src/Native/src/runtime/cpu/runtime_function.h | 14 +- .../src/runtime/cpu/runtime_function.run.cpp | 97 ++++------- src/Native/src/runtime/cpu/runtime_module.cpp | 15 ++ src/Native/src/runtime/cpu/runtime_module.h | 17 ++ 30 files changed, 462 insertions(+), 323 deletions(-) rename 
modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/{main.cpp.cshtml => thread_main.cpp.cshtml} (67%) delete mode 100644 ntt/cmake/cpu_runtime.cmake create mode 100644 ntt/cmake/ntt_module.cmake delete mode 100644 ntt/include/nncase/ntt/cpu_runtime.h create mode 100644 ntt/include/nncase/ntt/distributed.h create mode 100644 ntt/include/nncase/ntt/runtime.h create mode 100644 ntt/include/nncase/ntt/runtime/cpu_runtime.h create mode 100644 ntt/src/dummy.cpp diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs index de1c611b7b..e03b779f14 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs @@ -55,14 +55,14 @@ public static string TopoAwareRuntimeDef(CpuTargetOptions options, ulong dataAli public static string CMakeDef(string name) { - var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "cpu_runtime.cmake")); + var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake")); var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result; return content; } public static string MakeMain(TIR.PrimFunction primFunction, ulong dataAlign, ulong dataUsage, ulong rdataPoolSize, IEnumerable rdataBuffers, CpuTargetOptions options) { - var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result; + var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/thread_main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result; return content; } diff --git 
a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs index 9506fda5c1..0f2fd57cae 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs @@ -169,9 +169,7 @@ private void ArchSpecific() private string ArgumentsSpecific(string sourcePath, string outPath) { var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? - "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : - RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? "-DBUILD_SHARED=ON" : - string.Empty; + "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty; #if DEBUG var config = "Debug"; diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs index 5c2094b59e..4d595bba7a 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs @@ -113,7 +113,7 @@ public static string[] ToSlicing(this IEnumerable dims, string[] begins, dimi = dimi[(s + 1)..e].Trim(); } - begins[i] += " + " + sp.Skip(1).Aggregate($"{placement.Name[sp[0].H]}id", (acc, p) => $"({acc} + {TensorUtilities.GetProduct(splitHierarchy[i].ToArray().AsSpan()[(p.H + 1)..])} * {placement.Name[p.H]}id)") + $" * {dimi}"; + begins[i] += " + " + sp.Skip(1).Aggregate($"{placement.Name[sp[0].H]}id()", (acc, p) => $"({acc} + {TensorUtilities.GetProduct(splitHierarchy[i].ToArray().AsSpan()[(p.H + 1)..])} * {placement.Name[p.H]}id())") + $" * {dimi}"; } } diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/FunctionBuilder.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/FunctionBuilder.cs index 56d635ff21..67eef0a1b8 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/FunctionBuilder.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/FunctionBuilder.cs @@ -40,8 +40,9 @@ public unsafe ILinkableFunction Build(TIR.PrimFunction 
function) using (var writer = _sectionManager.GetWriter(KernelHeaderSectionName)) { var header = default(DescHeader); - header.DataPoolSize = function.SchedResult.DataUsage; - header.DataAlign = function.SchedResult.DataAlign; + header.ThreadDim = (uint)TargetOptions.Hierarchies[0][^1]; + header.BlockDim = TargetOptions.Hierarchies[0].Length < 2 ? 1 : (uint)TargetOptions.Hierarchies[0][^2]; + header.ChipDim = TargetOptions.Hierarchies[0].Length < 3 ? 1 : (uint)TargetOptions.Hierarchies[0][^3]; writer.Write(ref header); } @@ -82,10 +83,16 @@ public unsafe ILinkableFunction Build(TIR.PrimFunction function) [StructLayout(LayoutKind.Sequential)] private unsafe struct DescHeader { - [MarshalAs(UnmanagedType.U8)] - public ulong DataPoolSize; + [MarshalAs(UnmanagedType.U4)] + public uint ThreadDim; - [MarshalAs(UnmanagedType.U8)] - public ulong DataAlign; + [MarshalAs(UnmanagedType.U4)] + public uint BlockDim; + + [MarshalAs(UnmanagedType.U4)] + public uint ChipDim; + + [MarshalAs(UnmanagedType.U4)] + public uint Reserved0; } } diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs index 474f35f86f..1c24067338 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs @@ -56,7 +56,7 @@ public ILinkedModule Link(ILinkContext linkContext) Directory.CreateDirectory(dumpPath); } - using (var fs = File.Open(Path.Join(dumpPath, "main.cpp"), FileMode.Create)) + using (var fs = File.Open(Path.Join(dumpPath, "thread_main.cpp"), FileMode.Create)) { using (var writer = new StreamWriter(fs)) { @@ -112,9 +112,7 @@ public ILinkedModule Link(ILinkContext linkContext) private string CompileCSource(string sourcePath) { var compiler = new CSourceCompiler(); - var binDir = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) - ? 
Path.Join(sourcePath, "build", "nncase_cpu_module.exe") - : Path.Join(sourcePath, "build", "nncase_cpu_module"); + var binDir = Path.Join(sourcePath, "build", "nncase_ntt_module"); return compiler.Compile(sourcePath, binDir); } } diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml index ae7411a708..df93b610c6 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml @@ -12,41 +12,4 @@ endif() include(@Html.Raw(Model.CMakePath)) -if(NOT MSVC AND NOT APPLE AND BUILD_SHARED) - add_library(nncase_cpu_module SHARED main.cpp) - set_target_properties(nncase_cpu_module PROPERTIES PREFIX "" SUFFIX "") - set_target_properties(nncase_cpu_runtime PROPERTIES POSITION_INDEPENDENT_CODE ON) -else() - add_executable(nncase_cpu_module main.cpp) -endif() -target_compile_features(nncase_cpu_module PUBLIC cxx_std_20) -target_link_libraries(nncase_cpu_module PRIVATE nncase_cpu_runtime) -target_compile_definitions(nncase_cpu_module PUBLIC -DNNCASE_CPU_MODULE=1) - -if (MSVC) - set_target_properties(nncase_cpu_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE) - target_link_options(nncase_cpu_module PRIVATE /ENTRY:kernel_entry /NODEFAULTLIB) - target_link_libraries(nncase_cpu_module PRIVATE libvcruntime msvcrt ucrt libcpmt) - set_property(TARGET nncase_cpu_module PROPERTY - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") -else() - if (APPLE) - if(BUILD_STANDALONE) - target_link_options(nncase_cpu_module PRIVATE -ld_classic -lc) - else() - target_link_options(nncase_cpu_module PRIVATE -static) - target_link_options(nncase_cpu_module PRIVATE -e _kernel_entry -bundle -ld_classic -lc) - target_compile_options(nncase_cpu_module PRIVATE "$<$:-O1>") - endif(BUILD_STANDALONE) - else() - if (BUILD_SHARED) - target_link_options(nncase_cpu_module PRIVATE -e kernel_entry) - else() - if(NOT 
BUILD_STANDALONE) - target_link_options(nncase_cpu_module PRIVATE -static) - target_link_options(nncase_cpu_module PRIVATE -e kernel_entry -nostdlib) - endif(NOT BUILD_STANDALONE) - endif() - target_link_libraries(nncase_cpu_module PRIVATE gcc) - endif() -endif() +target_sources(nncase_ntt_module PRIVATE thread_main.cpp) diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml similarity index 67% rename from modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml rename to modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml index 15498bff2c..ee6a8f5b7e 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml @@ -6,7 +6,7 @@ var inputCount = Model.PrimFunction.Parameters.Length; } -#include +#include #include "topo_aware_runtime.h" #include "../device.h" @foreach(var (s,i) in Model.Options.MemoryCapacities.Select((s,i) => (s,i)).Skip(1).SkipLast(1)){ @@ -14,8 +14,9 @@ } #include "kernel.h" -extern "C" void kernel_entry(nncase_runtime_cpu_mt_t *cpu_mt, uint8_t **inputs, uint8_t *rdata) { - g_cpu_mt = cpu_mt; +//alignas(@(Model.Alignment)) static thread_local uint8_t local_data[@(Model.DataSize)]; + +extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata) { /* prepare inputs */ @{ var names = new List(); @@ -23,7 +24,7 @@ extern "C" void kernel_entry(nncase_runtime_cpu_mt_t *cpu_mt, uint8_t **inputs, @foreach(var (b,i) in Model.PrimFunction.Parameters.ToArray().OfType().Select((b,i)=>(Model.GetInfo(b),i))) { names.Add(b.Name); - @:std::span<@Html.Raw(b.ElemType), @b.Size> p@(b.Name)((@Html.Raw(b.ElemType) *)inputs[@i], @b.Size); + @:std::span<@Html.Raw(b.ElemType), @b.Size> p@(b.Name)((@Html.Raw(b.ElemType) *)inouts[@i], @b.Size); @:tensor_view<@Html.Raw(b.ElemType), @Html.Raw(b.Dimensions), @Html.Raw(b.Strides)> 
@(b.Name)(p@(b.Name)); @: } @@ -39,34 +40,10 @@ extern "C" void kernel_entry(nncase_runtime_cpu_mt_t *cpu_mt, uint8_t **inputs, @if (Model.Options.Hierarchies.Length > 1) { throw new NotSupportedException($"not support multi form topology!"); } - @if (Model.Options.Hierarchies[0].Any(h => h != 1)) { - var hierarchy = Model.Options.Hierarchies[0]; - @:/* prepare wrapped kernel */ - @:auto wrapped_kernel = [&](@(string.Join(", ", Model.Options.HierarchyNames.Select(c => "size_t cur_" + c + "id")))) { - foreach (var c in Model.Options.HierarchyNames) { - @:@(c)id = cur_@(c)id; - } - @:uint8_t *data = (uint8_t *)cpu_mt->local_alloc(@(Model.DataSize), @(Model.Alignment)); - @:@(Model.PrimFunction.Name)(@(string.Join(", ", names)), data); - @:cpu_mt->local_free(data); - @:}; - - @:/* invoke kernels */ - int count = 0; - @foreach(var index in hierarchy.Select(i => Enumerable.Range(0, i)).CartesianProduct().Select(arr => arr.ToArray())) - { - @:std::thread t@(count++)(wrapped_kernel, @(string.Join(",", index))); - } - for (int i = 0; i < count; i++) { - @:t@(i).join(); - } - } else { - @:/* invoke kernel */ - @:uint8_t *data = (uint8_t *)cpu_mt->local_alloc(@(Model.DataSize), @(Model.Alignment)); - @:@(Model.PrimFunction.Name)(@(string.Join(", ", names)), data); - @:cpu_mt->local_free(data); - } + auto local_data = (uint8_t *)nncase::ntt::runtime::thread_alloc(@Model.DataSize, @Model.Alignment); + @(Model.PrimFunction.Name)(@(string.Join(", ", names)), local_data); + nncase::ntt::runtime::thread_free(local_data); } #ifdef NNCASE_STANDALONE @@ -78,7 +55,7 @@ static void *local_alloc(size_t bytes, size_t alignment) { #else size_t mask = alignment - 1; size_t aligned_bytes = bytes + (-bytes & mask); - return aligned_alloc(alignment, bytes); + return aligned_alloc(alignment, aligned_bytes); #endif } @@ -110,8 +87,6 @@ static nncase_runtime_cpu_mt_t nncase_cpu_mt_ = { .sqrtf = sqrtf, .tanhf = tanhf, .sram_address = nullptr, - .local_alloc = local_alloc, - .local_free = 
local_free, .failfast = nullptr, #ifndef WIN32 diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml index c67cfc984c..b5cb2e6979 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml @@ -18,11 +18,6 @@ #include #include -@foreach (var c in Model.Options.HierarchyNames) -{ -@:thread_local size_t @(c)id = 0; -} - /** * @@brief topology aware runtime * @@ -112,7 +107,7 @@ class tensor_reduce_sync_impl { public: void reduce_group_sync() const noexcept { @foreach(var comb in combinations) { - var reduce_group_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ? "0" : hierarchyNames[i] + "id")); + var reduce_group_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ? "0" : "ntt::" + hierarchyNames[i] + "id()")); @:if constexpr (Kind == tar::reduce_kind::@(GetName(comb, string.Empty))) { @: tar::@(GetName(comb))(@(reduce_group_index)).arrive_and_wait(); @:} @@ -156,7 +151,7 @@ class tensor_reduce_sync_impl { } @{ - var cur_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => hierarchyNames[i] + "id")); + var cur_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => "ntt::" + hierarchyNames[i] + "id()")); } template void operator()(TIn &src, TOut &&dest) { diff --git a/modules/Nncase.Modules.CPU/Targets/CPUTargetOptions.cs b/modules/Nncase.Modules.CPU/Targets/CPUTargetOptions.cs index e131819b32..3959a13fa6 100644 --- a/modules/Nncase.Modules.CPU/Targets/CPUTargetOptions.cs +++ b/modules/Nncase.Modules.CPU/Targets/CPUTargetOptions.cs @@ -68,7 +68,7 @@ public class CpuTargetOptions : ICpuTargetOptions [DisplayName("--hierarchy-names")] [Description("the name identify of hierarchies.")] [DefaultValue("b")] - public string 
HierarchyNames { get; set; } = "b"; + public string HierarchyNames { get; set; } = "t"; [DisplayName("--hierarchy-sizes")] [Description("the memory capacity of hierarchies.")] diff --git a/ntt/cmake/cpu_runtime.cmake b/ntt/cmake/cpu_runtime.cmake deleted file mode 100644 index dd17b7f019..0000000000 --- a/ntt/cmake/cpu_runtime.cmake +++ /dev/null @@ -1,12 +0,0 @@ -cmake_minimum_required(VERSION 3.15) - -include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake) - -add_library(nncase_cpu_runtime STATIC ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp) -target_compile_features(nncase_cpu_runtime PUBLIC cxx_std_20) -target_include_directories(nncase_cpu_runtime PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include) - -if (MSVC) - set_property(TARGET nncase_cpu_runtime PROPERTY - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") -endif() diff --git a/ntt/cmake/ntt_module.cmake b/ntt/cmake/ntt_module.cmake new file mode 100644 index 0000000000..f34e21e4ce --- /dev/null +++ b/ntt/cmake/ntt_module.cmake @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.15) + +include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake) + +if (BUILD_STANDALONE) + add_executable(nncase_ntt_module ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp) +else() + add_library(nncase_ntt_module SHARED ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp) +endif() + +target_compile_features(nncase_ntt_module PUBLIC cxx_std_20) +target_include_directories(nncase_ntt_module PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include) +set_target_properties(nncase_ntt_module PROPERTIES PREFIX "" SUFFIX "") +set_target_properties(nncase_ntt_module PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_CPU_MODULE=1) + +target_sources(nncase_ntt_module PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp) + +if (BUILD_STANDALONE) + target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_STANDALONE=1) +endif() + +if (MSVC) + set_property(TARGET nncase_ntt_module PROPERTY + MSVC_RUNTIME_LIBRARY 
"MultiThreaded$<$:Debug>") + set_target_properties(nncase_ntt_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE) + target_link_options(nncase_ntt_module PRIVATE /NODEFAULTLIB) + target_link_libraries(nncase_ntt_module PRIVATE "libvcruntime$<$:d>" + "msvcrt$<$:d>" + "ucrt$<$:d>" + "libcpmt$<$:d>") +elseif(APPLE) + target_link_options(nncase_ntt_module PRIVATE -ld_classic -lc) +endif() diff --git a/ntt/include/nncase/ntt/cpu_runtime.h b/ntt/include/nncase/ntt/cpu_runtime.h deleted file mode 100644 index 7297404b6b..0000000000 --- a/ntt/include/nncase/ntt/cpu_runtime.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright 2019-2021 Canaan Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include - -extern "C" { -struct nncase_runtime_cpu_mt_t { - float (*acosf)(float v); - float (*acoshf)(float v); - float (*asinf)(float v); - float (*asinhf)(float v); - float (*copysignf)(float mag, float sgn); - float (*cosf)(float v); - float (*coshf)(float v); - float (*erff)(float v); - float (*expf)(float v); - float (*fmodf)(float x, float y); - float (*logf)(float v); - float (*nearbyintf)(float v); - float (*powf)(float x, float y); - float (*roundf)(float v); - float (*sinf)(float v); - float (*sinhf)(float v); - float (*sqrtf)(float v); - float (*tanhf)(float v); - - uint8_t *(*sram_address)(int bid, int tid); - void *(*local_alloc)(size_t bytes, size_t alignment); - void (*local_free)(void *ptr); - - void (*failfast)(const char *format, va_list args); - -#ifndef WIN32 - void *(*memcpy)(void *dst, const void *src, size_t len); - void *(*memmove)(void *dst, const void *src, size_t len); - void *(*memset)(void *b, int c, size_t len); -#endif -}; - -#ifdef NNCASE_CPU_MODULE -extern nncase_runtime_cpu_mt_t *g_cpu_mt; -#endif -} diff --git a/ntt/include/nncase/ntt/distributed.h b/ntt/include/nncase/ntt/distributed.h new file mode 100644 index 0000000000..436299bc38 --- /dev/null +++ b/ntt/include/nncase/ntt/distributed.h @@ -0,0 +1,39 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "tensor.h" +#include + +namespace nncase::ntt { +template struct program_id_getter { + static size_t id() noexcept; + static size_t dim() noexcept; +}; + +template size_t program_id() noexcept { + return program_id_getter::id(); +} + +template size_t program_dim() noexcept { + return program_id_getter::dim(); +} + +inline size_t tid() noexcept { return program_id<0>(); } +inline size_t tdim() noexcept { return program_dim<0>(); } +inline size_t bid() noexcept { return program_id<1>(); } +inline size_t bdim() noexcept { return program_dim<1>(); } +inline size_t cid() noexcept { return program_id<2>(); } +inline size_t cdim() noexcept { return program_dim<2>(); } +} // namespace nncase::ntt diff --git a/ntt/include/nncase/ntt/ntt.h b/ntt/include/nncase/ntt/ntt.h index 7c20869989..94070e9e13 100644 --- a/ntt/include/nncase/ntt/ntt.h +++ b/ntt/include/nncase/ntt/ntt.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "distributed.h" #include "kernels/binary.h" #include "kernels/cast.h" #include "kernels/clamp.h" @@ -58,3 +59,5 @@ #include "arch/riscv64/tensor_ops.h" #include "arch/riscv64/ukernels.h" #endif + +#include "runtime/cpu_runtime.h" diff --git a/ntt/include/nncase/ntt/runtime.h b/ntt/include/nncase/ntt/runtime.h new file mode 100644 index 0000000000..878a37234d --- /dev/null +++ b/ntt/include/nncase/ntt/runtime.h @@ -0,0 +1,30 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include + +#if defined(_MSC_VER) +#define NTT_RUNTIME_API __declspec(dllexport) +#else +#define NTT_RUNTIME_API __attribute__((visibility("default"))) +#endif + +namespace nncase::ntt::runtime { +void *thread_alloc(size_t bytes, size_t alignment); +void thread_free(void *ptr); +} // namespace nncase::ntt::runtime + +extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata); diff --git a/ntt/include/nncase/ntt/runtime/cpu_runtime.h b/ntt/include/nncase/ntt/runtime/cpu_runtime.h new file mode 100644 index 0000000000..5b66861a3f --- /dev/null +++ b/ntt/include/nncase/ntt/runtime/cpu_runtime.h @@ -0,0 +1,77 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../distributed.h" +#include "../runtime.h" +#include + +#ifdef __APPLE__ +#include +#endif + +namespace nncase::ntt::runtime { +struct cpu_block_entry_params_t { + size_t tdim; + size_t bdim; + size_t cdim; + size_t bid; + size_t cid; + size_t cpu_id_offset; + std::byte *const *inouts; + const std::byte *rdata; +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key; +#endif +}; + +struct cpu_thread_context_t { + size_t tid; + size_t bid; + size_t cid; + + static cpu_thread_context_t &current() noexcept; +}; + +extern size_t tdim; +extern size_t bdim; +extern size_t cdim; +} // namespace nncase::ntt::runtime + +namespace nncase::ntt { +template <> struct program_id_getter<0> { + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().tid; + } + static size_t dim() noexcept { return runtime::tdim; } +}; + +template <> struct program_id_getter<1> { + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().bid; + } + static size_t dim() noexcept { return runtime::bdim; } +}; + +template <> struct program_id_getter<2> { + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().cid; + } + static size_t dim() noexcept { return runtime::cdim; } +}; +} // namespace nncase::ntt + +extern "C" NTT_RUNTIME_API void +block_entry(const nncase::ntt::runtime::cpu_block_entry_params_t &params); +using block_entry_t = decltype(block_entry) *; diff --git a/ntt/src/cpu_runtime.cpp b/ntt/src/cpu_runtime.cpp index 57c2bfa77d..96c0f58759 100644 --- a/ntt/src/cpu_runtime.cpp +++ b/ntt/src/cpu_runtime.cpp @@ -12,81 +12,116 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include #include -#include #include -#include +#include +#include +#include +#include -extern "C" { -nncase_runtime_cpu_mt_t *g_cpu_mt; - -#ifndef NNCASE_STANDALONE -// compiler support -#if defined(_MSC_VER) -#pragma function(acosf) -#pragma function(asinf) -#pragma function(cosf) -#pragma function(coshf) -#pragma function(erff) -#pragma function(expf) -#pragma function(fmodf) -#pragma function(logf) -#pragma function(powf) -#pragma function(roundf) -#pragma function(sinf) -#pragma function(sinhf) -#pragma function(sqrtf) -#pragma function(tanhf) +#ifdef WIN32 +#include +#elif defined(__APPLE__) +#include +#include +#else +#include #endif -float acosf(float v) { return g_cpu_mt->acosf(v); } -float acoshf(float v) { return g_cpu_mt->acoshf(v); } -float asinf(float v) { return g_cpu_mt->asinf(v); } -float asinhf(float v) { return g_cpu_mt->asinhf(v); } -float copysignf(float mag, float sgn) { return g_cpu_mt->copysignf(mag, sgn); } -float cosf(float v) { return g_cpu_mt->cosf(v); } -float coshf(float v) { return g_cpu_mt->coshf(v); } -float erff(float v) { return g_cpu_mt->erff(v); } -float expf(float v) { return g_cpu_mt->expf(v); } -float fmodf(float x, float y) { return g_cpu_mt->fmodf(x, y); } -float logf(float v) { return g_cpu_mt->logf(v); } -float nearbyintf(float v) { return g_cpu_mt->nearbyintf(v); } -float powf(float x, float y) { return g_cpu_mt->powf(x, y); } -float roundf(float v) { return g_cpu_mt->roundf(v); } -float sinf(float v) { return g_cpu_mt->sinf(v); } -float sinhf(float v) { return g_cpu_mt->sinhf(v); } -float sqrtf(float v) { return g_cpu_mt->sqrtf(v); } -float tanhf(float v) { return g_cpu_mt->tanhf(v); } +namespace nncase::ntt::runtime { +size_t tdim; +size_t bdim; +size_t cdim; + +#ifdef __APPLE__ +pthread_key_t cpu_thread_context_key; +#else +thread_local cpu_thread_context_t cpu_thread_context; +#endif +void *thread_alloc(size_t bytes, size_t alignment) { #ifdef WIN32 -void _invalid_parameter(wchar_t const *const 
expression, - wchar_t const *const function_name, - wchar_t const *const file_name, - unsigned int const line_number, - uintptr_t const reserved) { - g_cpu_mt->failfast("invalid_parameter", (va_list)0); + return _aligned_malloc(bytes, alignment); +#else + size_t mask = alignment - 1; + size_t aligned_bytes = bytes + (-bytes & mask); + auto ptr = aligned_alloc(alignment, aligned_bytes); + if (!ptr) { + std::terminate(); + } + return ptr; +#endif } -int _CrtDbgReport(int reportType, const char *filename, int linenumber, - const char *moduleName, const char *format, ...) { - va_list args; - va_start(args, format); - g_cpu_mt->failfast(format, args); - va_end(args); - return 0; -} +void thread_free(void *ptr) { +#ifdef WIN32 + _aligned_free(ptr); #else -void *memcpy(void *dst, const void *src, size_t len) { - return g_cpu_mt->memcpy(dst, src, len); + free(ptr); +#endif } +} // namespace nncase::ntt::runtime + +using namespace nncase::ntt::runtime; -void *memmove(void *dst, const void *src, size_t len) { - return g_cpu_mt->memmove(dst, src, len); +cpu_thread_context_t &cpu_thread_context_t::current() noexcept { +#ifndef __APPLE__ + return cpu_thread_context; +#else + return *reinterpret_cast( + pthread_getspecific(cpu_thread_context_key)); +#endif } -void *memset(void *b, int c, size_t len) { return g_cpu_mt->memset(b, c, len); } +extern "C" void block_entry(const cpu_block_entry_params_t &params) { + tdim = params.tdim; + bdim = params.bdim; + cdim = params.cdim; + +#ifdef __APPLE__ + cpu_thread_context_key = params.cpu_thread_context_key; #endif + + std::vector threads; + for (size_t tid = 0; tid < tdim; tid++) { + threads.emplace_back([tid, params] { +#ifdef __APPLE__ + pthread_setspecific(cpu_thread_context_key, + new cpu_thread_context_t +#else + cpu_thread_context_t::current() = #endif + { + .tid = tid, + .bid = params.bid, + .cid = params.cid, + } +#ifdef __APPLE__ + ); +#else + ; +#endif + + size_t cpu_id = params.cpu_id_offset + tid; +#if WIN32 + 
SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)1 << cpu_id); +#elif defined(__APPLE__) + thread_affinity_policy_data_t policy = {(int)cpu_id}; + thread_policy_set(pthread_mach_thread_np(pthread_self()), + THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, + THREAD_AFFINITY_POLICY_COUNT); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu_id, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); +#endif + cpu_thread_context_t::current().tid = tid; + thread_main(params.inouts, params.rdata); + }); + } + + for (auto &t : threads) + t.join(); } diff --git a/ntt/src/dummy.cpp b/ntt/src/dummy.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/Native/include/nncase/io_utils.h b/src/Native/include/nncase/io_utils.h index c28a335369..de5255afa2 100644 --- a/src/Native/include/nncase/io_utils.h +++ b/src/Native/include/nncase/io_utils.h @@ -46,7 +46,13 @@ template class aligned_allocator { throw std::bad_alloc(); } - void deallocate(T *p, std::size_t) noexcept { std::free(p); } + void deallocate(T *p, std::size_t) noexcept { +#ifdef _WIN32 + _aligned_free(p); +#else + std::free(p); +#endif + } template struct rebind { using other = aligned_allocator; diff --git a/src/Native/src/runtime/CMakeLists.txt b/src/Native/src/runtime/CMakeLists.txt index 3196fe7bee..a7f11b73d1 100644 --- a/src/Native/src/runtime/CMakeLists.txt +++ b/src/Native/src/runtime/CMakeLists.txt @@ -65,7 +65,7 @@ else() add_library(nncaseruntime SHARED dummy.cpp) target_link_libraries(nncaseruntime PRIVATE nncasebase kernels simulator compiler simulator_stackvm simulator_cpu cpu_loaders fmt::fmt) if (NOT (WIN32 OR APPLE)) - target_link_libraries(nncaseruntime PRIVATE dl) + target_link_libraries(nncaseruntime PRIVATE dl pthread) endif() set_target_properties(nncaseruntime PROPERTIES OUTPUT_NAME "Nncase.Runtime.Native") diff --git a/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp b/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp index 
e98393ef79..42f8fcaa16 100644 --- a/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp @@ -90,7 +90,7 @@ void elf_loader::load(std::span elf) { throw std::runtime_error("dlopen error:" + std::string(dlerror())); } - entry_ = dlsym(handle_, "kernel_entry"); + entry_ = dlsym(handle_, "block_entry"); if (!entry_) { throw std::runtime_error("dlsym error:" + std::string(dlerror())); } diff --git a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp index 5d72093c14..f7ae78fdb7 100644 --- a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp @@ -14,6 +14,8 @@ */ #include "macho_loader.h" #include +#include +#include #include #include #include @@ -21,18 +23,31 @@ using namespace nncase::runtime; +#define THROW_SYS_IF_NOT(x) \ + if (!(x)) { \ + throw std::system_error(errno, std::system_category()); \ + } + macho_loader::~macho_loader() { +#if 0 if (!NSUnLinkModule(reinterpret_cast(mod_), NSUNLINKMODULE_OPTION_NONE)) { - // throw std::runtime_error("NSUnLinkModule failed"); + abort(); } if (!NSDestroyObjectFileImage(reinterpret_cast(ofi_))) { - // throw std::runtime_error("NSDestroyObjectFileImage failed"); + + abort(); } +#else + if (mod_) { + dlclose(mod_); + } +#endif } void macho_loader::load(std::span macho) { +#if 0 if (NSCreateObjectFileImageFromMemory( macho.data(), macho.size_bytes(), reinterpret_cast(&ofi_)) != @@ -47,12 +62,36 @@ void macho_loader::load(std::span macho) { } sym_ = reinterpret_cast(NSLookupSymbolInModule( - reinterpret_cast(mod_), "_kernel_entry")); + reinterpret_cast(mod_), "_module_entry")); if (sym_ == NULL) { throw std::runtime_error("NSLookupSymbolInModule failed"); } +#else + char temp_path[] = "/tmp/nncase.function.cpu.XXXXXX"; + { + auto func_file = mkstemp(temp_path); + THROW_SYS_IF_NOT(func_file != -1); + THROW_SYS_IF_NOT(write(func_file, 
(char *)macho.data(), macho.size()) != + -1); + THROW_SYS_IF_NOT(close(func_file) != -1); + } + + mod_ = dlopen(temp_path, RTLD_NOW); + if (!mod_) { + throw std::runtime_error("dlopen error:" + std::string(dlerror())); + } + + sym_ = dlsym(mod_, "block_entry"); + if (!sym_) { + throw std::runtime_error("dlsym error:" + std::string(dlerror())); + } +#endif } void *macho_loader::entry() const noexcept { +#if 0 return NSAddressOfSymbol(reinterpret_cast(sym_)); +#else + return sym_; +#endif } diff --git a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h index efba7af607..c91ac39fa7 100644 --- a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h +++ b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h @@ -20,14 +20,23 @@ BEGIN_NS_NNCASE_RUNTIME class macho_loader { public: - macho_loader() noexcept : ofi_(nullptr), mod_(nullptr), sym_(nullptr) {} + macho_loader() noexcept + : +#if 0 + ofi_(nullptr), +#endif + mod_(nullptr), + sym_(nullptr) { + } ~macho_loader(); void load(std::span macho); void *entry() const noexcept; private: +#if 0 void *ofi_; +#endif void *mod_; void *sym_; }; diff --git a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp index 8f62966122..19120fb538 100644 --- a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp @@ -18,7 +18,7 @@ using namespace nncase::runtime; -#ifndef NDEBUG +#if 1 #define THROW_WIN32_IF_NOT(x) \ if (!(x)) { \ throw std::system_error(GetLastError(), std::system_category()); \ @@ -41,7 +41,7 @@ static int ProtectionFlags[2][2][2] = { pe_loader::~pe_loader() { if (image_) { -#ifndef NDEBUG +#if 1 FreeModule((HMODULE)image_); #else VirtualFree(image_, 0, MEM_RELEASE); @@ -50,7 +50,7 @@ pe_loader::~pe_loader() { } void pe_loader::load(std::span pe) { -#ifndef NDEBUG +#if 1 wchar_t temp_path[MAX_PATH]; wchar_t temp_filename[MAX_PATH]; @@ -71,9 
+71,10 @@ void pe_loader::load(std::span pe) { FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr); THROW_WIN32_IF_NOT(func_file != INVALID_HANDLE_VALUE); - auto func_mod = LoadLibraryW(temp_filename); - THROW_WIN32_IF_NOT(func_mod); - image_ = (std::byte *)func_mod; + image_ = LoadLibraryW(temp_filename); + THROW_WIN32_IF_NOT(image_); + entry_ = (void *)GetProcAddress((HMODULE)image_, "block_entry"); + THROW_WIN32_IF_NOT(entry_); #else auto dos_header = reinterpret_cast(pe.data()); auto nt_header = reinterpret_cast( @@ -134,8 +135,12 @@ void pe_loader::load(std::span pe) { } void *pe_loader::entry() const noexcept { +#if 1 + return entry_; +#else auto dos_header = reinterpret_cast(image_); auto nt_header = reinterpret_cast( image_ + dos_header->e_lfanew); return image_ + nt_header->OptionalHeader.AddressOfEntryPoint; +#endif } diff --git a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.h b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.h index 076f94da65..c739b48514 100644 --- a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.h +++ b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.h @@ -20,14 +20,15 @@ BEGIN_NS_NNCASE_RUNTIME class pe_loader { public: - pe_loader() noexcept : image_(nullptr) {} + pe_loader() noexcept : image_(nullptr), entry_(nullptr) {} ~pe_loader(); void load(std::span pe); void *entry() const noexcept; private: - std::byte *image_; + void *image_; + void *entry_; }; END_NS_NNCASE_RUNTIME diff --git a/src/Native/src/runtime/cpu/runtime_function.cpp b/src/Native/src/runtime/cpu/runtime_function.cpp index ac0367d267..55a88e2dd7 100644 --- a/src/Native/src/runtime/cpu/runtime_function.cpp +++ b/src/Native/src/runtime/cpu/runtime_function.cpp @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "runtime_function.h" +#include "nncase/ntt/runtime/cpu_runtime.h" #include #include #include @@ -26,14 +27,17 @@ using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; +using namespace nncase::ntt::runtime; typedef struct { - uint64_t DataPoolSize; - uint64_t DataAlign; + uint32_t tdim; + uint32_t bdim; + uint32_t cdim; + uint32_t reserved0; } desc_header; cpu_runtime_function::cpu_runtime_function(runtime_module &rt_module) - : runtime_function(rt_module), kernel_entry_(nullptr), data_pool_size_(0) {} + : runtime_function(rt_module), block_entry_(nullptr), tdim_(0), bdim_(0) {} cpu_runtime_function::~cpu_runtime_function() {} @@ -46,14 +50,15 @@ result cpu_runtime_function::initialize_core( try_(context.read_section( ".desc", [this](auto reader, size_t) -> result { auto header = reader.template read(); - this->data_pool_size_ = header.DataPoolSize; - this->data_align_ = header.DataAlign; + this->tdim_ = header.tdim; + this->bdim_ = header.bdim; + this->cdim_ = header.cdim; return ok(); })); auto text = module().text().subspan(context.header().entrypoint, context.header().text_size); loader_.load(text); - kernel_entry_ = (kernel_entry_t)loader_.entry(); + block_entry_ = (block_entry_t)loader_.entry(); return ok(); } diff --git a/src/Native/src/runtime/cpu/runtime_function.h b/src/Native/src/runtime/cpu/runtime_function.h index 9393ab8395..51b3b1f165 100644 --- a/src/Native/src/runtime/cpu/runtime_function.h +++ b/src/Native/src/runtime/cpu/runtime_function.h @@ -15,7 +15,7 @@ #pragma once #include "runtime_module.h" #include -#include +#include #include #include @@ -29,12 +29,7 @@ BEGIN_NS_NNCASE_RT_MODULE(cpu) -#define CPU_ENTRY_NAME "kernel_entry" - class cpu_runtime_function final : public runtime_function { - typedef void (*kernel_entry_t)(nncase_runtime_cpu_mt_t *cpu_mt, - std::byte **inputs, const std::byte *rdata); - public: cpu_runtime_function(runtime_module &rt_module); virtual ~cpu_runtime_function(); 
@@ -59,9 +54,10 @@ class cpu_runtime_function final : public runtime_function { elf_loader loader_; #endif - kernel_entry_t kernel_entry_; - uint64_t data_pool_size_; - uint64_t data_align_; + block_entry_t block_entry_; + uint64_t tdim_; + uint64_t bdim_; + uint64_t cdim_; }; END_NS_NNCASE_RT_MODULE diff --git a/src/Native/src/runtime/cpu/runtime_function.run.cpp b/src/Native/src/runtime/cpu/runtime_function.run.cpp index 5f31dc3c9f..0ffe980dd5 100644 --- a/src/Native/src/runtime/cpu/runtime_function.run.cpp +++ b/src/Native/src/runtime/cpu/runtime_function.run.cpp @@ -12,88 +12,49 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "nncase/ntt/runtime.h" +#include "nncase/ntt/runtime/cpu_runtime.h" #include "runtime_function.h" #include #include #include #include #include +#include +#include using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; +using namespace nncase::ntt::runtime; -namespace { -#define SRAM_SIZE_PER_BLOCK (1024 * 1024 * 4) -#define SRAM_SIZE_PER_THREAD (SRAM_SIZE_PER_BLOCK) - -static uint8_t _sram[1][SRAM_SIZE_PER_BLOCK]; -static uint8_t *_block_sram_ptr[] = {_sram[0]}; -static uint8_t *sram_address(int bid, int tid) { - return _block_sram_ptr[bid] + (SRAM_SIZE_PER_BLOCK * tid); -} - -static void failfast(const char *foramt, va_list args) { - char buffer[1024]; - vsprintf(buffer, foramt, args); - throw std::runtime_error(buffer); -} - -static void *local_alloc(size_t bytes, size_t alignment) { -#ifdef WIN32 - return _aligned_malloc(bytes, alignment); -#else - size_t mask = alignment - 1; - size_t aligned_bytes = bytes + (-bytes & mask); - auto ptr = aligned_alloc(alignment, aligned_bytes); - if (!ptr) { - throw std::runtime_error("aligned alloc error!"); - } - return ptr; -#endif -} - -static void local_free(void *ptr) { -#ifdef WIN32 - _aligned_free(ptr); -#else - free(ptr); +result cpu_runtime_function::run(std::span params) noexcept { 
+ std::vector blocks; + for (size_t cid = 0; cid < cdim_; cid++) { + for (size_t bid = 0; bid < bdim_; bid++) { + blocks.emplace_back([cid, bid, params, this] { + cpu_block_entry_params_t block_entry_params{ + .tdim = tdim_, + .bdim = bdim_, + .cdim = cdim_, + .bid = bid, + .cid = cid, + .cpu_id_offset = (cid * bdim_ + bid) * tdim_, + .inouts = params.data(), + .rdata = module().rdata().data(), +#ifdef __APPLE__ + .cpu_thread_context_key = module().cpu_thread_context_key(), #endif -} + }; -nncase_runtime_cpu_mt_t nncase_cpu_mt_ = { - .acosf = acosf, - .acoshf = acoshf, - .asinf = asinf, - .asinhf = asinhf, - .copysignf = copysignf, - .cosf = cosf, - .coshf = coshf, - .erff = erff, - .expf = expf, - .fmodf = fmodf, - .logf = logf, - .nearbyintf = nearbyintf, - .powf = powf, - .roundf = roundf, - .sinf = sinf, - .sinhf = sinhf, - .sqrtf = sqrtf, - .tanhf = tanhf, - .sram_address = sram_address, - .local_alloc = local_alloc, - .local_free = local_free, - .failfast = failfast, + block_entry_(block_entry_params); + }); + } + } -#ifndef WIN32 - .memcpy = memcpy, - .memmove = memmove, - .memset = memset, -#endif -}; -} // namespace + for (auto &block : blocks) { + block.join(); + } -result cpu_runtime_function::run(std::span params) noexcept { - kernel_entry_(&nncase_cpu_mt_, params.data(), module().rdata().data()); return ok(); } diff --git a/src/Native/src/runtime/cpu/runtime_module.cpp b/src/Native/src/runtime/cpu/runtime_module.cpp index 8747b3ba79..9ac44f3271 100644 --- a/src/Native/src/runtime/cpu/runtime_module.cpp +++ b/src/Native/src/runtime/cpu/runtime_module.cpp @@ -14,6 +14,7 @@ */ #include "runtime_module.h" #include "runtime_function.h" +#include #include #include #include @@ -22,6 +23,20 @@ using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; +using namespace nncase::ntt::runtime; + +cpu_runtime_module::cpu_runtime_module() noexcept { +#ifdef __APPLE__ + pthread_key_create(&cpu_thread_context_key_, + [](void *ptr) { 
delete (cpu_thread_context_t *)ptr; }); +#endif +} + +cpu_runtime_module::~cpu_runtime_module() { +#ifdef __APPLE__ + pthread_key_delete(cpu_thread_context_key_); +#endif +} result cpu_runtime_module::initialize_before_functions( runtime_module_init_context &context) noexcept { diff --git a/src/Native/src/runtime/cpu/runtime_module.h b/src/Native/src/runtime/cpu/runtime_module.h index af15d2fa8a..ce8c6f03db 100644 --- a/src/Native/src/runtime/cpu/runtime_module.h +++ b/src/Native/src/runtime/cpu/runtime_module.h @@ -17,15 +17,28 @@ #include #include +#ifdef __APPLE__ +#include +#endif + BEGIN_NS_NNCASE_RT_MODULE(cpu) class cpu_runtime_module : public runtime_module { public: + cpu_runtime_module() noexcept; + virtual ~cpu_runtime_module(); + kernels::kernel_context &kernel_context() noexcept; std::span text() const noexcept { return text_; } std::span rdata() const noexcept { return rdata_; } +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key() const noexcept { + return cpu_thread_context_key_; + } +#endif + protected: result initialize_before_functions( runtime_module_init_context &context) noexcept override; @@ -37,6 +50,10 @@ class cpu_runtime_module : public runtime_module { std::span rdata_; host_buffer_t text_storage_; host_buffer_t rdata_storage_; + +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key_ = {}; +#endif }; END_NS_NNCASE_RT_MODULE