From f7f17a7fd037c73c8bbb19310dae4f6d7d8390fa Mon Sep 17 00:00:00 2001 From: Jorge Aparicio Date: Thu, 9 Jun 2016 22:07:36 -0500 Subject: [PATCH 1/2] initial support for PTX generation this PR adds two targets: - `nvptx-unknown-unknown` (32-bit machine model) - `nvptx64-unknown-unknown` (64-bit machine model) that can be used to generate PTX code from Rust source code: ``` $ rustc --target nvptx64-unknown-unknown --emit=asm foo.rs $ head foo.s // // Generated by LLVM NVPTX Back-End // .version 3.2 .target sm_20 .address_size 64 (..) ``` this PR also adds new intrinsics that are equivalent to the following CUDA variables/functions: - `threadIdx.{x,y,z}` - `blockIdx.{x,y,z}` - `blockDim.{x,y,z}` - `gridDim.{x,y,z}` - `__syncthreads` this PR has been tested by writing a kernel that `memcpy`s a chunk of memory to other: ``` rust #![no_core] #[no_mangle] pub fn memcpy_(src: *const f32, dst: *mut f32, n: isize) { unsafe { let i = overflowing_add(overflowing_mul(block_idx_x(), block_dim_x()), thread_idx_x()) as isize; if i < n { *(offset(dst, i) as *mut f32) = *offset(src, i) } } } // undeclared functions are intrinsics // omitted: lang items ``` which translates to: ``` ptx // // Generated by LLVM NVPTX Back-End // .version 3.2 .target sm_20 .address_size 64 // .globl memcpy_ .visible .func memcpy_( .param .b64 memcpy__param_0, .param .b64 memcpy__param_1, .param .b64 memcpy__param_2 ) { .reg .pred %p<2>; .reg .s32 %r<6>; .reg .s64 %rd<8>; mov.u32 %r1, %ctaid.x; ld.param.u64 %rd5, [memcpy__param_2]; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; mad.lo.s32 %r4, %r2, %r1, %r3; cvt.s64.s32 %rd6, %r4; setp.ge.s64 %p1, %rd6, %rd5; @%p1 bra LBB0_2; ld.param.u64 %rd3, [memcpy__param_0]; ld.param.u64 %rd4, [memcpy__param_1]; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd3, %rd7; add.s64 %rd2, %rd4, %rd7; ld.u32 %r5, [%rd1]; st.u32 [%rd2], %r5; LBB0_2: ret; } ``` however, this PTX code can't be directly used in a CUDA program because the `memcpy_` function is marked as a "device function" (`.func memcpy_`). Device functions can only be called from other GPU code. To be usable from a CUDA program `memcpy_` should be marked as a "kernel function" (`.entry memcpy_`): ``` diff // .globl memcpy_ -.visible .entry memcpy_( +.visible .func memcpy_( .param .b64 memcpy__param_0, .param .b64 memcpy__param_1, .param .b64 memcpy__param_2 ``` After patching the generated PTX code the kernel became callable from a CUDA program. ### unresolved questions - we need to provide a way to differentiate functions that will be translated to "kernel functions" from the ones that will be translated to "device functions". CUDA uses the `__global__` and `__device__` attributes for this. - we need to provide a way to let the user choose on which memory region [2] variables should be placed. CUDA exposes the `__shared__` and `__constant__` attributes for this. ### FIXMEs - pointer arguments in kernel and device functions should be marked with the `addrspace(1)` attribute in LLVM IR. - compiling a rlib produces an empty archive (no PTX in it) [1]: http://llvm.org/docs/NVPTXUsage.html#kernel-metadata [2]: http://llvm.org/docs/NVPTXUsage.html#id10 --- src/bootstrap/build/native.rs | 2 +- src/libcore/intrinsics.rs | 18 ++++++++++ src/librustc_back/target/mod.rs | 4 ++- .../target/nvptx64_unknown_unknown.rs | 36 +++++++++++++++++++ .../target/nvptx_unknown_unknown.rs | 36 +++++++++++++++++++ src/librustc_llvm/build.rs | 2 +- src/librustc_llvm/lib.rs | 5 +++ src/librustc_trans/context.rs | 13 +++++++ src/librustc_trans/intrinsic.rs | 13 +++++++ src/librustc_typeck/check/intrinsic.rs | 9 +++++ 10 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 src/librustc_back/target/nvptx64_unknown_unknown.rs create mode 100644 src/librustc_back/target/nvptx_unknown_unknown.rs diff --git a/src/bootstrap/build/native.rs b/src/bootstrap/build/native.rs index 5691b2da6a448..187b8c6b4ddd5 100644 --- a/src/bootstrap/build/native.rs +++ b/src/bootstrap/build/native.rs @@ -63,7 +63,7 @@ pub fn llvm(build: &Build, target: &str) { .out_dir(&dst) .profile(if build.config.llvm_optimize {"Release"} else {"Debug"}) .define("LLVM_ENABLE_ASSERTIONS", assertions) - .define("LLVM_TARGETS_TO_BUILD", "X86;ARM;AArch64;Mips;PowerPC") + .define("LLVM_TARGETS_TO_BUILD", "X86;ARM;AArch64;Mips;PowerPC;NVPTX") .define("LLVM_INCLUDE_EXAMPLES", "OFF") .define("LLVM_INCLUDE_TESTS", "OFF") .define("LLVM_INCLUDE_DOCS", "OFF") diff --git a/src/libcore/intrinsics.rs b/src/libcore/intrinsics.rs index 94baf188bcaee..1ffbf043c6b4f 100644 --- a/src/libcore/intrinsics.rs +++ b/src/libcore/intrinsics.rs @@ -602,4 +602,22 @@ extern "rust-intrinsic" { /// on MSVC it's `*mut [usize; 2]`. For more information see the compiler's /// source as well as std's catch implementation. pub fn try(f: fn(*mut u8), data: *mut u8, local_ptr: *mut u8) -> i32; + +} + +#[cfg(not(stage0))] +extern "rust-intrinsic" { + pub fn thread_idx_x() -> i32; + pub fn thread_idx_y() -> i32; + pub fn thread_idx_z() -> i32; + pub fn block_idx_x() -> i32; + pub fn block_idx_y() -> i32; + pub fn block_idx_z() -> i32; + pub fn block_dim_x() -> i32; + pub fn block_dim_y() -> i32; + pub fn block_dim_z() -> i32; + pub fn grid_dim_x() -> i32; + pub fn grid_dim_y() -> i32; + pub fn grid_dim_z() -> i32; + pub fn syncthreads(); } diff --git a/src/librustc_back/target/mod.rs b/src/librustc_back/target/mod.rs index 2163a8a1689b6..7c9fd9d7e4d26 100644 --- a/src/librustc_back/target/mod.rs +++ b/src/librustc_back/target/mod.rs @@ -141,7 +141,9 @@ supported_targets! { ("i586-pc-windows-msvc", i586_pc_windows_msvc), ("le32-unknown-nacl", le32_unknown_nacl), - ("asmjs-unknown-emscripten", asmjs_unknown_emscripten) + ("asmjs-unknown-emscripten", asmjs_unknown_emscripten), + ("nvptx-unknown-unknown", nvptx_unknown_unknown), + ("nvptx64-unknown-unknown", nvptx64_unknown_unknown) } /// Everything `rustc` knows about how to compile for a specific target. diff --git a/src/librustc_back/target/nvptx64_unknown_unknown.rs b/src/librustc_back/target/nvptx64_unknown_unknown.rs new file mode 100644 index 0000000000000..d5e6747a61e2d --- /dev/null +++ b/src/librustc_back/target/nvptx64_unknown_unknown.rs @@ -0,0 +1,36 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{Target, TargetOptions}; + +pub fn target() -> Target { + let opts = TargetOptions { + linker: "".to_string(), + ar: "".to_string(), + + cpu: "sm_20".to_string(), + dynamic_linking: false, + executables: false, + no_compiler_rt: true, + allow_asm: false, + .. Default::default() + }; + Target { + llvm_target: "nvptx64-unknown-unknown".to_string(), + target_endian: "little".to_string(), + target_pointer_width: "64".to_string(), + target_os: "none".to_string(), + target_env: "".to_string(), + target_vendor: "unknown".to_string(), + data_layout: "e-i64:64-v16:16-v32:32-n16:32:64".to_string(), + arch: "nvptx".to_string(), + options: opts, + } +} diff --git a/src/librustc_back/target/nvptx_unknown_unknown.rs b/src/librustc_back/target/nvptx_unknown_unknown.rs new file mode 100644 index 0000000000000..dc5005a6bc627 --- /dev/null +++ b/src/librustc_back/target/nvptx_unknown_unknown.rs @@ -0,0 +1,36 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{Target, TargetOptions}; + +pub fn target() -> Target { + let opts = TargetOptions { + linker: "".to_string(), + ar: "".to_string(), + + cpu: "sm_20".to_string(), + dynamic_linking: false, + executables: false, + no_compiler_rt: true, + allow_asm: false, + .. Default::default() + }; + Target { + llvm_target: "nvptx-unknown-unknown".to_string(), + target_endian: "little".to_string(), + target_pointer_width: "32".to_string(), + target_os: "none".to_string(), + target_env: "".to_string(), + target_vendor: "unknown".to_string(), + data_layout: "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64".to_string(), + arch: "nvptx".to_string(), + options: opts, + } +} diff --git a/src/librustc_llvm/build.rs b/src/librustc_llvm/build.rs index 250aafd77a826..a2917116bef99 100644 --- a/src/librustc_llvm/build.rs +++ b/src/librustc_llvm/build.rs @@ -68,7 +68,7 @@ fn main() { let host = env::var("HOST").unwrap(); let is_crossed = target != host; - let optional_components = ["x86", "arm", "aarch64", "mips", "powerpc", "pnacl"]; + let optional_components = ["x86", "arm", "aarch64", "mips", "powerpc", "pnacl", "nvptx"]; // FIXME: surely we don't need all these components, right? Stuff like mcjit // or interpreter the compiler itself never uses. diff --git a/src/librustc_llvm/lib.rs b/src/librustc_llvm/lib.rs index e757201c88633..36be86f3794ec 100644 --- a/src/librustc_llvm/lib.rs +++ b/src/librustc_llvm/lib.rs @@ -2407,6 +2407,11 @@ pub fn initialize_available_targets() { LLVMInitializeMipsTargetMC, LLVMInitializeMipsAsmPrinter, LLVMInitializeMipsAsmParser); + init_target!(llvm_component = "nvptx", + LLVMInitializeNVPTXTargetInfo, + LLVMInitializeNVPTXTarget, + LLVMInitializeNVPTXTargetMC, + LLVMInitializeNVPTXAsmPrinter); init_target!(llvm_component = "powerpc", LLVMInitializePowerPCTargetInfo, LLVMInitializePowerPCTarget, diff --git a/src/librustc_trans/context.rs b/src/librustc_trans/context.rs index bfcb1ae33b301..f65361bf2da93 100644 --- a/src/librustc_trans/context.rs +++ b/src/librustc_trans/context.rs @@ -1097,6 +1097,19 @@ fn declare_intrinsic(ccx: &CrateContext, key: &str) -> Option { ifn!("llvm.localrecover", fn(i8p, i8p, t_i32) -> i8p); ifn!("llvm.x86.seh.recoverfp", fn(i8p, i8p) -> i8p); + ifn!("llvm.cuda.syncthreads", fn() -> void); + ifn!("llvm.nvvm.read.ptx.sreg.tid.x", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.tid.y", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.tid.z", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ctaid.x", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ctaid.y", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ctaid.z", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ntid.x", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ntid.y", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.ntid.z", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.nctaid.x", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.nctaid.y", fn() -> t_i32); + ifn!("llvm.nvvm.read.ptx.sreg.nctaid.z", fn() -> t_i32); ifn!("llvm.assume", fn(i1) -> void); if ccx.sess().opts.debuginfo != NoDebugInfo { diff --git a/src/librustc_trans/intrinsic.rs b/src/librustc_trans/intrinsic.rs index 54c825fa5face..5d2e1870c55fe 100644 --- a/src/librustc_trans/intrinsic.rs +++ b/src/librustc_trans/intrinsic.rs @@ -89,6 +89,19 @@ fn get_simple_intrinsic(ccx: &CrateContext, name: &str) -> Option { "roundf32" => "llvm.round.f32", "roundf64" => "llvm.round.f64", "assume" => "llvm.assume", + "thread_idx_x" => "llvm.nvvm.read.ptx.sreg.tid.x", + "thread_idx_y" => "llvm.nvvm.read.ptx.sreg.tid.y", + "thread_idx_z" => "llvm.nvvm.read.ptx.sreg.tid.z", + "block_idx_x" => "llvm.nvvm.read.ptx.sreg.ctaid.x", + "block_idx_y" => "llvm.nvvm.read.ptx.sreg.ctaid.y", + "block_idx_z" => "llvm.nvvm.read.ptx.sreg.ctaid.z", + "block_dim_x" => "llvm.nvvm.read.ptx.sreg.ntid.x", + "block_dim_y" => "llvm.nvvm.read.ptx.sreg.ntid.y", + "block_dim_z" => "llvm.nvvm.read.ptx.sreg.ntid.z", + "grid_dim_x" => "llvm.nvvm.read.ptx.sreg.nctaid.x", + "grid_dim_y" => "llvm.nvvm.read.ptx.sreg.nctaid.y", + "grid_dim_z" => "llvm.nvvm.read.ptx.sreg.nctaid.z", + "syncthreads" => "llvm.cuda.syncthreads", _ => return None }; Some(ccx.get_intrinsic(&llvm_name)) diff --git a/src/librustc_typeck/check/intrinsic.rs b/src/librustc_typeck/check/intrinsic.rs index f120e38630b8a..d84abd1a87f2b 100644 --- a/src/librustc_typeck/check/intrinsic.rs +++ b/src/librustc_typeck/check/intrinsic.rs @@ -298,6 +298,15 @@ pub fn check_intrinsic_type(ccx: &CrateCtxt, it: &hir::ForeignItem) { (0, vec![tcx.mk_fn_ptr(fn_ty), mut_u8, mut_u8], tcx.types.i32) } + "thread_idx_x" | "thread_idx_y" | "thread_idx_z" | + "block_idx_x" | "block_idx_y" | "block_idx_z" | + "block_dim_x" | "block_dim_y" | "block_dim_z" | + "grid_dim_x" | "grid_dim_y" | "grid_dim_z" => { + (0, vec![], tcx.types.i32) + } + + "syncthreads" => (0, vec![], tcx.mk_nil()), + ref other => { span_err!(tcx.sess, it.span, E0093, "unrecognized intrinsic function: `{}`", *other); From b7a628f3879f97f93f90098db03e2f754dc1c933 Mon Sep 17 00:00:00 2001 From: Jorge Aparicio Date: Thu, 16 Jun 2016 19:19:20 -0500 Subject: [PATCH 2/2] put PTX intrinsics behind a cfg these can't be used with other targets --- src/libcore/intrinsics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libcore/intrinsics.rs b/src/libcore/intrinsics.rs index 1ffbf043c6b4f..5155e619bf07d 100644 --- a/src/libcore/intrinsics.rs +++ b/src/libcore/intrinsics.rs @@ -606,6 +606,7 @@ extern "rust-intrinsic" { } #[cfg(not(stage0))] +#[cfg(arch = "nvptx")] extern "rust-intrinsic" { pub fn thread_idx_x() -> i32; pub fn thread_idx_y() -> i32;