test rust-lang/rust#34195

japaric-archived · Jun 10, 2016 · 8e6b66f · 8e6b66f
1 parent 1f833cb
commit 8e6b66f
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 *.png
+*.ptx
 *.rs.bk
 Cargo.lock
 target
diff --git a/README.md b/README.md
@@ -2,6 +2,16 @@
 
 Experiments with CUDA and Rust
 
+Testing the [new PTX targets][Implementation].
+
+```
+$ rustc --target nvptx64-unknown-unknown --emit=asm -O kernel.rs
+$ mv kernel.s tests/kernel.ptx
+# change `.func memcpy_` to `.entry memcpy_` so the function is emitted as a launchable kernel entry point
+$ edit tests/kernel.ptx
+$ cargo test memcpy
+```
+
 ## Examples
 
 - [Query number of devices](/examples/query.rs).
@@ -11,7 +21,7 @@ Experiments with CUDA and Rust
 
 ## Areas to explore
 
-- Generating PTX from Rust code at compile time. ([prior art]).
+- ~~Generating PTX from Rust code at compile time. ([prior art]).~~ WIP. [Implementation]. [RFC].
 - Type safety for launching kernels. Arity and argument types should be validated at compile time.
 - Linear algebra library with transparent CUDA acceleration. A matrix type that stores its data
   in the GPU, with operator sugar that maps to CuBLAS/custom kernels.
@@ -22,6 +32,8 @@ Experiments with CUDA and Rust
 
 [linalg]: https://github.com/japaric/linalg.rs
 [prior art]: http://blog.theincredibleholk.org/blog/2012/12/05/compiling-rust-for-gpus/
+[Implementation]: https://github.com/rust-lang/rust/pull/34195
+[RFC]: https://github.com/rust-lang/rfcs/pull/1641
 
 ## License
 

diff --git a/kernel.rs b/kernel.rs
@@ -0,0 +1,93 @@
+#![allow(warnings)]
+#![feature(intrinsics)]
+#![feature(lang_items)]
+#![feature(no_core)]
+#![no_core]
+
+use Option::*;
+use Ordering::*;
+
+#[no_mangle] // keep the exact symbol name `memcpy_`; the README steps edit `.func memcpy_` in the emitted PTX
+pub fn memcpy_(src: *const f32, dst: *mut f32, n: isize) { // each invocation copies at most one f32 from `src` to `dst`
+    unsafe { // needed for the intrinsic calls and raw-pointer dereferences; assumes both pointers cover `n` elements -- TODO confirm with caller
+        let i = overflowing_add(overflowing_mul(block_idx_x(), block_dim_x()), thread_idx_x()) as isize; // i = block_idx_x * block_dim_x + thread_idx_x
+
+        if i < n { // indices at or past `n` do nothing
+            *(offset(dst, i) as *mut f32) = *offset(src, i) // dst[i] = src[i]; `offset` returns `*const T`, hence the cast back to `*mut`
+        }
+    }
+}
+
+extern "rust-intrinsic" { // intrinsics declared by hand because the crate is `#![no_core]`
+    fn block_idx_x() -> i32; // presumably the block index along x -- confirm against the rustc PTX PR
+    fn block_dim_x() -> i32; // presumably the block size along x -- confirm
+    fn thread_idx_x() -> i32; // presumably the thread index within its block along x -- confirm
+
+    fn offset<T>(dst: *const T, offset: isize) -> *const T; // pointer-offset intrinsic; yields `*const T` even when the caller wants to write
+    fn overflowing_add<T>(a: T, b: T) -> T; // generic add; `memcpy_` calls it on `i32` values
+    fn overflowing_mul<T>(a: T, b: T) -> T; // generic multiply; `memcpy_` calls it on `i32` values
+}
+
+#[lang = "copy"] // lang item: tells the compiler this trait is its `Copy`
+trait Copy {} // marker trait with no methods
+
+#[lang = "sized"] // lang item: tells the compiler this trait is its `Sized`
+trait Sized {} // marker trait with no methods; the `?Sized` bound on `PartialOrd` refers to it
+// The `: PartialEq<Rhs>` supertrait bound found on core's `PartialOrd` is omitted: this file defines no `PartialEq`.
+#[lang = "ord"] // lang item; presumably what lets the compiler type-check `<`, `<=`, `>`, `>=` -- confirm
+trait PartialOrd<Rhs: ?Sized = Self> {
+    fn partial_cmp(&self, other: &Rhs) -> Option<Ordering>; // the only required method; the four below default to it
+
+    #[inline]
+    fn lt(&self, other: &Rhs) -> bool { // true only for `Some(Less)`
+        match self.partial_cmp(other) {
+            Some(Less) => true,
+            _ => false,
+        }
+    }
+
+    #[inline]
+    fn le(&self, other: &Rhs) -> bool { // true for `Some(Less)` or `Some(Equal)`
+        match self.partial_cmp(other) {
+            Some(Less) | Some(Equal) => true,
+            _ => false,
+        }
+    }
+
+    #[inline]
+    fn gt(&self, other: &Rhs) -> bool { // true only for `Some(Greater)`
+        match self.partial_cmp(other) {
+            Some(Greater) => true,
+            _ => false,
+        }
+    }
+
+    #[inline]
+    fn ge(&self, other: &Rhs) -> bool { // true for `Some(Greater)` or `Some(Equal)`
+        match self.partial_cmp(other) {
+            Some(Greater) | Some(Equal) => true,
+            _ => false,
+        }
+    }
+}
+
+impl PartialOrd for isize {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> { // stub: spins forever instead of returning an ordering
+        loop {}
+    }
+    fn lt(&self, other: &Self) -> bool { (*self) < (*other) } // overrides the default, so the `partial_cmp` stub is bypassed
+    fn le(&self, other: &Self) -> bool { (*self) <= (*other) } // presumably lowered to the built-in integer compare rather than recursing -- confirm
+    fn ge(&self, other: &Self) -> bool { (*self) >= (*other) } // same pattern as `le`
+    fn gt(&self, other: &Self) -> bool { (*self) > (*other) } // same pattern as `lt`
+}
+
+enum Option<T> { // local two-variant enum; the crate is `#![no_core]`, so nothing is imported from core
+    None,
+    Some(T),
+}
+
+enum Ordering { // result type wrapped in `Option` by `PartialOrd::partial_cmp`
+    Less = -1, // explicit discriminants, the same values `core::cmp::Ordering` uses
+    Equal = 0,
+    Greater = 1,
+}
diff --git a/tests/memcpy.rs b/tests/memcpy.rs
@@ -5,12 +5,11 @@ extern crate uxx;
 use std::ffi::CStr;
 use std::mem;
 
-use cuda::compile;
 use cuda::driver::{self, Any, Block, Device, Direction, Grid, Result};
 use rand::{Rng, XorShiftRng};
 use uxx::u31;
 
-const KERNEL: &'static str = include_str!("memcpy.cu");
+const KERNEL: &'static str = include_str!("kernel.ptx");
 
 #[test]
 fn memcpy() {
@@ -21,7 +20,9 @@ fn run() -> Result<()> {
     const SIZE: usize = 1024 * 1024;
 
     // Compile KERNEL
-    let ref ptx = compile::source(KERNEL).unwrap();
+    let kernel = &mut KERNEL.to_owned().into_bytes();
+    kernel.push(0);
+    let ptx = CStr::from_bytes_with_nul(kernel).unwrap();
 
     // Allocate memory on host
     let ref mut rng: XorShiftRng = rand::thread_rng().gen();