From 0dbc329fd839d3e0ba6e080ffc77a3ed5131449e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 16 Aug 2018 15:28:12 -0400 Subject: [PATCH] Fix reinterpret performnace This fixes #25014 by making it more obvious what's going on to LLVM. Instead of a memcpy loop, we use a new intrinsic that puts an actual llvm.memcpy into the IR, which is enough for LLVM to fold everything away. In the benchmark from #25014, we still see some regressions from 0.6, but that is because it needs to dereference through the pointers in the reinterpret and reshape wrappers. In any real code, that dereferencing should be loop-invariantly moved out of the inner loop. --- base/compiler/tfuncs.jl | 1 + base/reinterpretarray.jl | 32 +++++++++++--------------------- base/reshapedarray.jl | 2 +- src/intrinsics.cpp | 30 ++++++++++++++++++++++++++++++ src/intrinsics.h | 1 + src/julia_internal.h | 1 + src/runtime_intrinsics.c | 12 ++++++++++++ 7 files changed, 57 insertions(+), 22 deletions(-) diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index d4bc7253da23da..7864c5325ca0de 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -359,6 +359,7 @@ add_tfunc(pointerref, 3, 3, return Any end, 4) add_tfunc(pointerset, 4, 4, (@nospecialize(a), @nospecialize(v), @nospecialize(i), @nospecialize(align)) -> a, 5) +add_tfunc(aligned_memcpy, 4, 4, (@nospecialize(dst), @nospecialize(src), @nospecialize(n), @nospecialize(align)) -> Nothing, 5) function typeof_tfunc(@nospecialize(t)) if isa(t, Const) diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl index 2cde44e9a386d3..697db2e4cc31cd 100644 --- a/base/reinterpretarray.jl +++ b/base/reinterpretarray.jl @@ -123,11 +123,9 @@ end # once it knows the data layout while nbytes_copied < sizeof(T) s[] = a.parent[ind_start + i, tailinds...] - while nbytes_copied < sizeof(T) && sidx < sizeof(S) - unsafe_store!(tptr, unsafe_load(sptr, sidx + 1), nbytes_copied + 1) - sidx += 1 - nbytes_copied += 1 - end + nb = min(sizeof(S) - sidx, sizeof(T)-nbytes_copied) + Intrinsics.aligned_memcpy(tptr + nbytes_copied, sptr + sidx, nb, 1) + nbytes_copied += nb sidx = 0 i += 1 end @@ -173,34 +171,26 @@ end # element from the original array and overwrite the relevant parts if sidx != 0 s[] = a.parent[ind_start + i, tailinds...] - while nbytes_copied < sizeof(T) && sidx < sizeof(S) - unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) - sidx += 1 - nbytes_copied += 1 - end + nb = min(sizeof(S) - sidx, sizeof(T)) + Intrinsics.aligned_memcpy(sptr + sidx, tptr, nb, 1) + nbytes_copied += nb a.parent[ind_start + i, tailinds...] = s[] i += 1 sidx = 0 end # Deal with the main body of elements while nbytes_copied < sizeof(T) && (sizeof(T) - nbytes_copied) > sizeof(S) - while nbytes_copied < sizeof(T) && sidx < sizeof(S) - unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) - sidx += 1 - nbytes_copied += 1 - end + nb = min(sizeof(S), sizeof(T) - nbytes_copied) + Intrinsics.aligned_memcpy(sptr, tptr + nbytes_copied, nb, 1) + nbytes_copied += nb a.parent[ind_start + i, tailinds...] = s[] i += 1 - sidx = 0 end # Deal with trailing partial elements if nbytes_copied < sizeof(T) s[] = a.parent[ind_start + i, tailinds...] - while nbytes_copied < sizeof(T) && sidx < sizeof(S) - unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) - sidx += 1 - nbytes_copied += 1 - end + nb = min(sizeof(S), sizeof(T) - nbytes_copied) + Intrinsics.aligned_memcpy(sptr, tptr + nbytes_copied, nb, 1) a.parent[ind_start + i, tailinds...] = s[] end end diff --git a/base/reshapedarray.jl b/base/reshapedarray.jl index ebd2efba68b36d..18b2008f2c4d11 100644 --- a/base/reshapedarray.jl +++ b/base/reshapedarray.jl @@ -222,7 +222,7 @@ end I = ind2sub_rs(axes(A.parent), A.mi, i) _unsafe_getindex_rs(parent(A), I) end -_unsafe_getindex_rs(A, i::Integer) = (@inbounds ret = A[i]; ret) +@inline _unsafe_getindex_rs(A, i::Integer) = (@inbounds ret = A[i]; ret) @inline _unsafe_getindex_rs(A, I) = (@inbounds ret = A[I...]; ret) @inline function setindex!(A::ReshapedArrayLF, val, index::Int) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index eec4cf75f524ba..6c2c0f79735717 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -718,6 +718,33 @@ static jl_cgval_t emit_pointerset(jl_codectx_t &ctx, jl_cgval_t *argv) return mark_julia_type(ctx, thePtr, false, aty); } +static void emit_aligned_memcpy(jl_codectx_t &ctx, jl_cgval_t *argv) +{ + const jl_cgval_t &dst = argv[0]; + const jl_cgval_t &src = argv[1]; + const jl_cgval_t &n = argv[2]; + const jl_cgval_t &align = argv[3]; + + if (n.typ != (jl_value_t*)jl_long_type || + align.typ != (jl_value_t*)jl_long_type || + !jl_is_cpointer_type(dst.typ) || + !jl_is_cpointer_type(src.typ)) { + emit_runtime_call(ctx, aligned_memcpy, argv, 4); + return; + } + + size_t alignment = 1; + if (align.constant) + alignment = jl_unbox_long(align.constant); + + ctx.builder.CreateMemCpy( + ctx.builder.CreateIntToPtr(dst.V, T_pint8), + ctx.builder.CreateIntToPtr(src.V, T_pint8), + n.V, alignment, + false); + +} + static Value *emit_checked_srem_int(jl_codectx_t &ctx, Value *x, Value *den) { Type *t = den->getType(); @@ -937,6 +964,9 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return emit_pointerref(ctx, argv); case pointerset: return emit_pointerset(ctx, argv); + case aligned_memcpy: + emit_aligned_memcpy(ctx, argv); + return mark_julia_type(ctx, NULL, false, jl_void_type); case bitcast: return generic_bitcast(ctx, argv); case trunc_int: diff --git a/src/intrinsics.h b/src/intrinsics.h index 5cc938cc8b37a6..eb3fb5684f51c6 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -91,6 +91,7 @@ /* pointer access */ \ ADD_I(pointerref, 3) \ ADD_I(pointerset, 4) \ + ADD_I(aligned_memcpy, 4) \ /* c interface */ \ ADD_I(cglobal, 2) \ ALIAS(llvmcall, llvmcall) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index 186b5d7b2b985a..0d368ae5651892 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -705,6 +705,7 @@ unsigned jl_intrinsic_nargs(int f); JL_DLLEXPORT jl_value_t *jl_bitcast(jl_value_t *ty, jl_value_t *v); JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align); JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *align, jl_value_t *i); +JL_DLLEXPORT jl_value_t *jl_aligned_memcpy(jl_value_t *dst, jl_value_t *src, jl_value_t *n, jl_value_t *align); JL_DLLEXPORT jl_value_t *jl_cglobal(jl_value_t *v, jl_value_t *ty); JL_DLLEXPORT jl_value_t *jl_cglobal_auto(jl_value_t *v); diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 5caa7bf6f137f0..32b5d69ecb18bc 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -75,6 +75,18 @@ JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t return p; } +// run time version of memcpy intrinsic +JL_DLLEXPORT jl_value_t *jl_aligned_memcpy(jl_value_t *dst, jl_value_t *src, jl_value_t *n, jl_value_t *align) +{ + JL_TYPECHK(pointerref, pointer, dst); + JL_TYPECHK(pointerref, pointer, src) + JL_TYPECHK(pointerref, long, n); + JL_TYPECHK(pointerref, long, align); + memcpy((uint8_t*)jl_unbox_long(dst), (uint8_t*)jl_unbox_long(src), jl_unbox_long(n)); + return jl_nothing; +} + + JL_DLLEXPORT jl_value_t *jl_cglobal(jl_value_t *v, jl_value_t *ty) { JL_TYPECHK(cglobal, type, ty);