diff --git a/NEWS.md b/NEWS.md index 77a3150a36e39..dc7c581d46cae 100644 --- a/NEWS.md +++ b/NEWS.md @@ -243,6 +243,9 @@ This section lists changes that do not have deprecation warnings. * All command line arguments passed via `-e`, `-E`, and `-L` will be executed in the order given on the command line ([#23665]). + * The return type of `reinterpret` has changed to `ReinterpretArray`. `reinterpret` on sparse + arrays has been discontinued. + Library improvements -------------------- @@ -313,6 +316,10 @@ Library improvements * New function `equalto(x)`, which returns a function that compares its argument to `x` using `isequal` ([#23812]). + * `reinterpret` now works on any AbstractArray using the new `ReinterpretArray` type. + This supersedes the old behavior of reinterpret on Arrays. As a result, reinterpreting + arrays with different alignment requirements (removed in 0.6) is once again allowed ([#23750]). + Compiler/Runtime improvements ----------------------------- @@ -511,6 +518,11 @@ Deprecated or removed * `find` functions now operate only on booleans by default. To look for non-zeros, use `x->x!=0` or `!iszero` ([#23120]). + * The ability of `reinterpret` to yield `Array`s of different type than the underlying storage + has been removed. The `reinterpret` function is still available, but now returns a + `ReinterpretArray`. The three argument form of `reinterpret` that implicitly reshapes + has been deprecated ([#23750]). + Command-line option changes --------------------------- diff --git a/base/array.jl b/base/array.jl index 445798fc56f68..bdd2eaad85c41 100644 --- a/base/array.jl +++ b/base/array.jl @@ -218,33 +218,6 @@ original. """ copy(a::T) where {T<:Array} = ccall(:jl_array_copy, Ref{T}, (Any,), a) -function reinterpret(::Type{T}, a::Array{S,1}) where T where S - nel = Int(div(length(a) * sizeof(S), sizeof(T))) - # TODO: maybe check that remainder is zero? - return reinterpret(T, a, (nel,)) -end - -function reinterpret(::Type{T}, a::Array{S}) where T where S - if sizeof(S) != sizeof(T) - throw(ArgumentError("result shape not specified")) - end - reinterpret(T, a, size(a)) -end - -function reinterpret(::Type{T}, a::Array{S}, dims::NTuple{N,Int}) where T where S where N - function throwbits(::Type{S}, ::Type{T}, ::Type{U}) where {S,T,U} - @_noinline_meta - throw(ArgumentError("cannot reinterpret Array{$(S)} to ::Type{Array{$(T)}}, type $(U) is not a bits type")) - end - isbits(T) || throwbits(S, T, T) - isbits(S) || throwbits(S, T, S) - nel = div(length(a) * sizeof(S), sizeof(T)) - if prod(dims) != nel - _throw_dmrsa(dims, nel) - end - ccall(:jl_reshape_array, Array{T,N}, (Any, Any, Any), Array{T,N}, a, dims) -end - # reshaping to same # of dimensions function reshape(a::Array{T,N}, dims::NTuple{N,Int}) where T where N if prod(dims) != length(a) diff --git a/base/deprecated.jl b/base/deprecated.jl index 9b70231113c31..6792411802d48 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1855,6 +1855,10 @@ end # also remove deprecation warnings in find* functions in array.jl, sparse/sparsematrix.jl, # and sparse/sparsevector.jl. 
+# issue #22849 +@deprecate reinterpret(::Type{T}, a::Array{S}, dims::NTuple{N,Int}) where {T, S, N} reshape(reinterpret(T, vec(a)), dims) +@deprecate reinterpret(::Type{T}, a::SparseMatrixCSC{S}, dims::NTuple{N,Int}) where {T, S, N} reinterpret(T, reshape(a, dims)) + # END 0.7 deprecations # BEGIN 1.0 deprecations diff --git a/base/essentials.jl b/base/essentials.jl index 0dd56593c4a25..d762cfdd17bec 100644 --- a/base/essentials.jl +++ b/base/essentials.jl @@ -321,20 +321,12 @@ unsafe_convert(::Type{P}, x::Ptr) where {P<:Ptr} = convert(P, x) reinterpret(type, A) Change the type-interpretation of a block of memory. -For arrays, this constructs an array with the same binary data as the given +For arrays, this constructs a view of the array with the same binary data as the given array, but with the specified element type. For example, `reinterpret(Float32, UInt32(7))` interprets the 4 bytes corresponding to `UInt32(7)` as a [`Float32`](@ref). -!!! warning - - It is not allowed to `reinterpret` an array to an element type with a larger alignment then - the alignment of the array. For a normal `Array`, this is the alignment of its element type. - For a reinterpreted array, this is the alignment of the `Array` it was reinterpreted from. - For example, `reinterpret(UInt32, UInt8[0, 0, 0, 0])` is not allowed but - `reinterpret(UInt32, reinterpret(UInt8, Float32[1.0]))` is allowed. - # Examples ```jldoctest julia> reinterpret(Float32, UInt32(7)) diff --git a/base/inference.jl b/base/inference.jl index 74641816dc8ff..145299a55b2e6 100644 --- a/base/inference.jl +++ b/base/inference.jl @@ -504,6 +504,8 @@ add_tfunc(sdiv_int, 2, 2, math_tfunc, 30) add_tfunc(udiv_int, 2, 2, math_tfunc, 30) add_tfunc(srem_int, 2, 2, math_tfunc, 30) add_tfunc(urem_int, 2, 2, math_tfunc, 30) +add_tfunc(add_ptr, 2, 2, math_tfunc, 1) +add_tfunc(sub_ptr, 2, 2, math_tfunc, 1) add_tfunc(neg_float, 1, 1, math_tfunc, 1) add_tfunc(add_float, 2, 2, math_tfunc, 1) add_tfunc(sub_float, 2, 2, math_tfunc, 1) diff --git a/base/io.jl b/base/io.jl index 4b3a74e30e264..f64273602ebba 100644 --- a/base/io.jl +++ b/base/io.jl @@ -267,15 +267,16 @@ readlines(s=STDIN; chomp::Bool=true) = collect(eachline(s, chomp=chomp)) ## byte-order mark, ntoh & hton ## -let endian_boms = reinterpret(UInt8, UInt32[0x01020304]) +let a = UInt32[0x01020304] + endian_bom = @gc_preserve a unsafe_load(convert(Ptr{UInt8}, pointer(a))) global ntoh, hton, ltoh, htol - if endian_boms == UInt8[1:4;] + if endian_bom == 0x01 ntoh(x) = x hton(x) = x ltoh(x) = bswap(x) htol(x) = bswap(x) const global ENDIAN_BOM = 0x01020304 - elseif endian_boms == UInt8[4:-1:1;] + elseif endian_bom == 0x04 ntoh(x) = bswap(x) hton(x) = bswap(x) ltoh(x) = x diff --git a/base/linalg/factorization.jl b/base/linalg/factorization.jl index 9acaef101ccaa..6ab24cfc42fa9 100644 --- a/base/linalg/factorization.jl +++ b/base/linalg/factorization.jl @@ -56,9 +56,9 @@ Base.isequal(F::T, G::T) where {T<:Factorization} = all(f -> isequal(getfield(F, # With a real lhs and complex rhs with the same precision, we can reinterpret # the complex rhs as a real rhs with twice the number of columns function (\)(F::Factorization{T}, B::VecOrMat{Complex{T}}) where T<:BlasReal - c2r = reshape(transpose(reinterpret(T, B, (2, length(B)))), size(B, 1), 2*size(B, 2)) + c2r = reshape(transpose(reinterpret(T, reshape(B, (1, length(B))))), size(B, 1), 2*size(B, 2)) x = A_ldiv_B!(F, c2r) - return reinterpret(Complex{T}, transpose(reshape(x, div(length(x), 2), 2)), _ret_size(F, B)) + return 
reshape(collect(reinterpret(Complex{T}, transpose(reshape(x, div(length(x), 2), 2)))), _ret_size(F, B)) end for (f1, f2) in ((:\, :A_ldiv_B!), diff --git a/base/linalg/lq.jl b/base/linalg/lq.jl index b878468617cf8..356b3e99b9228 100644 --- a/base/linalg/lq.jl +++ b/base/linalg/lq.jl @@ -267,10 +267,10 @@ end # With a real lhs and complex rhs with the same precision, we can reinterpret # the complex rhs as a real rhs with twice the number of columns function (\)(F::LQ{T}, B::VecOrMat{Complex{T}}) where T<:BlasReal - c2r = reshape(transpose(reinterpret(T, B, (2, length(B)))), size(B, 1), 2*size(B, 2)) + c2r = reshape(transpose(reinterpret(T, reshape(B, (1, length(B))))), size(B, 1), 2*size(B, 2)) x = A_ldiv_B!(F, c2r) - return reinterpret(Complex{T}, transpose(reshape(x, div(length(x), 2), 2)), - isa(B, AbstractVector) ? (size(F,2),) : (size(F,2), size(B,2))) + return reshape(collect(reinterpret(Complex{T}, transpose(reshape(x, div(length(x), 2), 2)))), + isa(B, AbstractVector) ? (size(F,2),) : (size(F,2), size(B,2))) end diff --git a/base/linalg/matmul.jl b/base/linalg/matmul.jl index 0ca518a58fb41..88885be32dcf5 100644 --- a/base/linalg/matmul.jl +++ b/base/linalg/matmul.jl @@ -90,7 +90,7 @@ A_mul_B!(y::StridedVector{T}, A::StridedVecOrMat{T}, x::StridedVector{T}) where for elty in (Float32,Float64) @eval begin function A_mul_B!(y::StridedVector{Complex{$elty}}, A::StridedVecOrMat{Complex{$elty}}, x::StridedVector{$elty}) - Afl = reinterpret($elty,A,(2size(A,1),size(A,2))) + Afl = reinterpret($elty,A) yfl = reinterpret($elty,y) gemv!(yfl,'N',Afl,x) return y @@ -148,8 +148,8 @@ A_mul_B!(C::StridedMatrix{T}, A::StridedVecOrMat{T}, B::StridedVecOrMat{T}) wher for elty in (Float32,Float64) @eval begin function A_mul_B!(C::StridedMatrix{Complex{$elty}}, A::StridedVecOrMat{Complex{$elty}}, B::StridedVecOrMat{$elty}) - Afl = reinterpret($elty, A, (2size(A,1), size(A,2))) - Cfl = reinterpret($elty, C, (2size(C,1), size(C,2))) + Afl = reinterpret($elty, A) + Cfl = reinterpret($elty, C) gemm_wrapper!(Cfl, 'N', 'N', Afl, B) return C end @@ -190,8 +190,8 @@ A_mul_Bt!(C::StridedMatrix{T}, A::StridedVecOrMat{T}, B::StridedVecOrMat{T}) whe for elty in (Float32,Float64) @eval begin function A_mul_Bt!(C::StridedMatrix{Complex{$elty}}, A::StridedVecOrMat{Complex{$elty}}, B::StridedVecOrMat{$elty}) - Afl = reinterpret($elty, A, (2size(A,1), size(A,2))) - Cfl = reinterpret($elty, C, (2size(C,1), size(C,2))) + Afl = reinterpret($elty, A) + Cfl = reinterpret($elty, C) gemm_wrapper!(Cfl, 'N', 'T', Afl, B) return C end diff --git a/base/linalg/qr.jl b/base/linalg/qr.jl index 580516ccbcf49..7d9b0e7f4433d 100644 --- a/base/linalg/qr.jl +++ b/base/linalg/qr.jl @@ -918,7 +918,7 @@ function (\)(A::Union{QR{T},QRCompactWY{T},QRPivoted{T}}, BIn::VecOrMat{Complex{ # |z2|z4| -> |y1|y2|y3|y4| -> |x2|y2| -> |x2|y2|x4|y4| # |x3|y3| # |x4|y4| - B = reshape(transpose(reinterpret(T, BIn, (2, length(BIn)))), size(BIn, 1), 2*size(BIn, 2)) + B = reshape(transpose(reinterpret(T, reshape(BIn, (1, length(BIn))))), size(BIn, 1), 2*size(BIn, 2)) X = A_ldiv_B!(A, _append_zeros(B, T, n)) @@ -926,7 +926,7 @@ function (\)(A::Union{QR{T},QRCompactWY{T},QRPivoted{T}}, BIn::VecOrMat{Complex{ # |z2|z4| <- |y1|y2|y3|y4| <- |x2|y2| <- |x2|y2|x4|y4| # |x3|y3| # |x4|y4| - XX = reinterpret(Complex{T}, transpose(reshape(X, div(length(X), 2), 2)), _ret_size(A, BIn)) + XX = reshape(collect(reinterpret(Complex{T}, transpose(reshape(X, div(length(X), 2), 2)))), _ret_size(A, BIn)) return _cut_B(XX, 1:n) end diff --git a/base/pointer.jl b/base/pointer.jl 
index b2197d21db8c0..2daa2e4a4408a 100644 --- a/base/pointer.jl +++ b/base/pointer.jl @@ -147,8 +147,8 @@ eltype(::Type{Ptr{T}}) where {T} = T isless(x::Ptr, y::Ptr) = isless(UInt(x), UInt(y)) -(x::Ptr, y::Ptr) = UInt(x) - UInt(y) -+(x::Ptr, y::Integer) = oftype(x, (UInt(x) + (y % UInt) % UInt)) --(x::Ptr, y::Integer) = oftype(x, (UInt(x) - (y % UInt) % UInt)) ++(x::Ptr, y::Integer) = oftype(x, Intrinsics.add_ptr(UInt(x), (y % UInt) % UInt)) +-(x::Ptr, y::Integer) = oftype(x, Intrinsics.sub_ptr(UInt(x), (y % UInt) % UInt)) +(x::Integer, y::Ptr) = y + x """ diff --git a/base/random/dSFMT.jl b/base/random/dSFMT.jl index 2061cc54f9741..d4ae974dbe9fc 100644 --- a/base/random/dSFMT.jl +++ b/base/random/dSFMT.jl @@ -104,7 +104,8 @@ function dsfmt_jump(s::DSFMT_state, jp::AbstractString) val = s.val nval = length(val) index = val[nval - 1] - work = zeros(UInt64, JN32 >> 1) + work = zeros(Int32, JN32) + rwork = reinterpret(UInt64, work) dsfmt = Vector{UInt64}(nval >> 1) ccall(:memcpy, Ptr{Void}, (Ptr{UInt64}, Ptr{Int32}, Csize_t), dsfmt, val, (nval - 1) * sizeof(Int32)) @@ -113,17 +114,17 @@ function dsfmt_jump(s::DSFMT_state, jp::AbstractString) for c in jp bits = parse(UInt8,c,16) for j in 1:4 - (bits & 0x01) != 0x00 && dsfmt_jump_add!(work, dsfmt) + (bits & 0x01) != 0x00 && dsfmt_jump_add!(rwork, dsfmt) bits = bits >> 0x01 dsfmt_jump_next_state!(dsfmt) end end - work[end] = index - return DSFMT_state(reinterpret(Int32, work)) + rwork[end] = index + return DSFMT_state(work) end -function dsfmt_jump_add!(dest::Vector{UInt64}, src::Vector{UInt64}) +function dsfmt_jump_add!(dest::AbstractVector{UInt64}, src::Vector{UInt64}) dp = dest[end] >> 1 sp = src[end] >> 1 diff = ((sp - dp + N) % N) diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl new file mode 100644 index 0000000000000..688b67308e57f --- /dev/null +++ b/base/reinterpretarray.jl @@ -0,0 +1,134 @@ +""" +Gives a reinterpreted view (of element type T) of the underlying array (of element type S). +If the size of `T` differs from the size of `S`, the array will be compressed/expanded in +the first dimension. +""" +struct ReinterpretArray{T,N,S,A<:AbstractArray{S, N}} <: AbstractArray{T, N} + parent::A + function reinterpret(::Type{T}, a::A) where {T,N,S,A<:AbstractArray{S, N}} + function throwbits(::Type{S}, ::Type{T}, ::Type{U}) where {S,T,U} + @_noinline_meta + throw(ArgumentError("cannot reinterpret `$(S)` `$(T)`, type `$(U)` is not a bits type")) + end + function throwsize0(::Type{S}, ::Type{T}) + @_noinline_meta + throw(ArgumentError("cannot reinterpret a zero-dimensional `$(S)` array to `$(T)` which is of a different size")) + end + function thrownonint(::Type{S}, ::Type{T}, dim) + @_noinline_meta + throw(ArgumentError(""" + cannot reinterpret an `$(S)` array to `$(T)` whose first dimension has size `$(dim)`. + The resulting array would have non-integral first dimension. + """)) + end + isbits(T) || throwbits(S, T, T) + isbits(S) || throwbits(S, T, S) + (N != 0 || sizeof(T) == sizeof(S)) || throwsize0(S, T) + if N != 0 && sizeof(S) != sizeof(T) + dim = size(a)[1] + rem(dim*sizeof(S),sizeof(T)) == 0 || thrownonint(S, T, dim) + end + new{T, N, S, A}(a) + end +end + +parent(a::ReinterpretArray) = a.parent + +eltype(a::ReinterpretArray{T}) where {T} = T +function size(a::ReinterpretArray{T,N,S} where {N}) where {T,S} + psize = size(a.parent) + size1 = div(psize[1]*sizeof(S), sizeof(T)) + tuple(size1, tail(psize)...) 
+end + +unsafe_convert(::Type{Ptr{T}}, a::ReinterpretArray{T,N,S} where N) where {T,S} = Ptr{T}(unsafe_convert(Ptr{S},a.parent)) + +@inline @propagate_inbounds getindex(a::ReinterpretArray{T,0}) where {T} = reinterpret(T, a.parent[]) +@inline @propagate_inbounds getindex(a::ReinterpretArray) = a[1] + +@inline @propagate_inbounds function getindex(a::ReinterpretArray{T,N,S}, inds::Vararg{Int, N}) where {T,N,S} + # Make sure to match the scalar reinterpret if that is applicable + if sizeof(T) == sizeof(S) && (fieldcount(T) + fieldcount(S)) == 0 + return reinterpret(T, a.parent[inds...]) + else + ind_start, sidx = divrem((inds[1]-1)*sizeof(T), sizeof(S)) + t = Ref{T}() + s = Ref{S}() + @gc_preserve t s begin + tptr = Ptr{UInt8}(unsafe_convert(Ref{T}, t)) + sptr = Ptr{UInt8}(unsafe_convert(Ref{S}, s)) + i = 1 + nbytes_copied = 0 + # This is a bit complicated to deal with partial elements + # at both the start and the end. LLVM will fold as appropriate, + # once it knows the data layout + while nbytes_copied < sizeof(T) + s[] = a.parent[ind_start + i, tail(inds)...] + while nbytes_copied < sizeof(T) && sidx < sizeof(S) + unsafe_store!(tptr, unsafe_load(sptr, sidx + 1), nbytes_copied + 1) + sidx += 1 + nbytes_copied += 1 + end + sidx = 0 + i += 1 + end + end + return t[] + end +end + +@inline @propagate_inbounds setindex!(a::ReinterpretArray{T,0,S} where T, v) where {S} = (a.parent[] = reinterpret(S, v)) +@inline @propagate_inbounds setindex!(a::ReinterpretArray, v) = (a[1] = v) + +@inline @propagate_inbounds function setindex!(a::ReinterpretArray{T,N,S}, v, inds::Vararg{Int, N}) where {T,N,S} + v = convert(T, v)::T + # Make sure to match the scalar reinterpret if that is applicable + if sizeof(T) == sizeof(S) && (fieldcount(T) + fieldcount(S)) == 0 + return setindex!(a.parent, reinterpret(S, v), inds...) + else + ind_start, sidx = divrem((inds[1]-1)*sizeof(T), sizeof(S)) + t = Ref{T}(v) + s = Ref{S}() + @gc_preserve t s begin + tptr = Ptr{UInt8}(unsafe_convert(Ref{T}, t)) + sptr = Ptr{UInt8}(unsafe_convert(Ref{S}, s)) + nbytes_copied = 0 + i = 1 + # Deal with any partial elements at the start. We'll have to copy in the + # element from the original array and overwrite the relevant parts + if sidx != 0 + s[] = a.parent[ind_start + i, tail(inds)...] + while nbytes_copied < sizeof(T) && sidx < sizeof(S) + unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) + sidx += 1 + nbytes_copied += 1 + end + a.parent[ind_start + i, tail(inds)...] = s[] + i += 1 + sidx = 0 + end + # Deal with the main body of elements + while nbytes_copied < sizeof(T) && (sizeof(T) - nbytes_copied) > sizeof(S) + while nbytes_copied < sizeof(T) && sidx < sizeof(S) + unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) + sidx += 1 + nbytes_copied += 1 + end + a.parent[ind_start + i, tail(inds)...] = s[] + i += 1 + sidx = 0 + end + # Deal with trailing partial elements + if nbytes_copied < sizeof(T) + s[] = a.parent[ind_start + i, tail(inds)...] + while nbytes_copied < sizeof(T) && sidx < sizeof(S) + unsafe_store!(sptr, unsafe_load(tptr, nbytes_copied + 1), sidx + 1) + sidx += 1 + nbytes_copied += 1 + end + a.parent[ind_start + i, tail(inds)...] 
= s[] + end + end + end + return a +end diff --git a/base/show.jl b/base/show.jl index b4c284bffd39a..c61dd830340da 100644 --- a/base/show.jl +++ b/base/show.jl @@ -1888,6 +1888,12 @@ function showarg(io::IO, r::ReshapedArray, toplevel) toplevel && print(io, " with eltype ", eltype(r)) end +function showarg(io::IO, r::ReinterpretArray{T}, toplevel) where {T} + print(io, "reinterpret($T, ") + showarg(io, parent(r), false) + print(io, ')') +end + # n-dimensional arrays function show_nd(io::IO, a::AbstractArray, print_matrix, label_slices) limit::Bool = get(io, :limit, false) diff --git a/base/sparse/abstractsparse.jl b/base/sparse/abstractsparse.jl index e17d3be97dbcb..8c84f33b7e18c 100644 --- a/base/sparse/abstractsparse.jl +++ b/base/sparse/abstractsparse.jl @@ -21,3 +21,10 @@ issparse(S::UpperTriangular{<:Any,<:AbstractSparseMatrix}) = true issparse(S::LinAlg.UnitUpperTriangular{<:Any,<:AbstractSparseMatrix}) = true indtype(S::AbstractSparseArray{<:Any,Ti}) where {Ti} = Ti + +function Base.reinterpret(::Type, A::AbstractSparseArray) + error(""" + `reinterpret` on sparse arrays is discontinued. + Try reinterpreting the value itself instead. + """) +end diff --git a/base/sparse/sparse.jl b/base/sparse/sparse.jl index 0c646964b2852..e1ef23c364c91 100644 --- a/base/sparse/sparse.jl +++ b/base/sparse/sparse.jl @@ -25,7 +25,7 @@ import Base: @get!, acos, acosd, acot, acotd, acsch, asech, asin, asind, asinh, broadcast, ceil, complex, cond, conj, convert, copy, copy!, adjoint, diagm, exp, expm1, factorize, find, findmax, findmin, findnz, float, full, getindex, vcat, hcat, hvcat, cat, imag, indmax, ishermitian, kron, length, log, log1p, max, min, - maximum, minimum, norm, one, promote_eltype, real, reinterpret, reshape, rot180, + maximum, minimum, norm, one, promote_eltype, real, reshape, rot180, rotl90, rotr90, round, scale!, setindex!, similar, size, transpose, tril, triu, vec, permute!, map, map! 
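The following is not part of the patch: a minimal sketch of the post-#23750 semantics summarized in the NEWS entries and the `reinterpret` docstring above, assuming a Julia build with this change applied.

```julia
# Assumes a Julia build with this patch applied; illustrative only.
a = Float32[1.0, 2.0, 3.0, 4.0]

v = reinterpret(UInt32, a)       # same-size reinterpret: a Base.ReinterpretArray view, not a copy
v[1]                             # 0x3f800000, the bit pattern of 1.0f0
v[1] = 0x40000000                # writes through the view: a[1] is now 2.0f0

r = reinterpret(UInt8, a)        # size-changing reinterpret: the first dimension is rescaled
size(r)                          # (16,)
reinterpret(UInt32, UInt8[0, 0, 0, 0])   # allowed again (was an alignment error in 0.6)

# The old three-argument form now hits the deprecation added in base/deprecated.jl:
reshape(reinterpret(UInt8, vec(a)), (4, 4))   # replacement for reinterpret(UInt8, a, (4, 4))
```

As the `base/sparse/abstractsparse.jl` hunk above shows, `reinterpret` on sparse arrays now raises an error instead of rebuilding a `SparseMatrixCSC`.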
diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl index 09255dacdaefa..589047a9fe893 100644 --- a/base/sparse/sparsematrix.jl +++ b/base/sparse/sparsematrix.jl @@ -210,18 +210,7 @@ function Base.show(io::IOContext, S::SparseMatrixCSC) end end -## Reinterpret and Reshape - -function reinterpret(::Type{T}, a::SparseMatrixCSC{Tv}) where {T,Tv} - if sizeof(T) != sizeof(Tv) - throw(ArgumentError("SparseMatrixCSC reinterpret is only supported for element types of the same size")) - end - mA, nA = size(a) - colptr = copy(a.colptr) - rowval = copy(a.rowval) - nzval = reinterpret(T, a.nzval) - return SparseMatrixCSC(mA, nA, colptr, rowval, nzval) -end +## Reshape function sparse_compute_reshaped_colptr_and_rowval(colptrS::Vector{Ti}, rowvalS::Vector{Ti}, mS::Int, nS::Int, colptrA::Vector{Ti}, @@ -257,25 +246,6 @@ function sparse_compute_reshaped_colptr_and_rowval(colptrS::Vector{Ti}, rowvalS: end end -function reinterpret(::Type{T}, a::SparseMatrixCSC{Tv,Ti}, dims::NTuple{N,Int}) where {T,Tv,Ti,N} - if sizeof(T) != sizeof(Tv) - throw(ArgumentError("SparseMatrixCSC reinterpret is only supported for element types of the same size")) - end - if prod(dims) != length(a) - throw(DimensionMismatch("new dimensions $(dims) must be consistent with array size $(length(a))")) - end - mS,nS = dims - mA,nA = size(a) - numnz = nnz(a) - colptr = Vector{Ti}(nS+1) - rowval = similar(a.rowval) - nzval = reinterpret(T, a.nzval) - - sparse_compute_reshaped_colptr_and_rowval(colptr, rowval, mS, nS, a.colptr, a.rowval, mA, nA) - - return SparseMatrixCSC(mS, nS, colptr, rowval, nzval) -end - function copy(ra::ReshapedArray{<:Any,2,<:SparseMatrixCSC}) mS,nS = size(ra) a = parent(ra) diff --git a/base/sparse/spqr.jl b/base/sparse/spqr.jl index d752712027760..e1bf8119ec133 100644 --- a/base/sparse/spqr.jl +++ b/base/sparse/spqr.jl @@ -341,14 +341,14 @@ function (\)(F::QRSparse{Float64}, B::VecOrMat{Complex{Float64}}) # |z2|z4| -> |y1|y2|y3|y4| -> |x2|y2| -> |x2|y2|x4|y4| # |x3|y3| # |x4|y4| - c2r = reshape(transpose(reinterpret(Float64, B, (2, length(B)))), size(B, 1), 2*size(B, 2)) + c2r = reshape(transpose(reinterpret(Float64, reshape(B, (1, length(B))))), size(B, 1), 2*size(B, 2)) x = F\c2r # |z1|z3| reinterpret |x1|x2|x3|x4| transpose |x1|y1| reshape |x1|y1|x3|y3| # |z2|z4| <- |y1|y2|y3|y4| <- |x2|y2| <- |x2|y2|x4|y4| # |x3|y3| # |x4|y4| - return reinterpret(Complex{Float64}, transpose(reshape(x, (length(x) >> 1), 2)), _ret_size(F, B)) + return collect(reshape(reinterpret(Complex{Float64}, transpose(reshape(x, (length(x) >> 1), 2))), _ret_size(F, B))) end function _ldiv_basic(F::QRSparse, B::StridedVecOrMat) diff --git a/base/sysimg.jl b/base/sysimg.jl index 93ae630bd4034..16ba7ca29029d 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -121,6 +121,7 @@ include("indices.jl") include("array.jl") include("abstractarray.jl") include("subarray.jl") +include("reinterpretarray.jl") # Array convenience converting constructors Array{T}(m::Integer) where {T} = Array{T,1}(Int(m)) @@ -182,15 +183,16 @@ using .Iterators: Flatten, product # for generators # Definition of StridedArray StridedReshapedArray{T,N,A<:Union{DenseArray,FastContiguousSubArray}} = ReshapedArray{T,N,A} +StridedReinterpretArray{T,N,A<:Union{DenseArray,FastContiguousSubArray}} = ReinterpretArray{T,N,S,A} where S StridedArray{T,N,A<:Union{DenseArray,StridedReshapedArray}, I<:Tuple{Vararg{Union{RangeIndex, AbstractCartesianIndex}}}} = - Union{DenseArray{T,N}, SubArray{T,N,A,I}, StridedReshapedArray{T,N}} + Union{DenseArray{T,N}, 
SubArray{T,N,A,I}, StridedReshapedArray{T,N}, StridedReinterpretArray{T,N,A}} StridedVector{T,A<:Union{DenseArray,StridedReshapedArray}, I<:Tuple{Vararg{Union{RangeIndex, AbstractCartesianIndex}}}} = - Union{DenseArray{T,1}, SubArray{T,1,A,I}, StridedReshapedArray{T,1}} + Union{DenseArray{T,1}, SubArray{T,1,A,I}, StridedReshapedArray{T,1}, StridedReinterpretArray{T,1,A}} StridedMatrix{T,A<:Union{DenseArray,StridedReshapedArray}, I<:Tuple{Vararg{Union{RangeIndex, AbstractCartesianIndex}}}} = - Union{DenseArray{T,2}, SubArray{T,2,A,I}, StridedReshapedArray{T,2}} + Union{DenseArray{T,2}, SubArray{T,2,A,I}, StridedReshapedArray{T,2}, StridedReinterpretArray{T,2,A}} StridedVecOrMat{T} = Union{StridedVector{T}, StridedMatrix{T}} # For OS specific stuff diff --git a/deps/llvm.mk b/deps/llvm.mk index 7f44868aba359..07b77b15298e1 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -460,6 +460,7 @@ $(eval $(call LLVM_PATCH,llvm-D32593)) $(eval $(call LLVM_PATCH,llvm-D33179)) $(eval $(call LLVM_PATCH,llvm-PR29010-i386-xmm)) # Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-3.9.0-D37576-NVPTX-sm_70)) # NVPTX, Remove for 6.0 +$(eval $(call LLVM_PATCH,llvm-D37939-Mem2Reg-Also-handle-memcpy)) else ifeq ($(LLVM_VER_SHORT),4.0) # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365 $(eval $(call LLVM_PATCH,llvm-4.0.0_threads)) diff --git a/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch new file mode 100644 index 0000000000000..b8753b0439ba0 --- /dev/null +++ b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch @@ -0,0 +1,365 @@ +From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001 +From: Keno Fischer +Date: Fri, 15 Sep 2017 18:30:59 -0400 +Subject: [PATCH] [Mem2Reg] Also handle memcpy + +Summary: +In julia, when we know we're moving data between two memory locations, +we always emit that as a memcpy rather than a load/store pair. However, +this can give worse optimization results in certain cases because some +optimizations that can handle load/store pairs cannot handle memcpys. +Mem2reg is one of these optimizations. This patch adds rudamentary +support for mem2reg for recognizing memcpys that cover the whole alloca +we're promoting. While several more sophisticated passes (SROA, GVN) +can get similar optimizations, it is preferable to have these kinds +of cases caught early to expose optimization opportunities before +getting to these later passes. The approach taken here is to split +the memcpy into a load/store pair early (after legality analysis) +and retain the rest of the analysis only on loads/stores. It would +be possible of course to leave the memcpy as is and generate the +left over load or store only on demand. However, that would entail +a significantly larger patch for unclear benefit. 
+ +Reviewers: chandlerc, dberlin + +Subscribers: llvm-commits + +Differential Revision: https://reviews.llvm.org/D37939 +--- + lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++--- + test/Transforms/Mem2Reg/memcpy.ll | 101 ++++++++++++++ + 2 files changed, 251 insertions(+), 16 deletions(-) + create mode 100644 test/Transforms/Mem2Reg/memcpy.ll + +diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +index ac28f59..b08a0a1 100644 +--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp ++++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); + STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); + STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); + ++static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) { ++ // Punt if this alloca is an array allocation ++ if (AI->isArrayAllocation()) ++ return false; ++ if (MCI->isVolatile()) ++ return false; ++ Value *Length = MCI->getLength(); ++ if (!isa(Length)) ++ return false; ++ // Anything less than the full alloca, we leave for SROA ++ const DataLayout &DL = AI->getModule()->getDataLayout(); ++ size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType()); ++ if (cast(Length)->getZExtValue() != AIElSize) ++ return false; ++ // If the other argument is also an alloca, we need to be sure that either ++ // the types are bitcastable, or the other alloca is not eligible for ++ // promotion (e.g. because the memcpy is for less than the whole size of ++ // that alloca), otherwise we risk turning an allocatable alloca into a ++ // non-allocatable one when splitting the memcpy. ++ AllocaInst *OtherAI = dyn_cast( ++ AI == MCI->getSource() ? MCI->getDest() : MCI->getSource()); ++ if (OtherAI) { ++ if (!CastInst::isBitCastable(AI->getAllocatedType(), ++ OtherAI->getAllocatedType()) && ++ DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize) ++ return false; ++ } ++ return true; ++} ++ ++/// Look at the result of a bitcast and see if it's only used by lifetime ++/// intrinsics or splittable memcpys. This is needed, because IRBuilder ++/// will always insert a bitcast to i8* for these intrinsics. ++static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) { ++ for (const User *U : V->users()) { ++ const IntrinsicInst *II = dyn_cast(U); ++ if (!II) ++ return false; ++ ++ if (isa(II)) { ++ if (!isSplittableMemCpy(cast(II), AI)) ++ return false; ++ continue; ++ } ++ ++ if (II->getIntrinsicID() != Intrinsic::lifetime_start && ++ II->getIntrinsicID() != Intrinsic::lifetime_end) ++ return false; ++ } ++ return true; ++} ++ + bool llvm::isAllocaPromotable(const AllocaInst *AI) { + // FIXME: If the memory unit is of pointer or integer type, we can permit + // assignments to subsections of the memory unit. +@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { + // not have any meaning for a local alloca. 
+ if (SI->isVolatile()) + return false; ++ } else if (const MemCpyInst *MCI = dyn_cast(U)) { ++ if (!isSplittableMemCpy(MCI, AI)) ++ return false; + } else if (const IntrinsicInst *II = dyn_cast(U)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) +@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { + } else if (const BitCastInst *BCI = dyn_cast(U)) { + if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) + return false; +- if (!onlyUsedByLifetimeMarkers(BCI)) ++ if (!onlyHasCanonicalizableUsers(AI, BCI)) + return false; + } else if (const GetElementPtrInst *GEPI = dyn_cast(U)) { + if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) +@@ -181,7 +235,13 @@ public: + /// This code only looks at accesses to allocas. + static bool isInterestingInstruction(const Instruction *I) { ++ if (isa(I)) { ++ const MemCpyInst *MCI = cast(I); ++ return isa(MCI->getSource()) || ++ isa(MCI->getDest()); ++ } else { + return (isa(I) && isa(I->getOperand(0))) || + (isa(I) && isa(I->getOperand(1))); + } ++ } + + /// Get or calculate the index of the specified instruction. +@@ -208,6 +264,25 @@ public: + return It->second; + } + ++ // When we split a memcpy intrinsic, we need to update the numbering in this ++ // struct. To make sure the relative ordering remains the same, we give both ++ // the LI and the SI the number that the MCI used to have (if they are both ++ // interesting). This means that they will have equal numbers, which usually ++ // can't happen. However, since they can never reference the same alloca ++ // (since memcpy operands may not overlap), this is fine, because we will ++ // never compare instruction indices for instructions that operate on distinct ++ // allocas. ++ void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) { ++ DenseMap::iterator It = ++ InstNumbers.find(MCI); ++ if (It == InstNumbers.end()) ++ return; ++ unsigned MemCpyNumber = It->second; ++ InstNumbers[LI] = MemCpyNumber; ++ InstNumbers[SI] = MemCpyNumber; ++ deleteValue(MCI); ++ } ++ + void deleteValue(const Instruction *I) { InstNumbers.erase(I); } + + void clear() { InstNumbers.clear(); } +@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { + AC->registerAssumption(CI); + } + +-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { +- // Knowing that this alloca is promotable, we know that it's safe to kill all +- // instructions except for load and store. ++/// Split a memcpy instruction into the corresponding load/store. It is a little ++/// more complicated than one might imagine, because we need to deal with the ++/// fact that the side of the copy we're not currently processing might also ++/// be a promotable alloca. We need to be careful to not break the promotable ++/// predicate for that other alloca (if any). 
++static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI, ++ AllocaInst *AI) { ++ AAMDNodes AA; ++ MCI->getAAMetadata(AA); ++ Value *MCISrc = MCI->getSource(); ++ Type *LoadType = AI->getAllocatedType(); ++ AllocaInst *SrcAI = dyn_cast(MCISrc); ++ if (SrcAI && SrcAI->getType() != AI->getType()) { ++ if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType)) ++ LoadType = SrcAI->getAllocatedType(); ++ } ++ if (cast(MCISrc->getType())->getElementType() != LoadType) ++ MCISrc = CastInst::Create( ++ Instruction::BitCast, MCISrc, ++ LoadType->getPointerTo( ++ cast(MCISrc->getType())->getAddressSpace()), ++ "", MCI); ++ // This might add to the end of the use list, but that's fine. At worst, ++ // we'd not visit the instructions we insert here, but we don't care ++ // about them in this loop anyway. ++ LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(), ++ MCI->getAlignment(), MCI); ++ Value *Val = LI; ++ Value *MCIDest = MCI->getDest(); ++ AllocaInst *DestAI = dyn_cast(MCIDest); ++ Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType(); ++ if (LI->getType() != DestElTy && ++ CastInst::isBitCastable(LI->getType(), DestElTy)) ++ Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI); ++ if (cast(MCIDest->getType())->getElementType() != Val->getType()) ++ MCIDest = CastInst::Create( ++ Instruction::BitCast, MCIDest, ++ Val->getType()->getPointerTo( ++ cast(MCIDest->getType())->getAddressSpace()), ++ "", MCI); ++ StoreInst *SI = ++ new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI); ++ LI->setAAMetadata(AA); ++ SI->setAAMetadata(AA); ++ LBI.splitMemCpy(MCI, LI, SI); ++ MCI->eraseFromParent(); ++} ++ ++static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) { ++ // Knowing that this alloca is promotable, we know that it's safe to split ++ // MTIs into load/store and to kill all other instructions except for ++ // load and store. + + for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) { + Instruction *I = cast(*UI); +@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { + if (isa(I) || isa(I)) + continue; + ++ if (isa(I)) { ++ MemCpyInst *MCI = cast(I); ++ doMemCpySplit(LBI, MCI, AI); ++ continue; ++ } ++ + if (!I->getType()->isVoidTy()) { +- // The only users of this bitcast/GEP instruction are lifetime intrinsics. +- // Follow the use/def chain to erase them now instead of leaving it for +- // dead code elimination later. ++ // The only users of this bitcast/GEP instruction are lifetime/memcpy ++ // intrinsics. Split memcpys and delete lifetime intrinsics. + for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) { + Instruction *Inst = cast(*UUI); + ++UUI; +- Inst->eraseFromParent(); ++ if (isa(Inst)) { ++ doMemCpySplit(LBI, cast(Inst), AI); ++ } else { ++ // Must be a lifetime intrinsic ++ Inst->eraseFromParent(); ++ } + } + } + I->eraseFromParent(); +@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() { + assert(AI->getParent()->getParent() == &F && + "All allocas should be in the same function, which is same as DF!"); + +- removeLifetimeIntrinsicUsers(AI); ++ canonicalizeUsers(LBI, AI); + + if (AI->use_empty()) { + // If there are no uses of the alloca, just delete it now. 
+diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll +new file mode 100644 +index 0000000..fbc4096 +--- /dev/null ++++ b/test/Transforms/Mem2Reg/memcpy.ll +@@ -0,0 +1,101 @@ ++; RUN: opt < %s -mem2reg -S | FileCheck %s ++ ++target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ++ ++declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1) ++declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1) ++declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1) ++declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1) ++ ++define i128 @test_cpy_different(i64) { ++; CHECK-LABEL: @test_cpy_different ++; CHECK-NOT: alloca i64 ++; CHECK: store i64 %0 ++ %a = alloca i64 ++ %b = alloca i128 ++ store i128 0, i128 *%b ++ store i64 %0, i64 *%a ++ call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0) ++ %loaded = load i128, i128 *%b ++ ret i128 %loaded ++} ++ ++define i64 @test_cpy_same(i64) { ++; CHECK-LABEL: @test_cpy_same ++; CHECK-NOT: alloca ++; CHECK: ret i64 %0 ++ %a = alloca i64 ++ %b = alloca i64 ++ store i64 %0, i64 *%a ++ call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0) ++ %loaded = load i64, i64 *%b ++ ret i64 %loaded ++} ++ ++define double @test_cpy_different_type(i64) { ++; CHECK-LABEL: @test_cpy_different_type ++; CHECK-NOT: alloca ++; CHECK: bitcast i64 %0 to double ++ %a = alloca i64 ++ %b = alloca double ++ store i64 %0, i64 *%a ++ call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0) ++ %loaded = load double, double *%b ++ ret double %loaded ++} ++ ++define i128 @test_cpy_differenti8(i64) { ++; CHECK-LABEL: @test_cpy_differenti8 ++; CHECK-NOT: alloca i64 ++; CHECK: store i64 %0 ++ %a = alloca i64 ++ %b = alloca i128 ++ store i128 0, i128 *%b ++ store i64 %0, i64 *%a ++ %acast = bitcast i64* %a to i8* ++ %bcast = bitcast i128* %b to i8* ++ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0) ++ %loaded = load i128, i128 *%b ++ ret i128 %loaded ++} ++ ++define i64 @test_cpy_samei8(i64) { ++; CHECK-LABEL: @test_cpy_samei8 ++; CHECK-NOT: alloca ++; CHECK: ret i64 %0 ++ %a = alloca i64 ++ %b = alloca i64 ++ store i64 %0, i64 *%a ++ %acast = bitcast i64* %a to i8* ++ %bcast = bitcast i64* %b to i8* ++ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0) ++ %loaded = load i64, i64 *%b ++ ret i64 %loaded ++} ++ ++define double @test_cpy_different_typei8(i64) { ++; CHECK-LABEL: @test_cpy_different_typei8 ++; CHECK-NOT: alloca ++; CHECK: bitcast i64 %0 to double ++ %a = alloca i64 ++ %b = alloca double ++ store i64 %0, i64 *%a ++ %acast = bitcast i64* %a to i8* ++ %bcast = bitcast double* %b to i8* ++ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0) ++ %loaded = load double, double *%b ++ ret double %loaded ++} ++ ++define i64 @test_cpy_differenti8_reverse(i128) { ++; CHECK-LABEL: @test_cpy_differenti8_reverse ++; CHECK-NOT: alloca i64 ++ %a = alloca i64 ++ %b = alloca i128 ++ store i128 %0, i128 *%b ++ %acast = bitcast i64* %a to i8* ++ %bcast = bitcast i128* %b to i8* ++ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0) ++ %loaded = load i64, i64 *%a ++ ret i64 %loaded ++} +-- +2.9.3 + diff --git a/src/array.c b/src/array.c index 0810ab9348958..e519415a6cca0 100644 --- a/src/array.c +++ b/src/array.c @@ -180,6 +180,7 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data, size_t ndims = 
jl_nfields(_dims); assert(is_ntuple_long(_dims)); size_t *dims = (size_t*)_dims; + assert(jl_types_equal(jl_tparam0(jl_typeof(data)), jl_tparam0(atype))); int ndimwords = jl_array_ndimwords(ndims); int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords * sizeof(size_t) + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index af9f99826528a..c6f2006974043 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -235,7 +235,7 @@ static Value *emit_pointer_from_objref(jl_codectx_t &ctx, Value *V) #else Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); #endif - return Call; + return ctx.builder.CreatePtrToInt(Call, T_size); } // --- emitting pointers directly into code --- @@ -368,6 +368,12 @@ static Value *emit_bitcast(jl_codectx_t &ctx, Value *v, Type *jl_value) } } +static Value *maybe_bitcast(jl_codectx_t &ctx, Value *V, Type *to) { + if (to != V->getType()) + return emit_bitcast(ctx, V, to); + return V; +} + static Value *julia_binding_gv(jl_codectx_t &ctx, Value *bv) { Value *offset = ConstantInt::get(T_size, offsetof(jl_binding_t, value) / sizeof(size_t)); @@ -1250,8 +1256,8 @@ static void typed_store(jl_codectx_t &ctx, } else { data = ptr; } - Instruction *store = ctx.builder.CreateAlignedStore(r, ctx.builder.CreateGEP(data, - idx_0based), isboxed ? alignment : julia_alignment(jltype, alignment)); + Instruction *store = ctx.builder.CreateAlignedStore(r, idx_0based ? ctx.builder.CreateGEP(data, + idx_0based) : data, isboxed ? alignment : julia_alignment(jltype, alignment)); if (tbaa) tbaa_decorate(tbaa, store); } @@ -1267,7 +1273,7 @@ static Value *julia_bool(jl_codectx_t &ctx, Value *cond) // --- accessing the representations of built-in data types --- static Constant *julia_const_to_llvm(jl_value_t *e); -static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype = T_ppjlvalue) +static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x) { Value *data = x.V; if (x.constant) { @@ -1279,9 +1285,7 @@ static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype data = boxed(ctx, x); } } - if (astype && data->getType() != astype) - data = emit_bitcast(ctx, data, astype); - return decay_derived(data); + return data; } static void emit_memcpy_llvm(jl_codectx_t &ctx, Value *dst, Value *src, @@ -1342,7 +1346,7 @@ static Value *get_value_ptr(jl_codectx_t&, Value *ptr) static Value *get_value_ptr(jl_codectx_t &ctx, const jl_cgval_t &v) { - return data_pointer(ctx, v, nullptr); + return data_pointer(ctx, v); } template @@ -1372,7 +1376,9 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx, Value *fld = tbaa_decorate(strct.tbaa, maybe_mark_load_dereferenceable( ctx.builder.CreateLoad( - ctx.builder.CreateBitCast(ctx.builder.CreateGEP(decay_derived(data_pointer(ctx, strct)), idx), + ctx.builder.CreateBitCast( + ctx.builder.CreateGEP(decay_derived( + emit_bitcast(ctx, data_pointer(ctx, strct), T_pprjlvalue)), idx), PointerType::get(T_prjlvalue, AddressSpace::Derived))), maybe_null, minimum_field_size)); if (maybe_null) @@ -1384,11 +1390,11 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx, assert(nfields > 0); // nf == 0 trapped by all_pointers case jl_value_t *jt = jl_field_type(stt, 0); idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds); - Value *ptr = data_pointer(ctx, strct); + Value *ptr = decay_derived(data_pointer(ctx, strct)); if (!stt->mutabl) { // just compute the pointer and let user load it when necessary Type *fty = 
julia_type_to_llvm(jt); - Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(ptr), PointerType::get(fty,0)), idx); + Value *addr = ctx.builder.CreateGEP(emit_bitcast(ctx, ptr, PointerType::get(fty,0)), idx); *ret = mark_julia_slot(addr, jt, NULL, strct.tbaa); ret->isimmutable = strct.isimmutable; return true; @@ -1441,28 +1447,34 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st return ghostValue(jfty); Value *fldv = NULL; if (strct.ispointer()) { - Value *addr; + Value *addr = decay_derived(data_pointer(ctx, strct)); bool isboxed; Type *lt = julia_type_to_llvm((jl_value_t*)jt, &isboxed); if (isboxed) { - Value *ptr = decay_derived(data_pointer(ctx, strct, T_pint8)); - Value *llvm_idx = ConstantInt::get(T_size, jl_field_offset(jt, idx)); - addr = ctx.builder.CreateGEP(ptr, llvm_idx); + size_t byte_offset = jl_field_offset(jt, idx); + // byte_offset == 0 is an important special case here, e.g. + // for single field wrapper types. Introducing the bitcast + // can pessimize mem2reg + if (byte_offset > 0) { + addr = ctx.builder.CreateGEP( + emit_bitcast(ctx, addr, T_pint8), + ConstantInt::get(T_size, byte_offset)); + } } else { if (VectorType *vlt = dyn_cast(lt)) { // doesn't have the struct wrapper, so this must have been a VecElement // cast to the element type so that it can be addressed with GEP lt = vlt->getElementType(); - Value *ptr = data_pointer(ctx, strct, lt->getPointerTo()); + Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo()); Value *llvm_idx = ConstantInt::get(T_size, idx); addr = ctx.builder.CreateGEP(lt, ptr, llvm_idx); } else if (lt->isSingleValueType()) { - addr = data_pointer(ctx, strct, lt->getPointerTo()); + addr = emit_bitcast(ctx, addr, lt->getPointerTo()); } else { - Value *ptr = data_pointer(ctx, strct, lt->getPointerTo()); + Value *ptr = emit_bitcast(ctx, addr, lt->getPointerTo()); addr = ctx.builder.CreateStructGEP(lt, ptr, idx); } } @@ -1503,7 +1515,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st fieldval.isimmutable = strct.isimmutable; return fieldval; } - return typed_load(ctx, addr, ConstantInt::get(T_size, 0), jfty, strct.tbaa, true, align); + return typed_load(ctx, addr, NULL, jfty, strct.tbaa, true, align); } else if (isa(strct.V)) { return jl_cgval_t(); @@ -2152,13 +2164,15 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src emit_unbox(ctx, store_ty, src, typ, dest, isVolatile); } else { - Value *src_ptr = data_pointer(ctx, src, T_pint8); - if (dest->getType() != T_pint8) - dest = emit_bitcast(ctx, dest, T_pint8); - if (skip) // copy dest -> dest to simulate an undef value / conditional copy - src_ptr = ctx.builder.CreateSelect(skip, dest, src_ptr); + Value *src_ptr = data_pointer(ctx, src); + unsigned nb = jl_datatype_size(typ); unsigned alignment = julia_alignment(typ, 0); - emit_memcpy(ctx, dest, src_ptr, jl_datatype_size(typ), alignment, isVolatile, tbaa); + Value *nbytes = ConstantInt::get(T_size, nb); + if (skip) // copy dest -> dest to simulate an undef value / conditional copy + nbytes = ctx.builder.CreateSelect(skip, + ConstantInt::get(T_size, 0), + nbytes); + emit_memcpy(ctx, dest, src_ptr, nbytes, alignment, isVolatile, tbaa); } } } @@ -2166,9 +2180,8 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, const jl_cgval_t &src Value *tindex = ctx.builder.CreateAnd(src.TIndex, ConstantInt::get(T_int8, 0x7f)); if (skip) tindex = ctx.builder.CreateSelect(skip, ConstantInt::get(T_int8, 0), tindex); - Value *src_ptr = 
data_pointer(ctx, src, T_pint8); - if (dest->getType() != T_pint8) - dest = emit_bitcast(ctx, dest, T_pint8); + Value *src_ptr = maybe_bitcast(ctx, data_pointer(ctx, src), T_pint8); + dest = maybe_bitcast(ctx, dest, T_pint8); BasicBlock *defaultBB = BasicBlock::Create(jl_LLVMContext, "union_move_skip", ctx.f); SwitchInst *switchInst = ctx.builder.CreateSwitch(tindex, defaultBB); BasicBlock *postBB = BasicBlock::Create(jl_LLVMContext, "post_union_move", ctx.f); @@ -2288,8 +2301,13 @@ static void emit_setfield(jl_codectx_t &ctx, { if (sty->mutabl || !checked) { assert(strct.ispointer()); - Value *addr = ctx.builder.CreateGEP(data_pointer(ctx, strct, T_pint8), - ConstantInt::get(T_size, jl_field_offset(sty, idx0))); + size_t byte_offset = jl_field_offset(sty, idx0); + Value *addr = data_pointer(ctx, strct); + if (byte_offset > 0) { + addr = ctx.builder.CreateGEP( + emit_bitcast(ctx, decay_derived(addr), T_pint8), + ConstantInt::get(T_size, byte_offset)); + } jl_value_t *jfty = jl_svecref(sty->types, idx0); if (jl_field_isptr(sty, idx0)) { Value *r = maybe_decay_untracked(boxed(ctx, rhs)); // don't need a temporary gcroot since it'll be rooted by strct @@ -2306,7 +2324,7 @@ static void emit_setfield(jl_codectx_t &ctx, return; Value *tindex = compute_tindex_unboxed(ctx, rhs_union, jfty); tindex = ctx.builder.CreateNUWSub(tindex, ConstantInt::get(T_int8, 1)); - Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, addr, T_pint8), ConstantInt::get(T_size, fsz - 1)); + Value *ptindex = ctx.builder.CreateGEP(T_int8, emit_bitcast(ctx, decay_derived(addr), T_pint8), ConstantInt::get(T_size, fsz - 1)); ctx.builder.CreateStore(tindex, ptindex); // copy data if (!rhs.isghost) { @@ -2315,8 +2333,9 @@ static void emit_setfield(jl_codectx_t &ctx, } else { unsigned align = jl_field_align(sty, idx0); - typed_store(ctx, addr, ConstantInt::get(T_size, 0), rhs, jfty, - strct.tbaa, data_pointer(ctx, strct, T_pjlvalue), align); + typed_store(ctx, addr, NULL, rhs, jfty, + strct.tbaa, maybe_bitcast(ctx, + data_pointer(ctx, strct), T_pjlvalue), align); } } else { @@ -2416,12 +2435,13 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg Value *strct = emit_allocobj(ctx, jl_datatype_size(sty), literal_pointer_val(ctx, (jl_value_t*)ty)); jl_cgval_t strctinfo = mark_julia_type(ctx, strct, true, ty); + strct = decay_derived(strct); for (size_t i = 0; i < nf; i++) { if (jl_field_isptr(sty, i)) { tbaa_decorate(strctinfo.tbaa, ctx.builder.CreateStore( ConstantPointerNull::get(cast(T_prjlvalue)), emit_bitcast(ctx, - ctx.builder.CreateGEP(emit_bitcast(ctx, decay_derived(strct), T_pint8), + ctx.builder.CreateGEP(emit_bitcast(ctx, strct, T_pint8), ConstantInt::get(T_size, jl_field_offset(sty, i))), T_pprjlvalue))); } diff --git a/src/codegen.cpp b/src/codegen.cpp index e1a3f99e146d3..7f13e04fcec9d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2135,16 +2135,16 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, const jl_cgval_t &arg1, const if (sz > 512 && !((jl_datatype_t*)arg1.typ)->layout->haspadding) { Value *answer = ctx.builder.CreateCall(prepare_call(memcmp_derived_func), { - data_pointer(ctx, arg1, T_pint8), - data_pointer(ctx, arg2, T_pint8), + maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), T_pint8), + maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), T_pint8), ConstantInt::get(T_size, sz) }); return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(T_int32, 0)); } else { Type *atp = at->getPointerTo(); - Value *varg1 = data_pointer(ctx, arg1, 
atp); - Value *varg2 = data_pointer(ctx, arg2, atp); + Value *varg1 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg1)), atp); + Value *varg2 = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, arg2)), atp); jl_svec_t *types = ((jl_datatype_t*)arg1.typ)->types; Value *answer = ConstantInt::get(T_int1, 1); for (size_t i = 0, l = jl_svec_len(types); i < l; i++) { @@ -2645,7 +2645,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, emit_datatype_nfields(ctx, emit_typeof_boxed(ctx, obj)), jl_true); } - Value *ptr = data_pointer(ctx, obj); + Value *ptr = decay_derived(data_pointer(ctx, obj)); *ret = typed_load(ctx, ptr, vidx, jt, obj.tbaa, false); return true; } @@ -2836,7 +2836,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, } else { size_t offs = jl_field_offset(stt, fieldidx); - Value *ptr = data_pointer(ctx, obj, T_pint8); + Value *ptr = emit_bitcast(ctx, decay_derived(data_pointer(ctx, obj)), T_pint8); Value *llvm_idx = ConstantInt::get(T_size, offs); Value *addr = ctx.builder.CreateGEP(ptr, llvm_idx); // emit this using the same type as emit_getfield_knownidx @@ -2926,7 +2926,8 @@ static jl_cgval_t emit_call_function_object(jl_method_instance_t *li, jl_llvm_fu // can lazy load on demand, no copy needed assert(at == PointerType::get(et, AddressSpace::Derived)); assert(arg.ispointer()); - argvals[idx] = decay_derived(data_pointer(ctx, arg, at)); + argvals[idx] = decay_derived(maybe_bitcast(ctx, + data_pointer(ctx, arg), at)); } else { assert(at == et); @@ -3433,9 +3434,15 @@ static void emit_vi_assignment_unboxed(jl_codectx_t &ctx, jl_varinfo_t &vi, Valu tbaa = NULL; if (vi.pTIndex == NULL) { assert(jl_is_leaf_type(vi.value.typ)); - Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ)); - emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes, - jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa); + // Sometimes we can get into situations where the LHS and RHS + // are the same slot. We're not allowed to memcpy in that case + // under penalty of undefined behavior. This check should catch + // the relevant situations. 
+ if (vi.value.V != rval_info.V) { + Value *copy_bytes = ConstantInt::get(T_int32, jl_datatype_size(vi.value.typ)); + emit_memcpy(ctx, vi.value.V, rval_info, copy_bytes, + jl_datatype_align(rval_info.typ), vi.isVolatile, tbaa); + } } else { emit_unionmove(ctx, vi.value.V, rval_info, isboxed, vi.isVolatile, tbaa); @@ -4297,7 +4304,8 @@ static Function *gen_cfun_wrapper(jl_function_t *ff, jl_value_t *jlrettype, jl_t } else if (T->isAggregateType()) { // aggregate types are passed by pointer - arg = data_pointer(ctx, inputarg, T->getPointerTo()); + arg = maybe_bitcast(ctx, decay_derived(data_pointer(ctx, inputarg)), + T->getPointerTo()); } else { arg = emit_unbox(ctx, T, inputarg, spect); @@ -6571,7 +6579,7 @@ static void init_julia_llvm_env(Module *m) "llvm.julia.gc_preserve_end"); add_named_global(gc_preserve_end_func, (void*)NULL, /*dllimport*/false); - pointer_from_objref_func = Function::Create(FunctionType::get(T_size, + pointer_from_objref_func = Function::Create(FunctionType::get(T_pjlvalue, ArrayRef(PointerType::get(T_jlvalue, AddressSpace::Derived)), false), Function::ExternalLinkage, "julia.pointer_from_objref"); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 425941888d77b..0dc7c5319738d 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -269,6 +269,37 @@ static Constant *julia_const_to_llvm(jl_value_t *e) static jl_cgval_t ghostValue(jl_value_t *ty); +static Value *emit_unboxed_coercion(jl_codectx_t &ctx, Type *to, Value *unboxed) +{ + Type *ty = unboxed->getType(); + assert(ty != T_void); + bool frompointer = ty->isPointerTy(); + bool topointer = to->isPointerTy(); + if (frompointer && topointer) { + unboxed = emit_bitcast(ctx, unboxed, to); + } + else if (frompointer) { + Type *INTT_to = INTT(to); + unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to); + if (INTT_to != to) + unboxed = ctx.builder.CreateBitCast(unboxed, to); + } + else if (topointer) { + Type *INTT_to = INTT(to); + if (to != INTT_to) + unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to); + unboxed = ctx.builder.CreateIntToPtr(unboxed, to); + } + else if (ty == T_int1 && to == T_int8) { + // bools may be stored internally as int8 + unboxed = ctx.builder.CreateZExt(unboxed, T_int8); + } + else if (ty != to) { + unboxed = ctx.builder.CreateBitCast(unboxed, to); + } + return unboxed; +} + // emit code to unpack a raw value from a box into registers or a stack slot static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_value_t *jt, Value *dest, bool volatile_store) { @@ -287,33 +318,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va Constant *c = x.constant ? julia_const_to_llvm(x.constant) : NULL; if (!x.ispointer() || c) { // already unboxed, but sometimes need conversion - Value *unboxed = c ? 
c : x.V; - Type *ty = unboxed->getType(); - assert(ty != T_void); - bool frompointer = ty->isPointerTy(); - bool topointer = to->isPointerTy(); - if (frompointer && topointer) { - unboxed = emit_bitcast(ctx, unboxed, to); - } - else if (frompointer) { - Type *INTT_to = INTT(to); - unboxed = ctx.builder.CreatePtrToInt(unboxed, INTT_to); - if (INTT_to != to) - unboxed = ctx.builder.CreateBitCast(unboxed, to); - } - else if (topointer) { - Type *INTT_to = INTT(to); - if (to != INTT_to) - unboxed = ctx.builder.CreateBitCast(unboxed, INTT_to); - unboxed = ctx.builder.CreateIntToPtr(unboxed, to); - } - else if (ty == T_int1 && to == T_int8) { - // bools may be stored internally as int8 - unboxed = ctx.builder.CreateZExt(unboxed, T_int8); - } - else if (ty != to) { - unboxed = ctx.builder.CreateBitCast(unboxed, to); - } + Value *unboxed = emit_unboxed_coercion(ctx, to, c ? c : x.V); if (!dest) return unboxed; Type *dest_ty = unboxed->getType()->getPointerTo(); @@ -326,14 +331,12 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va // bools stored as int8, so an extra Trunc is needed to get an int1 Value *p = x.constant ? literal_pointer_val(ctx, x.constant) : x.V; Type *ptype = (to == T_int1 ? T_pint8 : to->getPointerTo()); - if (p->getType() != ptype) - p = emit_bitcast(ctx, p, ptype); Value *unboxed = NULL; if (to == T_int1) - unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1); + unboxed = ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1); else if (jt == (jl_value_t*)jl_bool_type) - unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(p)), T_int1), to); + unboxed = ctx.builder.CreateZExt(ctx.builder.CreateTrunc(tbaa_decorate(x.tbaa, ctx.builder.CreateLoad(maybe_bitcast(ctx, p, ptype))), T_int1), to); if (unboxed) { if (!dest) return unboxed; @@ -354,6 +357,27 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va return NULL; } else { + if (p->getType() != ptype && isa(p)) { + // LLVM's mem2reg can't handle coercion if the load/store type does + // not match the type of the alloca. As such, it is better to + // perform the load using the alloca's type and then perform the + // appropriate coercion manually. + AllocaInst *AI = cast(p); + Type *AllocType = AI->getAllocatedType(); +#if JL_LLVM_VERSION >= 40000 + const DataLayout &DL = jl_data_layout; +#else + const DataLayout &DL = jl_ExecutionEngine->getDataLayout(); +#endif + if (!AI->isArrayAllocation() && + (AllocType->isFloatingPointTy() || AllocType->isIntegerTy() || AllocType->isPointerTy()) && + (to->isFloatingPointTy() || to->isIntegerTy() || to->isPointerTy()) && + DL.getTypeSizeInBits(AllocType) == DL.getTypeSizeInBits(to)) { + Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment); + return emit_unboxed_coercion(ctx, to, tbaa_decorate(x.tbaa, load)); + } + } + p = maybe_bitcast(ctx, p, ptype); Instruction *load = ctx.builder.CreateAlignedLoad(p, alignment); return tbaa_decorate(x.tbaa, load); } @@ -439,7 +463,8 @@ static jl_cgval_t generic_bitcast(jl_codectx_t &ctx, const jl_cgval_t *argv) if (isboxed) vxt = llvmt; vx = tbaa_decorate(v.tbaa, ctx.builder.CreateLoad( - data_pointer(ctx, v, vxt == T_int1 ? T_pint8 : vxt->getPointerTo()))); + emit_bitcast(ctx, data_pointer(ctx, v), + vxt == T_int1 ? 
@@ -439,7 +463,8 @@ static jl_cgval_t generic_bitcast(jl_codectx_t &ctx, const jl_cgval_t *argv)
         if (isboxed)
             vxt = llvmt;
         vx = tbaa_decorate(v.tbaa, ctx.builder.CreateLoad(
-            data_pointer(ctx, v, vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
+            emit_bitcast(ctx, data_pointer(ctx, v),
+                vxt == T_int1 ? T_pint8 : vxt->getPointerTo())));
     }

     vxt = vx->getType();
@@ -899,6 +924,26 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg
     case srem_int: return ctx.builder.CreateSRem(x, y);
     case urem_int: return ctx.builder.CreateURem(x, y);
+
+    // LLVM will not fold ptrtoint+arithmetic+inttoptr to GEP. The reason for this
+    // has to do with alias analysis. When adding two integers, either one of them
+    // could be the pointer base. With getelementptr, it is clear which of the
+    // operands is the pointer base. We also have this information at the julia
+    // level. Thus, to not lose information, we need to have a separate intrinsic
+    // for pointer arithmetic which lowers to getelementptr.
+    case add_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), y), t);
+    }
+
+    case sub_ptr: {
+        return ctx.builder.CreatePtrToInt(
+            ctx.builder.CreateGEP(T_int8,
+                ctx.builder.CreateIntToPtr(x, T_pint8), ctx.builder.CreateNeg(y)), t);
+    }
+
     // Implements IEEE negate. See issue #7868
     case neg_float: return math_builder(ctx)().CreateFSub(ConstantFP::get(t, -0.0), x);
     case neg_float_fast: return math_builder(ctx, true)().CreateFNeg(x);
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 80491639ac6b8..0f04fe418c4e6 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -12,6 +12,8 @@
     ADD_I(udiv_int, 2) \
     ADD_I(srem_int, 2) \
     ADD_I(urem_int, 2) \
+    ADD_I(add_ptr, 2) \
+    ADD_I(sub_ptr, 2) \
     ADD_I(neg_float, 1) \
     ADD_I(add_float, 2) \
     ADD_I(sub_float, 2) \
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 73a270141417d..7218e53cb180c 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -210,6 +210,9 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level)
     PM->add(createSimpleLoopUnrollPass());     // Unroll small loops
     //PM->add(createLoopStrengthReducePass()); // (jwb added)

+    // Re-run SROA after loop-unrolling (useful for small loops that operate
+    // over the structure of an aggregate)
+    PM->add(createSROAPass());                 // Break up aggregate allocas
     PM->add(createInstructionCombiningPass()); // Clean up after the unroller
     PM->add(createGVNPass());                  // Remove redundancies
     PM->add(createMemCpyOptPass());            // Remove memcpy / form memset
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 84cfb543f9fb8..7cd30911f54fd 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -761,6 +761,9 @@ JL_DLLEXPORT jl_value_t *jl_udiv_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_srem_int(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_urem_int(jl_value_t *a, jl_value_t *b);
+JL_DLLEXPORT jl_value_t *jl_add_ptr(jl_value_t *a, jl_value_t *b);
+JL_DLLEXPORT jl_value_t *jl_sub_ptr(jl_value_t *a, jl_value_t *b);
+
 JL_DLLEXPORT jl_value_t *jl_neg_float(jl_value_t *a);
 JL_DLLEXPORT jl_value_t *jl_add_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_sub_float(jl_value_t *a, jl_value_t *b);
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index 4a67c39f841aa..216a37eb64bdd 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -592,7 +592,6 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst,
         }
         else if (auto call = dyn_cast<CallInst>(user)) {
             if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) {
-                new_i = new PtrToIntInst(new_i, T_size, "", call);
                 call->replaceAllUsesWith(new_i);
                 call->eraseFromParent();
                 return;
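The new `add_ptr`/`sub_ptr` intrinsics let pointer arithmetic be lowered as a `getelementptr`, so the pointer base stays visible to alias analysis, as the comment in the hunk above explains. At the Julia level this corresponds to ordinary `Ptr` arithmetic; whether a particular Base method already routes through the new intrinsics is an assumption here, but this is the operation they model:

    buf = zeros(UInt8, 16)
    p = pointer(buf)        # keep `buf` rooted while the pointer is in use
    q = p + 8               # offset a Ptr{UInt8} by 8 bytes
    q - p == 8              # true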
8acf06496db0f..ccb660d966a43 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -1207,9 +1207,8 @@ bool LateLowerGCFrame::CleanupIR(Function &F) {
         }
         else if (pointer_from_objref_func != nullptr && callee == pointer_from_objref_func) {
             auto *obj = CI->getOperand(0);
             auto *ASCI = new AddrSpaceCastInst(obj, T_pjlvalue, "", CI);
-            auto *ptr = new PtrToIntInst(ASCI, CI->getType(), "", CI);
-            ptr->takeName(CI);
-            CI->replaceAllUsesWith(ptr);
+            ASCI->takeName(CI);
+            CI->replaceAllUsesWith(ASCI);
         }
         else if (alloc_obj_func && callee == alloc_obj_func) {
             assert(CI->getNumArgOperands() == 3);
             auto sz = (size_t)cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index 4947d93d5c496..a1935158ca613 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -702,8 +702,10 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c)
 un_iintrinsic_fast(LLVMNeg, neg, neg_int, u)
 #define add(a,b) a + b
 bi_iintrinsic_fast(LLVMAdd, add, add_int, u)
+bi_iintrinsic_fast(LLVMAdd, add, add_ptr, u)
 #define sub(a,b) a - b
 bi_iintrinsic_fast(LLVMSub, sub, sub_int, u)
+bi_iintrinsic_fast(LLVMSub, sub, sub_ptr, u)
 #define mul(a,b) a * b
 bi_iintrinsic_fast(LLVMMul, mul, mul_int, u)
 #define div(a,b) a / b
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 3e65cd7b65d42..74d142a37d968 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -68,19 +68,15 @@ using Main.TestHelpers.OAs
     @test a[1,2,1,1,2] == 20
     @test a[1,1,2,2,1] == 30

-    @test_throws ArgumentError reinterpret(Int8, a)
-
     b = reshape(a, (32,))
     @test b[1]  == 10
     @test b[19] == 20
     @test b[13] == 30
     @test_throws DimensionMismatch reshape(b,(5,7))
     @test_throws DimensionMismatch reshape(b,(35,))
-    @test_throws DimensionMismatch reinterpret(Int, b, (35,))
-    @test_throws ArgumentError reinterpret(Any, b, (32,))
-    @test_throws DimensionMismatch reinterpret(Complex128, b, (32,))
+    @test_throws ArgumentError reinterpret(Any, b)
     c = ["hello", "world"]
-    @test_throws ArgumentError reinterpret(Float32, c, (2,))
+    @test_throws ArgumentError reinterpret(Float32, c)

     a = Vector(ones(5))
     @test_throws ArgumentError resize!(a, -2)
@@ -209,7 +205,7 @@ end
     @test b[5] == -4
     @test b[6] == -3
     @test b[7] == -2
-    b = reinterpret(Int, a, (3,4))
+    b = reinterpret(Int, a)
     b[1] = -1
     @test vec(b) == vec(a)
diff --git a/test/choosetests.jl b/test/choosetests.jl
index 5320e104c925e..dfc70a23d3123 100644
--- a/test/choosetests.jl
+++ b/test/choosetests.jl
@@ -48,7 +48,8 @@ function choosetests(choices = [])
        "enums", "cmdlineargs", "i18n", "workspace", "libdl", "int",
        "checked", "intset", "floatfuncs", "compile", "distributed", "inline",
        "boundscheck", "error", "ambiguous", "cartesian", "asmvariant", "osutils",
-       "channels", "iostream", "specificity", "codegen", "codevalidation"
+       "channels", "iostream", "specificity", "codegen", "codevalidation",
+       "reinterpretarray"
     ]
     profile_skipped = false
     if startswith(string(Sys.ARCH), "arm")
diff --git a/test/core.jl b/test/core.jl
index d5701879c43ae..bb2c2a9ddcff7 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -3953,9 +3953,6 @@ f = unsafe_wrap(Array, pointer(d), length(d))
 @test !check_nul(f)
 f = unsafe_wrap(Array, ccall(:malloc, Ptr{UInt8}, (Csize_t,), 10), 10, true)
 @test !check_nul(f)
-g = reinterpret(UInt8, UInt16[0x1, 0x2])
-@test !check_nul(g)
-@test check_nul(copy(g))
 end

 # Copy of `#undef`
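The test updates above show the migration path for the deprecated three-argument form: `reinterpret(T, A, dims)` becomes a `reshape` of the new reinterpreted view. A small worked example, added for illustration (the byte values assume a little-endian host):

    a = collect(UInt32(1):UInt32(8))            # 8 * 4 = 32 bytes of data
    b = reshape(reinterpret(UInt8, a), (4, 8))  # was: reinterpret(UInt8, a, (4, 8))
    size(b) == (4, 8)                           # true
    b[1, 1] == 0x01                             # true on a little-endian machine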
@@ -5007,23 +5004,6 @@ end
 g21719(f, goal; tol = 1e-6) = T21719(f, tol, goal)
 @test isa(g21719(identity, 1.0; tol=0.1), T21719)

-# reinterpret alignment requirement
-let arr8 = zeros(UInt8, 16),
-    arr64 = zeros(UInt64, 2),
-    arr64_8 = reinterpret(UInt8, arr64),
-    arr64_i
-
-    # Not allowed to reinterpret arrays allocated as UInt8 array to a Int32 array
-    res = @test_throws ArgumentError reinterpret(Int32, arr8)
-    @test res.value.msg == "reinterpret from alignment 1 bytes to alignment 4 bytes not allowed"
-    # OK to reinterpret arrays allocated as UInt64 array to a Int64 array even though
-    # it is passed as a UInt8 array
-    arr64_i = reinterpret(Int64, arr64_8)
-    @test arr8 == arr64_8
-    arr64_i[2] = 1234
-    @test arr64[2] == 1234
-end
-
 # Alignment of perm boxes
 for i in 1:10
     # Int64 box should be 16bytes aligned even on 32bits
diff --git a/test/inference.jl b/test/inference.jl
index 81910152cf37e..1642878136df1 100644
--- a/test/inference.jl
+++ b/test/inference.jl
@@ -830,7 +830,7 @@ f2_17003(::Any) = f2_17003(NArray_17003(gl_17003))

 # issue #20847
 function segfaultfunction_20847(A::Vector{NTuple{N, T}}) where {N, T}
-    B = reinterpret(T, A, (N, length(A)))
+    B = reshape(reinterpret(T, A), (N, length(A)))
     return nothing
 end
diff --git a/test/reinterpretarray.jl b/test/reinterpretarray.jl
new file mode 100644
index 0000000000000..b334f341e83d7
--- /dev/null
+++ b/test/reinterpretarray.jl
@@ -0,0 +1,31 @@
+using Test
+
+A = Int64[1, 2, 3, 4]
+B = Complex{Int64}[5+6im, 7+8im, 9+10im]
+# getindex
+@test reinterpret(Complex{Int64}, A) == [1 + 2im, 3 + 4im]
+@test reinterpret(Float64, A) == reinterpret.(Float64, A)
+
+@test reinterpret(NTuple{3, Int64}, B) == [(5,6,7),(8,9,10)]
+
+# setindex
+let Ac = copy(A), Bc = copy(B)
+    reinterpret(Complex{Int64}, Ac)[2] = -1 - 2im
+    @test Ac == [1, 2, -1, -2]
+    reinterpret(NTuple{3, Int64}, Bc)[2] = (4,5,6)
+    @test Bc == Complex{Int64}[5+6im, 7+4im, 5+6im]
+    reinterpret(NTuple{3, Int64}, Bc)[1] = (1,2,3)
+    @test Bc == Complex{Int64}[1+2im, 3+4im, 5+6im]
+
+    A1 = reinterpret(Float64, A)
+    A2 = reinterpret(Complex{Float64}, A)
+    A1[1] = 1.0
+    @test real(A2[1]) == 1.0
+end
+
+# same-size reinterpret where one of the types is non-primitive
+let a = NTuple{4,UInt8}[(0x01,0x02,0x03,0x04)]
+    @test reinterpret(Float32, a)[1] == reinterpret(Float32, 0x04030201)
+    reinterpret(Float32, a)[1] = 2.0
+    @test reinterpret(Float32, a)[1] == 2.0
+end
diff --git a/test/sparse/sparse.jl b/test/sparse/sparse.jl
index 4570876f3d500..ba1800964334e 100644
--- a/test/sparse/sparse.jl
+++ b/test/sparse/sparse.jl
@@ -489,12 +489,6 @@ end
     @test Array(spdiagm(ones(2), -1, 3, 3)) == diagm(ones(2), -1)
 end

-@testset "issue #4986, reinterpret" begin
-    sfe22 = speye(Float64, 2)
-    mfe22 = eye(Float64, 2)
-    @test reinterpret(Int64, sfe22) == reinterpret(Int64, mfe22)
-end
-
 @testset "issue #5190" begin
     @test_throws ArgumentError sparsevec([3,5,7],[0.1,0.0,3.2],4)
 end
@@ -964,10 +958,6 @@ end
     ACPY = copy(A)
     B = reshape(A,25,1)
     @test A == ACPY
-    C = reinterpret(Int64, A, (25, 1))
-    @test A == ACPY
-    D = reinterpret(Int64, copy(B))
-    @test C == D
 end
@@ -1316,11 +1306,8 @@ end
     @test spdiagm(([1,2],[3.5],[4+5im]), (0,1,-1), 2,2) == [1 3.5; 4+5im 2]
 end

-@testset "error conditions for reinterpret, reshape, and squeeze" begin
+@testset "error conditions for reshape and squeeze" begin
     local A = sprand(Bool, 5, 5, 0.2)
-    @test_throws ArgumentError reinterpret(Complex128, A)
-    @test_throws ArgumentError reinterpret(Complex128, A,(5, 5))
-    @test_throws DimensionMismatch reinterpret(Int8, A,(20,))
     @test_throws DimensionMismatch reshape(A,(20, 2))
     @test_throws ArgumentError squeeze(A,(1, 1))
 end
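The new test/reinterpretarray.jl file above exercises the central behavioral point: `reinterpret` now returns a lazy, writable view, so writes through the reinterpreted array land in the parent's storage. Condensed from those tests:

    a = Int64[1, 2, 3, 4]
    v = reinterpret(Complex{Int64}, a)   # 2-element view, no copy
    v[1] = 10 + 20im
    a == Int64[10, 20, 3, 4]             # true: the write went through to `a`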
diff --git a/test/sparse/sparsevector.jl b/test/sparse/sparsevector.jl
index 56493ac391d95..6935071f86355 100644
--- a/test/sparse/sparsevector.jl
+++ b/test/sparse/sparsevector.jl
@@ -281,11 +281,6 @@ let a = SparseVector(8, [2, 5, 6], Int32[12, 35, 72])
     # vec
     @test vec(a) == a

-    # reinterpret
-    au = reinterpret(UInt32, a)
-    @test isa(au, SparseVector{UInt32,Int})
-    @test exact_equal(au, SparseVector(8, [2, 5, 6], UInt32[12, 35, 72]))
-
     # float
     af = float(a)
     @test float(af) == af
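With the removal above, `reinterpret` on a `SparseVector` no longer works at all. One possible replacement, sketched here as an assumption rather than something the patch prescribes, is to rebuild the vector around a reinterpreted copy of its nonzero values:

    a = SparseVector(8, [2, 5, 6], Int32[12, 35, 72])
    # the fields n/nzind/nzval are accessed directly for illustration only
    au = SparseVector(a.n, copy(a.nzind), collect(reinterpret(UInt32, a.nzval)))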