Skip to content

Commit

Permalink
changed broadcast! into bitarray algorithm (#32048)
Browse files Browse the repository at this point in the history
Cf
https://discourse.julialang.org/t/broadcast-vs-slow-performance-allocations/24259/6
for some more discussion and
#32047 for the question of
validity in view of exceptions.

Before:
```
julia> using BenchmarkTools, Random
julia> y=1; xsmall=[1]; Random.seed!(42); xlarge=rand(1:4, 100_003);
julia> @btime broadcast(==, $xsmall, $y); @btime  broadcast(==, $xlarge, $y); @show hash(broadcast(==, xlarge, y).chunks);
  860.500 ns (3 allocations: 4.31 KiB)
  152.971 μs (3 allocations: 16.59 KiB)
hash((broadcast(==, xlarge, y)).chunks) = 0xaa3b5a692968e128
```
After:
```
julia> @btime broadcast(==, $xsmall, $y); @btime  broadcast(==, $xlarge, $y); @show hash(broadcast(==, xlarge, y).chunks);
  65.466 ns (2 allocations: 128 bytes)
  42.927 μs (2 allocations: 12.41 KiB)
hash((broadcast(==, xlarge, y)).chunks) = 0xaa3b5a692968e128
```

Co-authored-by: Jameson Nash <vtjnash@gmail.com>
  • Loading branch information
chethega and vtjnash authored Nov 4, 2023
1 parent d75a00f commit f3ae44c
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions base/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,32 @@ preprocess(dest, x) = extrude(broadcast_unalias(dest, x))
return dest
end

# Performance optimization for BitVector destinations: the 64 result bits of
# each chunk are accumulated in a local `UInt64` (i.e. a 64-bit register) and
# stored into `dest.chunks` with one write per chunk (to bypass LSQ).
#
# Throws via `throwdm` when the axes of `dest` and `bc` differ, defers to
# `chunkedcopyto!` when `ischunkedbroadcast(dest, bc)` holds, and otherwise
# evaluates the broadcast element by element. Every element is asserted to be
# a `Bool`. Returns `dest`.
@inline function copyto!(dest::BitVector, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc))
    if ischunkedbroadcast(dest, bc)
        return chunkedcopyto!(dest, bc)
    end
    chunks = dest.chunks
    src = preprocess(dest, bc)
    nbits = length(src)
    nbits <= 0 && return dest
    nchunks = Base.num_bit_chunks(Int(nbits))
    # All chunks except the last one are completely filled: 64 bits each.
    @inbounds for c = 1:(nchunks - 1)
        offset = (c - 1) * 64
        word = UInt64(0)
        for bit = 0:63
            word |= UInt64(src[offset + bit + 1]::Bool) << (bit & 63)
        end
        chunks[c] = word
    end
    # The last chunk may be only partially used; `(nbits - 1) & 63` is the
    # index of its highest occupied bit. Unused high bits are left as zero.
    @inbounds let offset = (nchunks - 1) * 64
        word = UInt64(0)
        for bit = 0:((nbits - 1) & 63)
            word |= UInt64(src[offset + bit + 1]::Bool) << (bit & 63)
        end
        chunks[nchunks] = word
    end
    return dest
end

# Performance optimization: for BitArray outputs, we cache the result
# in a "small" Vector{Bool}, and then copy in chunks into the output
@inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
Expand Down

0 comments on commit f3ae44c

Please sign in to comment.