Skip to content

Commit

Permalink
arm64: xor
Browse files Browse the repository at this point in the history
  • Loading branch information
bwesterb committed Jul 24, 2024
1 parent 5c7939b commit 5e32cdf
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ go-and

[![Go Reference](https://pkg.go.dev/badge/github.com/bwesterb/go-and.svg)](https://pkg.go.dev/github.com/bwesterb/go-and)

Fast bitwise and, or, andn, popcount and memset for `[]byte` slices.
Fast bitwise and, or, xor, andn, popcount and memset for `[]byte` slices.

```go
import "github.com/bwesterb/go-and"
Expand Down
48 changes: 48 additions & 0 deletions and_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,54 @@ loop:
JNZ loop
RET

// func xorAVX2(dst *byte, a *byte, b *byte, l uint64)
// Requires: AVX, AVX2
TEXT ·xorAVX2(SB), NOSPLIT, $0-32
MOVQ a+8(FP), AX
MOVQ b+16(FP), CX
MOVQ dst+0(FP), DX
MOVQ l+24(FP), BX

loop:
VMOVDQU (AX), Y0
VMOVDQU (CX), Y8
VMOVDQU 32(AX), Y1
VMOVDQU 32(CX), Y9
VMOVDQU 64(AX), Y2
VMOVDQU 64(CX), Y10
VMOVDQU 96(AX), Y3
VMOVDQU 96(CX), Y11
VMOVDQU 128(AX), Y4
VMOVDQU 128(CX), Y12
VMOVDQU 160(AX), Y5
VMOVDQU 160(CX), Y13
VMOVDQU 192(AX), Y6
VMOVDQU 192(CX), Y14
VMOVDQU 224(AX), Y7
VMOVDQU 224(CX), Y15
VPXOR Y8, Y0, Y8
VPXOR Y9, Y1, Y9
VPXOR Y10, Y2, Y10
VPXOR Y11, Y3, Y11
VPXOR Y12, Y4, Y12
VPXOR Y13, Y5, Y13
VPXOR Y14, Y6, Y14
VPXOR Y15, Y7, Y15
VMOVDQU Y8, (DX)
VMOVDQU Y9, 32(DX)
VMOVDQU Y10, 64(DX)
VMOVDQU Y11, 96(DX)
VMOVDQU Y12, 128(DX)
VMOVDQU Y13, 160(DX)
VMOVDQU Y14, 192(DX)
VMOVDQU Y15, 224(DX)
ADDQ $0x00000100, AX
ADDQ $0x00000100, CX
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
RET

// func andNotAVX2(dst *byte, a *byte, b *byte, l uint64)
// Requires: AVX, AVX2
TEXT ·andNotAVX2(SB), NOSPLIT, $0-32
Expand Down
12 changes: 12 additions & 0 deletions and_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ func andNEON(dst, a, b *byte, l uint64)
//go:noescape
func orNEON(dst, a, b *byte, l uint64)

//go:noescape
func xorNEON(dst, a, b *byte, l uint64)

//go:noescape
func popcntNEON(a *byte, l uint64) uint64

Expand All @@ -29,6 +32,15 @@ func or(dst, a, b []byte) {
orGeneric(dst[l:], a[l:], b[l:])
}

func xor(dst, a, b []byte) {
l := uint64(len(a)) >> 8
if l != 0 {
xorNEON(&dst[0], &a[0], &b[0], l)
}
l <<= 8
xorGeneric(dst[l:], a[l:], b[l:])
}

func andNot(dst, a, b []byte) {
// TODO: Write a NEON version for this
andNotGeneric(dst, a, b)
Expand Down
47 changes: 47 additions & 0 deletions and_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,53 @@ loop:

RET

// func xorNEON(dst *byte, a *byte, b *byte, l uint64)
TEXT ·xorNEON(SB), NOSPLIT, $0-32
MOVD dst+0(FP), R0
MOVD a+8(FP), R1
MOVD b+16(FP), R2
MOVD l+24(FP), R3

loop:
VLD1.P 64(R1), [ V0.B16, V1.B16, V2.B16, V3.B16]
VLD1.P 64(R2), [ V4.B16, V5.B16, V6.B16, V7.B16]
VLD1.P 64(R1), [ V8.B16, V9.B16, V10.B16, V11.B16]
VLD1.P 64(R2), [V12.B16, V13.B16, V14.B16, V15.B16]
VLD1.P 64(R1), [V16.B16, V17.B16, V18.B16, V19.B16]
VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
VLD1.P 64(R1), [V24.B16, V25.B16, V26.B16, V27.B16]
VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]

VEOR V0.B16, V4.B16, V0.B16
VEOR V1.B16, V5.B16, V1.B16
VEOR V2.B16, V6.B16, V2.B16
VEOR V3.B16, V7.B16, V3.B16

VEOR V8.B16, V12.B16, V8.B16
VEOR V9.B16, V13.B16, V9.B16
VEOR V10.B16, V14.B16, V10.B16
VEOR V11.B16, V15.B16, V11.B16

VEOR V16.B16, V20.B16, V16.B16
VEOR V17.B16, V21.B16, V17.B16
VEOR V18.B16, V22.B16, V18.B16
VEOR V19.B16, V23.B16, V19.B16

VEOR V24.B16, V28.B16, V24.B16
VEOR V25.B16, V29.B16, V25.B16
VEOR V26.B16, V30.B16, V26.B16
VEOR V27.B16, V31.B16, V27.B16

VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R0)

SUBS $1, R3, R3
CBNZ R3, loop

RET

// func orNEON(dst *byte, a *byte, b *byte, l uint64)
TEXT ·orNEON(SB), NOSPLIT, $0-32
MOVD dst+0(FP), R0
Expand Down
5 changes: 5 additions & 0 deletions and_stubs_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 54 additions & 0 deletions and_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import (
"testing"
)

func xorNaive(dst, a, b []byte) {
for i := range dst {
dst[i] = a[i] ^ b[i]
}
}

func andNaive(dst, a, b []byte) {
for i := range dst {
dst[i] = a[i] & b[i]
Expand Down Expand Up @@ -55,6 +61,18 @@ func TestAnd(t *testing.T) {
}
}

func TestXor(t *testing.T) {
for i := 0; i < 20; i++ {
size := 1 << i
testAgainst(t, Xor, xorNaive, size)
testAgainst(t, xorGeneric, xorNaive, size)
for j := 0; j < 10; j++ {
testAgainst(t, Xor, xorNaive, size+rand.IntN(100))
testAgainst(t, xorGeneric, xorNaive, size+rand.IntN(100))
}
}
}

func TestOr(t *testing.T) {
for i := 0; i < 20; i++ {
size := 1 << i
Expand Down Expand Up @@ -151,6 +169,42 @@ func BenchmarkOrNaive(b *testing.B) {
}
}

func BenchmarkXor(b *testing.B) {
b.StopTimer()
size := 1000000
a := make([]byte, size)
bb := make([]byte, size)
b.SetBytes(int64(size))
b.StartTimer()
for i := 0; i < b.N; i++ {
Xor(a, a, bb)
}
}

func BenchmarkXorGeneric(b *testing.B) {
b.StopTimer()
size := 1000000
a := make([]byte, size)
bb := make([]byte, size)
b.SetBytes(int64(size))
b.StartTimer()
for i := 0; i < b.N; i++ {
xorGeneric(a, a, bb)
}
}

func BenchmarkXorNaive(b *testing.B) {
b.StopTimer()
size := 1000000
a := make([]byte, size)
bb := make([]byte, size)
b.SetBytes(int64(size))
b.StartTimer()
for i := 0; i < b.N; i++ {
xorNaive(a, a, bb)
}
}

func BenchmarkAndNot(b *testing.B) {
b.StopTimer()
size := 1000000
Expand Down
1 change: 1 addition & 0 deletions internal/asm/src.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ func main() {

gen("and", VPAND, "Sets dst to the bitwise and of a and b")
gen("or", VPOR, "Sets dst to the bitwise or of a and b")
gen("xor", VPXOR, "Sets dst to the bitwise xor of a and b")
gen("andNot", VPANDN, "Sets dst to the bitwise and of not(a) and b")
genPopcnt()
genMemset()
Expand Down
26 changes: 26 additions & 0 deletions lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,32 @@ func orGeneric(dst, a, b []byte) {
}
}

// Writes bitwise xor of a and b to dst.
//
// Panics if len(a) ≠ len(b), or len(dst) ≠ len(a).
func Xor(dst, a, b []byte) {
if len(a) != len(b) || len(b) != len(dst) {
panic("lengths of a, b and dst must be equal")
}

xor(dst, a, b)

Check failure on line 69 in lib.go

View workflow job for this annotation

GitHub Actions / build

undefined: xor
}

func xorGeneric(dst, a, b []byte) {
i := 0

for ; i <= len(a)-8; i += 8 {
binary.LittleEndian.PutUint64(
dst[i:],
binary.LittleEndian.Uint64(a[i:])^binary.LittleEndian.Uint64(b[i:]),
)
}

for ; i < len(a); i++ {
dst[i] = a[i] ^ b[i]
}
}

// Writes bitwise and of not(a) and b to dst.
//
// Panics if len(a) ≠ len(b), or len(dst) ≠ len(a).
Expand Down

0 comments on commit 5e32cdf

Please sign in to comment.