From 28327781d4f07cd1c8b23311273c6671ca5768bc Mon Sep 17 00:00:00 2001
From: Jille Timmermans
Date: Mon, 22 Jul 2024 19:51:50 +0200
Subject: [PATCH] Add Memset()

10x faster than a naive loop

```
$ go test -bench=Memset
goos: linux
goarch: amd64
pkg: github.com/bwesterb/go-and
cpu: 13th Gen Intel(R) Core(TM) i9-13900
BenchmarkMemset-32          73898    15965 ns/op   62637.75 MB/s
BenchmarkMemsetGeneric-32    6602   168255 ns/op    5943.36 MB/s
```
---
 and_amd64.go        | 12 ++++++++++++
 and_amd64.s         | 14 ++++++++++++++
 and_arm64.go        |  5 +++++
 and_stubs_amd64.go  |  5 +++++
 fallback.go         |  4 ++++
 internal/asm/src.go | 31 +++++++++++++++++++++++++++++++
 lib.go              | 11 +++++++++++
 memset_test.go      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 130 insertions(+)
 create mode 100644 memset_test.go

diff --git a/and_amd64.go b/and_amd64.go
index 9120b00..2a6a358 100644
--- a/and_amd64.go
+++ b/and_amd64.go
@@ -49,3 +49,15 @@ func popcnt(a []byte) int {
 	ret += popcntGeneric(a[l:])
 	return ret
 }
+
+func memset(dst []byte, b byte) {
+	l := uint64(0)
+	if hasAVX2() {
+		l = uint64(len(dst)) >> 5
+		if l != 0 {
+			memsetAVX2(&dst[0], l, b)
+		}
+		l <<= 5
+	}
+	memsetGeneric(dst[l:], b)
+}
diff --git a/and_amd64.s b/and_amd64.s
index 0403fd3..71f6bc2 100644
--- a/and_amd64.s
+++ b/and_amd64.s
@@ -183,3 +183,17 @@ loop:
 	JNZ  loop
 	MOVQ DX, ret+16(FP)
 	RET
+
+// func memsetAVX2(dst *byte, l uint64, b byte)
+// Requires: AVX, AVX2
+TEXT ·memsetAVX2(SB), NOSPLIT, $0-17
+	MOVQ         dst+0(FP), AX
+	MOVQ         l+8(FP), CX
+	VPBROADCASTB b+16(FP), Y0
+
+loop:
+	VMOVDQU Y0, (AX)
+	ADDQ    $0x00000020, AX
+	SUBQ    $0x00000001, CX
+	JNZ     loop
+	RET
diff --git a/and_arm64.go b/and_arm64.go
index 31d2d46..129b628 100644
--- a/and_arm64.go
+++ b/and_arm64.go
@@ -33,3 +33,8 @@ func popcnt(a []byte) int {
 	// TODO: Write a NEON version for this
 	return popcntGeneric(a)
 }
+
+func memset(dst []byte, b byte) {
+	// TODO: Write a NEON version for this
+	memsetGeneric(dst, b)
+}
diff --git a/and_stubs_amd64.go b/and_stubs_amd64.go
index 393d108..8bbbf96 100644
--- a/and_stubs_amd64.go
+++ b/and_stubs_amd64.go
@@ -21,3 +21,8 @@ func andNotAVX2(dst *byte, a *byte, b *byte, l uint64)
 //
 //go:noescape
 func popcntAsm(a *byte, l uint64) int
+
+// Sets each byte in dst to b
+//
+//go:noescape
+func memsetAVX2(dst *byte, l uint64, b byte)
diff --git a/fallback.go b/fallback.go
index e4eec7b..95ee3c0 100644
--- a/fallback.go
+++ b/fallback.go
@@ -17,3 +17,7 @@ func andNot(dst, a, b []byte) {
 func popcnt(a []byte) int {
 	return popcntGeneric(a)
 }
+
+func memset(dst []byte, b byte) {
+	memsetGeneric(dst, b)
+}
diff --git a/internal/asm/src.go b/internal/asm/src.go
index 599f002..dde3b81 100644
--- a/internal/asm/src.go
+++ b/internal/asm/src.go
@@ -11,6 +11,7 @@ func main() {
 	gen("or", VPOR, "Sets dst to the bitwise or of a and b")
 	gen("andNot", VPANDN, "Sets dst to the bitwise and of not(a) and b")
 	genPopcnt()
+	genMemset()
 
 	Generate()
 }
@@ -86,3 +87,33 @@ func genPopcnt() {
 	Store(ret, ReturnIndex(0))
 	RET()
 }
+
+func genMemset() {
+	const rounds = 1
+	TEXT("memsetAVX2", NOSPLIT, "func(dst *byte, l uint64, b byte)")
+
+	Pragma("noescape")
+
+	Doc("Sets each byte in dst to b")
+	dst := Load(Param("dst"), GP64())
+	l := Load(Param("l"), GP64())
+
+	bRepeated := YMM()
+	b, err := Param("b").Resolve()
+	if err != nil {
+		panic(err)
+	}
+	VPBROADCASTB(b.Addr, bRepeated)
+
+	Label("loop")
+
+	for i := 0; i < rounds; i++ {
+		VMOVDQU(bRepeated, Mem{Base: dst, Disp: 32 * i})
+	}
+
+	ADDQ(U32(32*rounds), dst)
+	SUBQ(U32(1), l)
+	JNZ(LabelRef("loop"))
+
+	RET()
+}
diff --git a/lib.go b/lib.go
index 806fe5e..ea30bfe 100644
--- a/lib.go
+++ b/lib.go
@@ -104,3 +104,14 @@ func popcntGeneric(a []byte) int {
 	}
 	return ret
 }
+
+// Memset sets dst[*] to b.
+func Memset(dst []byte, b byte) {
+	memset(dst, b)
+}
+
+func memsetGeneric(dst []byte, b byte) {
+	for i := range dst {
+		dst[i] = b
+	}
+}
diff --git a/memset_test.go b/memset_test.go
new file mode 100644
index 0000000..8899e60
--- /dev/null
+++ b/memset_test.go
@@ -0,0 +1,48 @@
+package and
+
+import (
+	"math/rand/v2"
+	"testing"
+)
+
+func testMemset(t *testing.T, size int) {
+	a := make([]byte, size)
+	Memset(a, 0xff)
+	for i, v := range a {
+		if v != 0xff {
+			t.Errorf("Memset failed to set a[%d] to 0xff", i)
+		}
+	}
+}
+
+func TestMemsetAgainstGeneric(t *testing.T) {
+	for i := 0; i < 20; i++ {
+		size := 1 << i
+		testMemset(t, size)
+		for j := 0; j < 10; j++ {
+			testMemset(t, size+rand.IntN(100))
+		}
+	}
+}
+
+func BenchmarkMemset(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Memset(a, 0xff)
+	}
+}
+
+func BenchmarkMemsetGeneric(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		memsetGeneric(a, 0xff)
+	}
+}
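
For context, a minimal sketch of how the new exported API could be called from code importing this module; the import path is taken from the benchmark output above, and the buffer size and fill byte are illustrative:

```go
package main

import (
	"fmt"

	and "github.com/bwesterb/go-and"
)

func main() {
	buf := make([]byte, 1<<20)

	// Fill the whole slice with 0xff. On amd64 with AVX2 the bulk is written
	// 32 bytes per iteration; any remaining tail (and other platforms) falls
	// back to the generic byte loop.
	and.Memset(buf, 0xff)

	fmt.Println(buf[0], buf[len(buf)-1]) // 255 255
}
```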