Skip to content

Commit

Permalink
cmd/compile: use MOVBE instruction for GOAMD64>=v3
Browse files Browse the repository at this point in the history
encoding/binary benchmark on my laptop:
name                      old time/op    new time/op     delta
ReadSlice1000Int32s-8       4.42µs ± 5%     4.20µs ± 1%   -4.94%  (p=0.046 n=9+8)
ReadStruct-8                 359ns ± 8%      368ns ± 5%   +2.35%  (p=0.041 n=9+10)
WriteStruct-8                349ns ± 1%      357ns ± 1%   +2.15%  (p=0.000 n=8+10)
ReadInts-8                   235ns ± 1%      233ns ± 1%   -1.01%  (p=0.005 n=10+10)
WriteInts-8                  265ns ± 1%      274ns ± 1%   +3.45%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8      4.61µs ± 5%     4.59µs ± 5%     ~     (p=0.986 n=10+10)
PutUint16-8                 0.56ns ± 4%     0.57ns ± 4%     ~     (p=0.101 n=10+10)
PutUint32-8                 0.83ns ± 2%     0.56ns ± 6%  -32.91%  (p=0.000 n=10+10)
PutUint64-8                 0.81ns ± 3%     0.62ns ± 4%  -23.82%  (p=0.000 n=10+10)
LittleEndianPutUint16-8     0.55ns ± 4%     0.55ns ± 3%     ~     (p=0.926 n=10+10)
LittleEndianPutUint32-8     0.41ns ± 4%     0.42ns ± 3%     ~     (p=0.148 n=10+9)
LittleEndianPutUint64-8     0.55ns ± 2%     0.56ns ± 6%     ~     (p=0.897 n=10+10)
ReadFloats-8                60.4ns ± 4%     59.0ns ± 1%   -2.25%  (p=0.007 n=10+10)
WriteFloats-8               72.3ns ± 2%     71.5ns ± 7%     ~     (p=0.089 n=10+10)
ReadSlice1000Float32s-8     4.21µs ± 3%     4.18µs ± 2%     ~     (p=0.197 n=10+10)
WriteSlice1000Float32s-8    4.61µs ± 2%     4.68µs ± 7%     ~     (p=1.000 n=8+10)
ReadSlice1000Uint8s-8        250ns ± 4%      247ns ± 4%     ~     (p=0.324 n=10+10)
WriteSlice1000Uint8s-8       227ns ± 5%      229ns ± 2%     ~     (p=0.193 n=10+7)
PutUvarint32-8              15.3ns ± 2%     15.4ns ± 4%     ~     (p=0.782 n=10+10)
PutUvarint64-8              38.5ns ± 1%     38.6ns ± 5%     ~     (p=0.396 n=8+10)

name                      old speed      new speed       delta
ReadSlice1000Int32s-8      890MB/s ±17%    953MB/s ± 1%   +7.00%  (p=0.027 n=10+8)
ReadStruct-8               209MB/s ± 8%    204MB/s ± 5%   -2.42%  (p=0.043 n=9+10)
WriteStruct-8              214MB/s ± 3%    210MB/s ± 1%   -1.75%  (p=0.003 n=9+10)
ReadInts-8                 127MB/s ± 1%    129MB/s ± 1%   +1.01%  (p=0.006 n=10+10)
WriteInts-8                113MB/s ± 1%    109MB/s ± 1%   -3.34%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8     868MB/s ± 5%    872MB/s ± 5%     ~     (p=1.000 n=10+10)
PutUint16-8               3.55GB/s ± 4%   3.50GB/s ± 4%     ~     (p=0.093 n=10+10)
PutUint32-8               4.83GB/s ± 2%   7.21GB/s ± 6%  +49.16%  (p=0.000 n=10+10)
PutUint64-8               9.89GB/s ± 3%  12.99GB/s ± 4%  +31.26%  (p=0.000 n=10+10)
LittleEndianPutUint16-8   3.65GB/s ± 4%   3.65GB/s ± 4%     ~     (p=0.912 n=10+10)
LittleEndianPutUint32-8   9.74GB/s ± 3%   9.63GB/s ± 3%     ~     (p=0.222 n=9+9)
LittleEndianPutUint64-8   14.4GB/s ± 2%   14.3GB/s ± 5%     ~     (p=0.912 n=10+10)
ReadFloats-8               199MB/s ± 4%    203MB/s ± 1%   +2.27%  (p=0.007 n=10+10)
WriteFloats-8              166MB/s ± 2%    168MB/s ± 7%     ~     (p=0.089 n=10+10)
ReadSlice1000Float32s-8    949MB/s ± 3%    958MB/s ± 2%     ~     (p=0.218 n=10+10)
WriteSlice1000Float32s-8   867MB/s ± 2%    857MB/s ± 6%     ~     (p=1.000 n=8+10)
ReadSlice1000Uint8s-8     4.00GB/s ± 4%   4.06GB/s ± 4%     ~     (p=0.353 n=10+10)
WriteSlice1000Uint8s-8    4.40GB/s ± 4%   4.36GB/s ± 2%     ~     (p=0.193 n=10+7)
PutUvarint32-8             262MB/s ± 2%    260MB/s ± 4%     ~     (p=0.739 n=10+10)
PutUvarint64-8             208MB/s ± 1%    207MB/s ± 5%     ~     (p=0.408 n=8+10)

Updates #45453

Change-Id: Ifda0d48d54665cef45d46d3aad974062633142c4
Reviewed-on: https://go-review.googlesource.com/c/go/+/354670
Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Matthew Dempsky <mdempsky@google.com>
  • Loading branch information
wdvxdr1123 authored and randall77 committed Oct 18, 2021
1 parent c091767 commit 3e5cc4d
Show file tree
Hide file tree
Showing 7 changed files with 389 additions and 20 deletions.
7 changes: 5 additions & 2 deletions src/cmd/compile/internal/amd64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
Expand All @@ -788,7 +790,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
Expand Down
27 changes: 21 additions & 6 deletions src/cmd/compile/internal/amd64/versions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ func TestGoAMD64v1(t *testing.T) {
opcodes := map[string]bool{}
var features []string
for feature, opcodeList := range featureToOpcodes {
features = append(features, fmt.Sprintf("cpu.%s=off", feature))
if runtimeFeatures[feature] {
features = append(features, fmt.Sprintf("cpu.%s=off", feature))
}
for _, op := range opcodeList {
opcodes[op] = true
}
Expand Down Expand Up @@ -204,14 +206,28 @@ func clobber(t *testing.T, src string, dst *os.File, opcodes map[string]bool) {
f.Close()
}

func setOf(keys ...string) map[string]bool {
m := make(map[string]bool, len(keys))
for _, key := range keys {
m[key] = true
}
return m
}

var runtimeFeatures = setOf(
"adx", "aes", "avx", "avx2", "bmi1", "bmi2", "erms", "fma",
"pclmulqdq", "popcnt", "rdtscp", "sse3", "sse41", "sse42", "ssse3",
)

var featureToOpcodes = map[string][]string{
// Note: we include *q, *l, and plain opcodes here.
// go tool objdump doesn't include a [QL] on popcnt instructions, until CL 351889
// native objdump doesn't include [QL] on linux.
"popcnt": []string{"popcntq", "popcntl", "popcnt"},
"bmi1": []string{"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
"sse41": []string{"roundsd"},
"fma": []string{"vfmadd231sd"},
"popcnt": {"popcntq", "popcntl", "popcnt"},
"bmi1": {"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
"sse41": {"roundsd"},
"fma": {"vfmadd231sd"},
"movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
}

// Test to use POPCNT instruction, if available
Expand Down Expand Up @@ -364,5 +380,4 @@ func TestFMA(t *testing.T) {
t.Errorf("FMA(%f,%f,%f) = %f, want %f", tt.x, tt.y, tt.z, got, tt.want)
}
}

}
26 changes: 26 additions & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -2219,3 +2219,29 @@
(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x)

(BSWAP(Q|L) (BSWAP(Q|L) p)) => p

// CPUID feature: MOVBE.
(MOV(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem)
(BSWAP(Q|L) x:(MOV(Q|L)load [i] {s} p mem)) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)load [i] {s} p mem)
(BSWAP(Q|L) (MOVBE(Q|L)load [i] {s} p m)) => (MOV(Q|L)load [i] {s} p m)
(MOVBE(Q|L)store [i] {s} p (BSWAP(Q|L) x) m) => (MOV(Q|L)store [i] {s} p x m)

(ORQ x0:(MOVBELload [i0] {s} p mem)
sh:(SHLQconst [32] x1:(MOVBELload [i1] {s} p mem)))
&& i0 == i1+4
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0, x1, sh)
=> @mergePoint(b,x0,x1) (MOVBEQload [i1] {s} p mem)

(ORQ x0:(MOVBELload [i] {s} p0 mem)
sh:(SHLQconst [32] x1:(MOVBELload [i] {s} p1 mem)))
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& sequentialAddresses(p1, p0, 4)
&& mergePoint(b,x0,x1) != nil
&& clobber(x0, x1, sh)
=> @mergePoint(b,x0,x1) (MOVBEQload [i] {s} p0 mem)
6 changes: 6 additions & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,12 @@ func init() {
// and BSFQ(0) is undefined. Same for TZCNTL(0)==32
{name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
{name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},

// CPUID feature: MOVBE
{name: "MOVBELload", argLength: 2, reg: gpload, asm: "MOVBEL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
{name: "MOVBELstore", argLength: 3, reg: gpstore, asm: "MOVBEL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVBEQload", argLength: 2, reg: gpload, asm: "MOVBEQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 8 bytes from arg0+auxint+aux. arg1=mem
{name: "MOVBEQstore", argLength: 3, reg: gpstore, asm: "MOVBEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
}

var AMD64blocks = []blockData{
Expand Down
64 changes: 64 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3e5cc4d

Please sign in to comment.