Skip to content

Commit

Permalink
cmd/compile: add some generic composite type optimizations
Browse files Browse the repository at this point in the history
Propagate values through some wide Zero/Move operations. Among
other things this allows us to optimize some kinds of array
initialization. For example, the following code no longer
requires a temporary be allocated on the stack. Instead it
writes the values directly into the return value.

func f(i uint32) [4]uint32 {
    return [4]uint32{i, i+1, i+2, i+3}
}

The return value is unnecessarily cleared but removing that is
probably a task for dead store analysis (I think it needs to
be able to match multiple Store ops to wide Zero ops).

In order to reliably remove stack variables that are rendered
unnecessary by these new rules I've added a new generic version
of the unread autos elimination pass.

These rules are triggered more than 5000 times when building and
testing the standard library.

Updates #15925 (fixes for arrays of up to 4 elements).
Updates #24386 (fixes for up to 4 kept elements).
Updates #24416.

compilebench results:

name       old time/op       new time/op       delta
Template         353ms ± 5%        359ms ± 3%    ~     (p=0.143 n=10+10)
Unicode          219ms ± 1%        217ms ± 4%    ~     (p=0.740 n=7+10)
GoTypes          1.26s ± 1%        1.26s ± 2%    ~     (p=0.549 n=9+10)
Compiler         6.00s ± 1%        6.08s ± 1%  +1.42%  (p=0.000 n=9+8)
SSA              15.3s ± 2%        15.6s ± 1%  +2.43%  (p=0.000 n=10+10)
Flate            237ms ± 2%        240ms ± 2%  +1.31%  (p=0.015 n=10+10)
GoParser         285ms ± 1%        285ms ± 1%    ~     (p=0.878 n=8+8)
Reflect          797ms ± 3%        807ms ± 2%    ~     (p=0.065 n=9+10)
Tar              334ms ± 0%        335ms ± 4%    ~     (p=0.460 n=8+10)
XML              419ms ± 0%        423ms ± 1%  +0.91%  (p=0.001 n=7+9)
StdCmd           46.0s ± 0%        46.4s ± 0%  +0.85%  (p=0.000 n=9+9)

name       old user-time/op  new user-time/op  delta
Template         337ms ± 3%        346ms ± 5%    ~     (p=0.053 n=9+10)
Unicode          205ms ±10%        205ms ± 8%    ~     (p=1.000 n=10+10)
GoTypes          1.22s ± 2%        1.21s ± 3%    ~     (p=0.436 n=10+10)
Compiler         5.85s ± 1%        5.93s ± 0%  +1.46%  (p=0.000 n=10+8)
SSA              14.9s ± 1%        15.3s ± 1%  +2.62%  (p=0.000 n=10+10)
Flate            229ms ± 4%        228ms ± 6%    ~     (p=0.796 n=10+10)
GoParser         271ms ± 3%        275ms ± 4%    ~     (p=0.165 n=10+10)
Reflect          779ms ± 5%        775ms ± 2%    ~     (p=0.971 n=10+10)
Tar              317ms ± 4%        319ms ± 5%    ~     (p=0.853 n=10+10)
XML              404ms ± 4%        409ms ± 5%    ~     (p=0.436 n=10+10)

name       old alloc/op      new alloc/op      delta
Template        34.9MB ± 0%       35.0MB ± 0%  +0.26%  (p=0.000 n=10+10)
Unicode         29.3MB ± 0%       29.3MB ± 0%  +0.02%  (p=0.000 n=10+10)
GoTypes          115MB ± 0%        115MB ± 0%  +0.30%  (p=0.000 n=10+10)
Compiler         519MB ± 0%        521MB ± 0%  +0.30%  (p=0.000 n=10+10)
SSA             1.55GB ± 0%       1.57GB ± 0%  +1.34%  (p=0.000 n=10+9)
Flate           24.1MB ± 0%       24.2MB ± 0%  +0.10%  (p=0.000 n=10+10)
GoParser        28.1MB ± 0%       28.1MB ± 0%  +0.07%  (p=0.000 n=10+10)
Reflect         78.7MB ± 0%       78.7MB ± 0%  +0.03%  (p=0.000 n=8+10)
Tar             34.4MB ± 0%       34.5MB ± 0%  +0.12%  (p=0.000 n=10+10)
XML             43.2MB ± 0%       43.2MB ± 0%  +0.13%  (p=0.000 n=10+10)

name       old allocs/op     new allocs/op     delta
Template          330k ± 0%         330k ± 0%  -0.01%  (p=0.017 n=10+10)
Unicode           337k ± 0%         337k ± 0%  +0.01%  (p=0.000 n=9+10)
GoTypes          1.15M ± 0%        1.15M ± 0%  +0.03%  (p=0.000 n=10+10)
Compiler         4.77M ± 0%        4.77M ± 0%  +0.03%  (p=0.000 n=9+10)
SSA              12.5M ± 0%        12.6M ± 0%  +1.16%  (p=0.000 n=10+10)
Flate             221k ± 0%         221k ± 0%  +0.05%  (p=0.000 n=9+10)
GoParser          275k ± 0%         275k ± 0%  +0.01%  (p=0.014 n=10+9)
Reflect           944k ± 0%         944k ± 0%  -0.02%  (p=0.000 n=10+10)
Tar               324k ± 0%         323k ± 0%  -0.12%  (p=0.000 n=10+10)
XML               384k ± 0%         384k ± 0%  -0.01%  (p=0.001 n=10+10)

name       old object-bytes  new object-bytes  delta
Template         476kB ± 0%        476kB ± 0%  -0.04%  (p=0.000 n=10+10)
Unicode          218kB ± 0%        218kB ± 0%    ~     (all equal)
GoTypes         1.58MB ± 0%       1.58MB ± 0%  -0.04%  (p=0.000 n=10+10)
Compiler        6.25MB ± 0%       6.24MB ± 0%  -0.09%  (p=0.000 n=10+10)
SSA             15.9MB ± 0%       16.1MB ± 0%  +1.22%  (p=0.000 n=10+10)
Flate            304kB ± 0%        304kB ± 0%  -0.13%  (p=0.000 n=10+10)
GoParser         370kB ± 0%        370kB ± 0%  -0.00%  (p=0.000 n=10+10)
Reflect         1.27MB ± 0%       1.27MB ± 0%  -0.12%  (p=0.000 n=10+10)
Tar              421kB ± 0%        419kB ± 0%  -0.64%  (p=0.000 n=10+10)
XML              518kB ± 0%        517kB ± 0%  -0.12%  (p=0.000 n=10+10)

name       old export-bytes  new export-bytes  delta
Template        16.7kB ± 0%       16.7kB ± 0%    ~     (all equal)
Unicode         6.52kB ± 0%       6.52kB ± 0%    ~     (all equal)
GoTypes         29.2kB ± 0%       29.2kB ± 0%    ~     (all equal)
Compiler        88.0kB ± 0%       88.0kB ± 0%    ~     (all equal)
SSA              109kB ± 0%        109kB ± 0%    ~     (all equal)
Flate           4.49kB ± 0%       4.49kB ± 0%    ~     (all equal)
GoParser        8.10kB ± 0%       8.10kB ± 0%    ~     (all equal)
Reflect         7.71kB ± 0%       7.71kB ± 0%    ~     (all equal)
Tar             9.15kB ± 0%       9.15kB ± 0%    ~     (all equal)
XML             12.3kB ± 0%       12.3kB ± 0%    ~     (all equal)

name       old text-bytes    new text-bytes    delta
HelloSize        676kB ± 0%        672kB ± 0%  -0.59%  (p=0.000 n=10+10)
CmdGoSize       7.26MB ± 0%       7.24MB ± 0%  -0.18%  (p=0.000 n=10+10)

name       old data-bytes    new data-bytes    delta
HelloSize       10.2kB ± 0%       10.2kB ± 0%    ~     (all equal)
CmdGoSize        248kB ± 0%        248kB ± 0%    ~     (all equal)

name       old bss-bytes     new bss-bytes     delta
HelloSize        125kB ± 0%        125kB ± 0%    ~     (all equal)
CmdGoSize        145kB ± 0%        145kB ± 0%    ~     (all equal)

name       old exe-bytes     new exe-bytes     delta
HelloSize       1.46MB ± 0%       1.45MB ± 0%  -0.31%  (p=0.000 n=10+10)
CmdGoSize       14.7MB ± 0%       14.7MB ± 0%  -0.17%  (p=0.000 n=10+10)

Change-Id: Ic72b0c189dd542f391e1c9ab88a76e9148dc4285
Reviewed-on: https://go-review.googlesource.com/106495
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
  • Loading branch information
mundaym committed May 8, 2018
1 parent 098ca84 commit f31a18d
Show file tree
Hide file tree
Showing 8 changed files with 4,330 additions and 1,014 deletions.
1 change: 1 addition & 0 deletions src/cmd/compile/internal/ssa/compile.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ var passes = [...]pass{
{name: "decompose builtin", fn: decomposeBuiltIn, required: true},
{name: "softfloat", fn: softfloat, required: true},
{name: "late opt", fn: opt, required: true}, // TODO: split required rules and optimizing rules
{name: "dead auto elim", fn: elimDeadAutosGeneric},
{name: "generic deadcode", fn: deadcode},
{name: "check bce", fn: checkbce},
{name: "branchelim", fn: branchelim},
Expand Down
147 changes: 147 additions & 0 deletions src/cmd/compile/internal/ssa/deadstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,153 @@ func dse(f *Func) {
}
}

// elimDeadAutosGeneric deletes autos that are never accessed. To acheive this
// we track the operations that the address of each auto reaches and if it only
// reaches stores then we delete all the stores. The other operations will then
// be eliminated by the dead code elimination pass.
func elimDeadAutosGeneric(f *Func) {
addr := make(map[*Value]GCNode) // values that the address of the auto reaches
elim := make(map[*Value]GCNode) // values that could be eliminated if the auto is
used := make(map[GCNode]bool) // used autos that must be kept

// visit the value and report whether any of the maps are updated
visit := func(v *Value) (changed bool) {
args := v.Args
switch v.Op {
case OpAddr:
// Propagate the address if it points to an auto.
n, ok := v.Aux.(GCNode)
if !ok || n.StorageClass() != ClassAuto {
return
}
if addr[v] == nil {
addr[v] = n
changed = true
}
return
case OpVarDef, OpVarKill:
// v should be eliminated if we eliminate the auto.
n, ok := v.Aux.(GCNode)
if !ok || n.StorageClass() != ClassAuto {
return
}
if elim[v] == nil {
elim[v] = n
changed = true
}
return
case OpVarLive:
// Don't delete the auto if it needs to be kept alive.
n, ok := v.Aux.(GCNode)
if !ok || n.StorageClass() != ClassAuto {
return
}
if !used[n] {
used[n] = true
changed = true
}
return
case OpStore, OpMove, OpZero:
// v should be elimated if we eliminate the auto.
n, ok := addr[args[0]]
if ok && elim[v] == nil {
elim[v] = n
changed = true
}
// Other args might hold pointers to autos.
args = args[1:]
}

// The code below assumes that we have handled all the ops
// with sym effects already. Sanity check that here.
// Ignore Args since they can't be autos.
if v.Op.SymEffect() != SymNone && v.Op != OpArg {
panic("unhandled op with sym effect")
}

if v.Uses == 0 || len(args) == 0 {
return
}

// If the address of the auto reaches a memory or control
// operation not covered above then we probably need to keep it.
if v.Type.IsMemory() || v.Type.IsFlags() || (v.Op != OpPhi && v.MemoryArg() != nil) {
for _, a := range args {
if n, ok := addr[a]; ok {
if !used[n] {
used[n] = true
changed = true
}
}
}
return
}

// Propagate any auto addresses through v.
node := GCNode(nil)
for _, a := range args {
if n, ok := addr[a]; ok && !used[n] {
if node == nil {
node = n
} else if node != n {
// Most of the time we only see one pointer
// reaching an op, but some ops can take
// multiple pointers (e.g. NeqPtr, Phi etc.).
// This is rare, so just propagate the first
// value to keep things simple.
used[n] = true
changed = true
}
}
}
if node == nil {
return
}
if addr[v] == nil {
// The address of an auto reaches this op.
addr[v] = node
changed = true
return
}
if addr[v] != node {
// This doesn't happen in practice, but catch it just in case.
used[node] = true
changed = true
}
return
}

iterations := 0
for {
if iterations == 4 {
// give up
return
}
iterations++
changed := false
for _, b := range f.Blocks {
for _, v := range b.Values {
changed = visit(v) || changed
}
}
if !changed {
break
}
}

// Eliminate stores to unread autos.
for v, n := range elim {
if used[n] {
continue
}
// replace with OpCopy
v.SetArgs1(v.MemoryArg())
v.Aux = nil
v.AuxInt = 0
v.Op = OpCopy
}
}

// elimUnreadAutos deletes stores (and associated bookkeeping ops VarDef and VarKill)
// to autos that are never read from.
func elimUnreadAutos(f *Func) {
Expand Down
Loading

0 comments on commit f31a18d

Please sign in to comment.