Skip to content

Commit

Permalink
zstd: Shorter and faster asm for decSymbol.newState (#896)
Browse files Browse the repository at this point in the history
* zstd: Shorter asm for decSymbol.newState

The asm needs to compute decSymbol.newState, which is

	uint16(state >> 16),

or, equivalently (except for types),

	uint32(state) >> 16.

This can be accomplished by a MOVL followed by a SHRL. avo elides the
MOVL, so we get a single SHRL instruction for both the BMI2 and non-BMI2
cases.

Benchmarks show no difference on a new BMI2-supporting machine, but on
an older i7, decompression throughput is a tiny bit faster:

	goos: linux
	goarch: amd64
	pkg: github.com/klauspost/compress/zstd
	cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
	                                       │     old      │                shift                │
	                                       │     B/s      │     B/s       vs base               │
	Decoder_DecodeAll/kppkn.gtb.zst-8        441.4Mi ± 2%   450.4Mi ± 0%  +2.03% (p=0.000 n=10)
	Decoder_DecodeAll/geo.protodata.zst-8    1.148Gi ± 1%   1.152Gi ± 0%  +0.34% (p=0.009 n=10)
	Decoder_DecodeAll/plrabn12.txt.zst-8     347.9Mi ± 0%   356.6Mi ± 1%  +2.48% (p=0.000 n=10)
	Decoder_DecodeAll/lcet10.txt.zst-8       417.4Mi ± 0%   427.3Mi ± 0%  +2.37% (p=0.000 n=10)
	Decoder_DecodeAll/asyoulik.txt.zst-8     347.1Mi ± 0%   352.7Mi ± 1%  +1.62% (p=0.003 n=10)
	Decoder_DecodeAll/alice29.txt.zst-8      346.3Mi ± 1%   352.6Mi ± 0%  +1.83% (p=0.000 n=10)
	Decoder_DecodeAll/html_x_4.zst-8         1.440Gi ± 0%   1.445Gi ± 0%  +0.29% (p=0.019 n=10)
	Decoder_DecodeAll/paper-100k.pdf.zst-8   4.191Gi ± 0%   4.210Gi ± 0%  +0.45% (p=0.007 n=10)
	Decoder_DecodeAll/fireworks.jpeg.zst-8   8.891Gi ± 0%   8.849Gi ± 0%  -0.47% (p=0.000 n=10)
	Decoder_DecodeAll/urls.10K.zst-8         589.6Mi ± 0%   600.2Mi ± 0%  +1.80% (p=0.001 n=10)
	Decoder_DecodeAll/html.zst-8             926.1Mi ± 1%   937.9Mi ± 0%  +1.27% (p=0.000 n=10)
	Decoder_DecodeAll/comp-data.bin.zst-8    389.6Mi ± 0%   395.1Mi ± 0%  +1.40% (p=0.000 n=10)
	geomean                                  832.6Mi        843.3Mi       +1.28%

* zstd: Remove unused parameter in asm generator
  • Loading branch information
greatroar authored Dec 9, 2023
1 parent 1dba04a commit 6bf960e
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 107 deletions.
39 changes: 12 additions & 27 deletions zstd/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,30 +316,30 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
SHRXQ(nBits, bits, bits) // bits >>= nBits
o.nextState(name+"_ofState", ofState, lowBits, "ofTable")
o.nextState(ofState, lowBits, "ofTable")
}
Comment("Update Match Length State")
{
nBits := mlState
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
SHRXQ(nBits, bits, bits) // bits >>= nBits
o.nextState(name+"_mlState", mlState, lowBits, "mlTable")
o.nextState(mlState, lowBits, "mlTable")
}
Comment("Update Literal Length State")
{
nBits := llState
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
o.nextState(name+"_llState", llState, lowBits, "llTable")
o.nextState(llState, lowBits, "llTable")
}
} else {
Comment("Update Literal Length State")
o.updateState(name+"_llState", llState, brValue, brBitsRead, "llTable")
o.updateState(llState, brValue, brBitsRead, "llTable")
Comment("Update Match Length State")
o.updateState(name+"_mlState", mlState, brValue, brBitsRead, "mlTable")
o.updateState(mlState, brValue, brBitsRead, "mlTable")
Comment("Update Offset State")
o.updateState(name+"_ofState", ofState, brValue, brBitsRead, "ofTable")
o.updateState(ofState, brValue, brBitsRead, "ofTable")
}
}
Label(name + "_skip_update")
Expand Down Expand Up @@ -631,8 +631,7 @@ func (o options) updateLength(name string, brValue, brBitsRead, state reg.GPVirt
}
}

func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtual, table string) {
name = name + "_updateState"
func (o options) updateState(state, brValue, brBitsRead reg.GPVirtual, table string) {
AX := GP64()
MOVBQZX(state.As8(), AX) // AX = nBits
// Check we have a reasonable nBits
Expand All @@ -642,15 +641,8 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
})

DX := GP64()
if o.bmi2 {
tmp := GP64()
MOVQ(U32(16|(16<<8)), tmp)
BEXTRQ(tmp, state, DX)
} else {
MOVQ(state, DX)
SHRQ(U8(16), DX)
MOVWQZX(DX.As16(), DX)
}
MOVL(state.As32(), DX.As32()) // Clear the top 32 bits.
SHRL(U8(16), DX.As32())

{
lowBits := o.getBits(AX, brValue, brBitsRead)
Expand Down Expand Up @@ -681,17 +673,10 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
MOVQ(Mem{Base: tablePtr, Index: DX, Scale: 8}, state)
}

func (o options) nextState(name string, state, lowBits reg.GPVirtual, table string) {
func (o options) nextState(state, lowBits reg.GPVirtual, table string) {
DX := GP64()
if o.bmi2 {
tmp := GP64()
MOVQ(U32(16|(16<<8)), tmp)
BEXTRQ(tmp, state, DX)
} else {
MOVQ(state, DX)
SHRQ(U8(16), DX)
MOVWQZX(DX.As16(), DX)
}
MOVL(state.As32(), DX.As32()) // Clear the top 32 bits.
SHRL(U8(16), DX.As32())

ADDQ(lowBits, DX)

Expand Down
136 changes: 56 additions & 80 deletions zstd/seqdec_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -177,8 +176,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -197,8 +195,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand Down Expand Up @@ -459,8 +456,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -479,8 +475,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -499,8 +494,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand Down Expand Up @@ -772,34 +766,31 @@ sequenceDecs_decode_bmi2_fill_2_end:
BZHIQ R14, R15, R15

// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
BZHIQ SI, R15, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -1032,34 +1023,31 @@ sequenceDecs_decode_56_bmi2_fill_end:
BZHIQ R14, R15, R15

// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
BZHIQ SI, R15, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -1967,8 +1955,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -1987,8 +1974,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -2007,8 +1993,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand Down Expand Up @@ -2514,34 +2499,31 @@ sequenceDecs_decodeSync_bmi2_fill_2_end:
BZHIQ R13, R14, R14

// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
BZHIQ SI, R14, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -3055,8 +3037,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -3075,8 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -3095,8 +3075,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand Down Expand Up @@ -3704,34 +3683,31 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
BZHIQ R13, R14, R14

// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
BZHIQ SI, R14, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down

0 comments on commit 6bf960e

Please sign in to comment.