Skip to content

Commit

Permalink
zstd: Fix reuse of huff0 when data hard to compress
Browse files Browse the repository at this point in the history
zstd would reject huff0 compressed literals if the improvement was too small.

However, this would still update the huff0 state to contain a new table, which could then be reused. In that case the wrong table could be used for the next block.

We move the rejection code to huff0, so the state can be properly maintained.

Fixes #170
  • Loading branch information
klauspost committed Oct 24, 2019
1 parent be100d6 commit 2988758
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 16 deletions.
16 changes: 11 additions & 5 deletions huff0/compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
canReuse = s.canUseTable(s.prevTable)
}

// We want the output size to be less than this:
wantSize := len(in)
if s.WantLogLess > 0 {
wantSize -= wantSize >> s.WantLogLess
}

// Reset for next run.
s.clearCount = true
s.maxCount = 0
Expand All @@ -77,7 +83,7 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
s.cTable = s.prevTable
s.Out, err = compressor(in)
s.cTable = keepTable
if err == nil && len(s.Out) < len(in) {
if err == nil && len(s.Out) < wantSize {
s.OutData = s.Out
return s.Out, true, nil
}
Expand All @@ -100,16 +106,16 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
hSize := len(s.Out)
oldSize := s.prevTable.estimateSize(s.count[:s.symbolLen])
newSize := s.cTable.estimateSize(s.count[:s.symbolLen])
if oldSize <= hSize+newSize || hSize+12 >= len(in) {
if oldSize <= hSize+newSize || hSize+12 >= wantSize {
// Retain cTable even if we re-use.
keepTable := s.cTable
s.cTable = s.prevTable
s.Out, err = compressor(in)
s.cTable = keepTable
if err != nil {
return nil, false, err
}
s.cTable = keepTable
if len(s.Out) >= len(in) {
if len(s.Out) >= wantSize {
return nil, false, ErrIncompressible
}
s.OutData = s.Out
Expand All @@ -131,7 +137,7 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
s.OutTable = nil
return nil, false, err
}
if len(s.Out) >= len(in) {
if len(s.Out) >= wantSize {
s.OutTable = nil
return nil, false, ErrIncompressible
}
Expand Down
6 changes: 6 additions & 0 deletions huff0/huff0.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ type Scratch struct {
// Reuse will specify the reuse policy
Reuse ReusePolicy

// WantLogLess allows specifying a log2 reduction that should at least be achieved,
// otherwise the block will be returned as incompressible.
// The reduction should then at least be (input size >> WantLogLess)
// If WantLogLess == 0 any improvement will do.
WantLogLess uint8

// MaxDecodedSize will set the maximum allowed output size.
// This value will automatically be set to BlockSizeMax if not set.
// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
Expand Down
10 changes: 2 additions & 8 deletions zstd/blockenc.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (b *blockEnc) init() {
b.coders.llEnc = &fseEncoder{}
b.coders.llPrev = &fseEncoder{}
}
b.litEnc = &huff0.Scratch{}
b.litEnc = &huff0.Scratch{WantLogLess: 4}
b.reset(nil)
}

Expand Down Expand Up @@ -415,16 +415,10 @@ func (b *blockEnc) encode() error {
if len(b.literals) >= 1024 {
// Use 4 Streams.
out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
if len(out) > len(b.literals)-len(b.literals)>>4 {
err = huff0.ErrIncompressible
}
} else if len(b.literals) > 32 {
// Use 1 stream
single = true
out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
if len(out) > len(b.literals)-len(b.literals)>>4 {
err = huff0.ErrIncompressible
}
} else {
err = huff0.ErrIncompressible
}
Expand Down Expand Up @@ -711,7 +705,7 @@ func (b *blockEnc) encode() error {
return nil
}

var errIncompressible = errors.New("uncompressible")
var errIncompressible = errors.New("incompressible")

func (b *blockEnc) genCodes() {
if len(b.sequences) == 0 {
Expand Down
6 changes: 3 additions & 3 deletions zstd/enc_dfast.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ encodeLoop:
if debug && s-t > e.maxMatchOff {
panic("s - t >e.maxMatchOff")
}
if debug {
if debugMatches {
println("long match")
}
break
Expand All @@ -259,7 +259,7 @@ encodeLoop:
// but the likelihood of both the first 4 bytes and the hash matching should be enough.
t = candidateL.offset - e.cur
s += checkAt
if debug {
if debugMatches {
println("long match (after short)")
}
break
Expand All @@ -275,7 +275,7 @@ encodeLoop:
if debug && t < 0 {
panic("t<0")
}
if debug {
if debugMatches {
println("short match")
}
break
Expand Down
1 change: 1 addition & 0 deletions zstd/zstd.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

const debug = false
const debugSequences = false
const debugMatches = false

// force encoder to use predefined tables.
const forcePreDef = false
Expand Down

0 comments on commit 2988758

Please sign in to comment.