Skip to content

Commit

Permalink
zstd: Add delta encoding support
Browse files Browse the repository at this point in the history
This adds support for delta encoding, compatible with the --patch-from
option that was introduced in zstd reference v1.4.5:
https://github.com/facebook/zstd/wiki/Zstandard-as-a-patching-engine
  • Loading branch information
greatroar committed Jan 5, 2023
1 parent 8b191e4 commit 8307ba5
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 34 deletions.
49 changes: 24 additions & 25 deletions zstd/decoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,15 +341,8 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
return dst, err
}
if frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
return nil, ErrUnknownDictionary
}
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(&dict)
if err = d.setDict(frame); err != nil {
return nil, err
}
if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
Expand Down Expand Up @@ -495,18 +488,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame {
d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br)
if d.current.err == nil {
d.current.err = d.setDict(d.frame)
}
if d.current.err != nil {
return false
}
if d.frame.DictionaryID != nil {
dict, ok := d.dicts[*d.frame.DictionaryID]
if !ok {
d.current.err = ErrUnknownDictionary
return false
} else {
d.frame.history.setDict(&dict)
}
}
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded
return false
Expand Down Expand Up @@ -865,13 +852,8 @@ decodeStream:
if debugDecoder && err != nil {
println("Frame decoder returned", err)
}
if err == nil && frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
err = ErrUnknownDictionary
} else {
frame.history.setDict(&dict)
}
if err == nil {
err = d.setDict(frame)
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
Expand Down Expand Up @@ -953,3 +935,20 @@ decodeStream:
hist.reset()
d.frame.history.b = frameHistCache
}

// setDict looks up the dictionary named by the frame header and, when it is
// registered with the decoder, installs it into the frame's history.
//
// It returns ErrUnknownDictionary if the frame references a non-zero
// dictionary id that is not registered, and nil in every other case.
func (d *Decoder) setDict(frame *frameDec) (err error) {
	id := frame.DictionaryID

	if known, found := d.dicts[id]; found {
		if debugDecoder {
			println("setting dict", id)
		}
		frame.history.setDict(&known)
		return nil
	}

	// A zero (or missing) dictionary id means "unspecified": the decoder is
	// expected to know by other means whether a dictionary is in use.
	// zstd --patch-from relies on this id, so an unregistered id of zero is
	// not an error.
	if id == 0 {
		return nil
	}
	return ErrUnknownDictionary
}
17 changes: 16 additions & 1 deletion zstd/decoder_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
}

// WithDecoderDicts allows to register one or more dictionaries for the decoder.
// If several dictionaries with the same ID is provided the last one will be used.
//
// Each slice in dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// If several dictionaries with the same ID are provided, the last one will be used.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error {
for _, b := range dicts {
Expand All @@ -99,6 +105,15 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
}
}

// WithDecoderDictRaw registers a dictionary, identified by id, that may be
// used by the decoder.
//
// The slice content can be arbitrary data. It is retained by the option as-is,
// not copied.
func WithDecoderDictRaw(id uint32, content []byte) DOption {
	return func(o *decoderOptions) error {
		raw := dict{id: id, content: content}
		o.dicts = append(o.dicts, raw)
		return nil
	}
}

// WithDecoderMaxWindow allows to set a maximum window size for decodes.
// This allows rejecting packets that will cause big memory usage.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
Expand Down
35 changes: 35 additions & 0 deletions zstd/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -459,3 +459,38 @@ func readDicts(tb testing.TB, zr *zip.Reader) [][]byte {
}
return dicts
}

// TestDecoderRawDict tests decoding of zstd --patch-from output.
//
// It streams testdata/delta/target.txt.zst through a decoder that has
// testdata/delta/source.txt registered as a raw dictionary with id 0, and
// checks that the result equals testdata/delta/target.txt.
func TestDecoderRawDict(t *testing.T) {
	t.Parallel()

	dict, err := os.ReadFile("testdata/delta/source.txt")
	if err != nil {
		t.Fatal(err)
	}

	delta, err := os.Open("testdata/delta/target.txt.zst")
	if err != nil {
		t.Fatal(err)
	}
	defer delta.Close()

	dec, err := NewReader(delta, WithDecoderDictRaw(0, dict))
	if err != nil {
		t.Fatal(err)
	}
	// Release the decoder when the test finishes. Deferred calls run in
	// reverse order, so the decoder is closed before its input file.
	defer dec.Close()

	out, err := io.ReadAll(dec)
	if err != nil {
		t.Fatal(err)
	}

	ref, err := os.ReadFile("testdata/delta/target.txt")
	if err != nil {
		t.Fatal(err)
	}

	if !bytes.Equal(out, ref) {
		t.Errorf("mismatch: got %q, wanted %q", out, ref)
	}
}
17 changes: 17 additions & 0 deletions zstd/encoder_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,13 @@ func WithLowerEncoderMem(b bool) EOption {
}

// WithEncoderDict allows to register a dictionary that will be used for the encode.
//
// The slice dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// The encoder *may* choose to use no dictionary instead for certain payloads.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error {
d, err := loadDict(dict)
Expand All @@ -316,3 +322,14 @@ func WithEncoderDict(dict []byte) EOption {
return nil
}
}

// WithEncoderDictRaw registers a dictionary, identified by id, that may be
// used by the encoder.
//
// The slice content may contain arbitrary data. It will be used as an initial
// history. The slice is retained as-is, not copied.
func WithEncoderDictRaw(id uint32, content []byte) EOption {
	return func(o *encoderOptions) error {
		raw := dict{id: id, content: content}
		o.dict = &raw
		return nil
	}
}
47 changes: 47 additions & 0 deletions zstd/example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package zstd_test

import (
"bytes"
"fmt"

"github.com/klauspost/compress/zstd"
)

// ExampleWithEncoderDictRaw compresses a target text using a source text as a
// raw dictionary, decodes the result with the same dictionary to verify the
// round trip, and prints the delta size next to the size obtained by
// compressing the target without a dictionary.
func ExampleWithEncoderDictRaw() {
// "Raw" dictionaries can be used for compressed delta encoding.

source := []byte(`
This is the source file. Compression of the target file with
the source file as the dictionary will produce a compressed
delta encoding of the target file.`)
target := []byte(`
This is the target file. Decompression of the delta encoding with
the source file as the dictionary will produce this file.`)

// The dictionary id is arbitrary. We use zero for compatibility
// with zstd --patch-from, but applications can use any id
// not in the range [32768, 1<<31).
const id = 0

bestLevel := zstd.WithEncoderLevel(zstd.SpeedBestCompression)

// Encode target with source as the raw dictionary.
// Constructor errors are discarded here to keep the example short.
w, _ := zstd.NewWriter(nil, bestLevel,
zstd.WithEncoderDictRaw(id, source))
delta := w.EncodeAll(target, nil)
fmt.Printf("delta encoding: %4d bytes\n", len(delta))

// Decode with the same id and content, and verify the round trip.
r, _ := zstd.NewReader(nil, zstd.WithDecoderDictRaw(id, source))
out, err := r.DecodeAll(delta, nil)
if err != nil || !bytes.Equal(out, target) {
panic("decoding error")
}

// Ordinary compression, for reference.
w, _ = zstd.NewWriter(nil, bestLevel)
z := w.EncodeAll(target, nil)
fmt.Printf("compression only: %4d bytes\n", len(z))

// Output:
// delta encoding: 44 bytes
// compression only: 111 bytes
}
10 changes: 3 additions & 7 deletions zstd/framedec.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ type frameDec struct {

FrameContentSize uint64

DictionaryID *uint32
DictionaryID uint32
HasCheckSum bool
SingleSegment bool
}
Expand Down Expand Up @@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {

// Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
d.DictionaryID = nil
d.DictionaryID = 0
if size := fhd & 3; size != 0 {
if size == 3 {
size = 4
Expand All @@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder {
println("Dict size", size, "ID:", id)
}
if id > 0 {
// ID 0 means "sorry, no dictionary anyway".
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
d.DictionaryID = &id
}
d.DictionaryID = id
}

// Read Frame_Content_Size
Expand Down
5 changes: 5 additions & 0 deletions zstd/testdata/delta/source.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
0000000000000000

This file is to be used as the dictionary for compressing target.txt:

zstd -19 --patch-from=source.txt target.txt
5 changes: 5 additions & 0 deletions zstd/testdata/delta/target.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
0000000000000000

This file is to be compressed with source.txt as the dictionary:

zstd -19 --patch-from=source.txt target.txt
Binary file added zstd/testdata/delta/target.txt.zst
Binary file not shown.
1 change: 0 additions & 1 deletion zstd/zstd.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")

// ErrUnknownDictionary is returned if the dictionary ID is unknown.
// For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary")

// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
Expand Down

0 comments on commit 8307ba5

Please sign in to comment.