Skip to content

Commit

Permalink
*: infer sstable.TableFormat from FormatMajorVersion
Browse files Browse the repository at this point in the history
The [Pebble SSTable Format Versions RFC][1] outlines a new versioning
scheme used for new SSTable-level features. The RFC also outlines a
requirement that external callers (i.e. Cockroach) ensure that the
format version is compatible with other stores in a cluster.

To ensure tables are created with an explicit version, the
`(*pebble.Options).MakeWriterOptions` method now takes an additional
parameter that specifies the format version to be used when writing a
table. Previously, the table format was pinned at RocksDBv2.

Expose a `(FormatMajorVersion).MaxTableFormat()` method that returns the
maximum allowable table format for the given `FormatMajorVersion`. This
can be used to ensure that given a store major version, an
`sstable.Writer` is capped at writing tables with a maximum table
format, to ensure compatibility with other stores running at the same
version.

New `sstable.Writer`s within Pebble itself (in contrast to `Writer`s
created _external_ to a Pebble store, e.g. Cockroach generating a table
for backup or ingestion) select the table format by consulting the
current value of `(*DB).FormatMajorVersion()` and then infer the maximum
allowable version from `(FormatMajorVersion).MaxTableFormat()`.

Update the table footer encode and decode functions to take into account
the new Pebble magic string. This new magic string implies a
Pebble-specific versioning scheme (with v1 for block properties and v2
for range keys) with the same table footer format used in RocksDBv2.

Add a compatibility check to the table write path to assert that the
presence of specific features on a table must be accompanied by the
appropriate table format (i.e. block properties must be at least version
PebbleDBv1, etc.).

Update existing tests that need to make use of newer Pebble table
features (i.e. block properties and / or range keys) to explicitly set
the table format to opt into the newer features.

[1]: ./docs/RFCS/20220112_pebble_sstable_format_versions.md
  • Loading branch information
nicktrav committed Feb 3, 2022
1 parent 38b68e1 commit 6c9f712
Show file tree
Hide file tree
Showing 15 changed files with 372 additions and 51 deletions.
6 changes: 5 additions & 1 deletion compaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -2064,6 +2064,10 @@ func (d *DB) runCompaction(

snapshots := d.mu.snapshots.toSlice()
formatVers := d.mu.formatVers.vers
// The table is written at the maximum allowable format implied by the current
// format major version of the DB.
tableFormat := formatVers.MaxTableFormat()

// Release the d.mu lock while doing I/O.
// Note the unusual order: Unlock and then Lock.
d.mu.Unlock()
Expand Down Expand Up @@ -2115,7 +2119,7 @@ func (d *DB) runCompaction(
c.metrics[c.startLevel.level] = &LevelMetrics{}
}

writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level)
writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat)
if formatVers < FormatBlockPropertyCollector {
// Cannot yet write block properties.
writerOpts.BlockPropertyCollectors = nil
Expand Down
18 changes: 18 additions & 0 deletions format_major_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/vfs/atomicfs"
)
Expand Down Expand Up @@ -70,9 +71,26 @@ const (
// BlockPropertyCollectors.
FormatBlockPropertyCollector
// FormatNewest always contains the most recent format major version.
// NB: When adding new versions, the MaxTableFormat method should also be
// updated to return the maximum allowable version for the new
// FormatMajorVersion.
FormatNewest FormatMajorVersion = FormatBlockPropertyCollector
)

// MaxTableFormat reports the newest sstable.TableFormat that may be written
// when operating at format major version v. It panics if v is not one of the
// format major versions handled below.
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
	switch v {
	case FormatBlockPropertyCollector:
		return sstable.TableFormatPebblev1
	case FormatSetWithDelete, FormatVersioned, formatVersionedManifestMarker,
		FormatMostCompatible, FormatDefault:
		// Every other known version is capped at the RocksDBv2 table format.
		return sstable.TableFormatRocksDBv2
	}
	// Reached only for a version with no mapping above.
	panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
}

// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. Each migration is defined as a closure
// which will be invoked on the database before the new format major
Expand Down
27 changes: 27 additions & 0 deletions format_major_version_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"testing"

"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/vfs/atomicfs"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -174,3 +175,29 @@ func TestFormatMajorVersions(t *testing.T) {
})
}
}

// TestFormatMajorVersions_TableFormat checks, version by version, the
// sstable.TableFormat that MaxTableFormat returns for each FormatMajorVersion.
func TestFormatMajorVersions_TableFormat(t *testing.T) {
	// NB: The expectation table is deliberately spelled out in full rather than
	// derived, so that adding a new FormatMajorVersion without also giving it a
	// mapping shows up as a test failure.
	expected := map[FormatMajorVersion]sstable.TableFormat{
		FormatDefault:                 sstable.TableFormatRocksDBv2,
		FormatMostCompatible:          sstable.TableFormatRocksDBv2,
		formatVersionedManifestMarker: sstable.TableFormatRocksDBv2,
		FormatVersioned:               sstable.TableFormatRocksDBv2,
		FormatSetWithDelete:           sstable.TableFormatRocksDBv2,
		FormatBlockPropertyCollector:  sstable.TableFormatPebblev1,
	}

	// Every version from FormatMostCompatible through FormatNewest must map
	// to the table format recorded above.
	for vers := FormatMostCompatible; vers <= FormatNewest; vers++ {
		got := vers.MaxTableFormat()
		want := expected[vers]
		require.Equalf(t, want, got, "got %s; want %s", got, want)
	}

	// A version one past FormatNewest has no mapping and must panic.
	unknown := FormatNewest + 1
	require.Panics(t, func() { _ = unknown.MaxTableFormat() })
}
3 changes: 2 additions & 1 deletion internal/metamorphic/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,8 @@ func (o *ingestOp) build(t *test, h *history, b *pebble.Batch, i int) (string, e
}()

equal := t.opts.Comparer.Equal
w := sstable.NewWriter(f, t.opts.MakeWriterOptions(0))
tableFormat := t.db.FormatMajorVersion().MaxTableFormat()
w := sstable.NewWriter(f, t.opts.MakeWriterOptions(0, tableFormat))

var lastUserKey []byte
for key, value := iter.First(); key != nil; key, value = iter.Next() {
Expand Down
4 changes: 2 additions & 2 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -1247,15 +1247,15 @@ func (o *Options) MakeReaderOptions() sstable.ReaderOptions {

// MakeWriterOptions constructs sstable.WriterOptions for the specified level
// from the corresponding options in the receiver.
func (o *Options) MakeWriterOptions(level int) sstable.WriterOptions {
func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions {
var writerOpts sstable.WriterOptions
writerOpts.TableFormat = format
if o != nil {
writerOpts.Cache = o.Cache
writerOpts.Comparer = o.Comparer
if o.Merger != nil {
writerOpts.MergerName = o.Merger.Name
}
writerOpts.TableFormat = sstable.TableFormatRocksDBv2
writerOpts.TablePropertyCollectors = o.TablePropertyCollectors
writerOpts.BlockPropertyCollectors = o.BlockPropertyCollectors
}
Expand Down
2 changes: 1 addition & 1 deletion sstable/block_property_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -895,7 +895,7 @@ func TestBlockProperties(t *testing.T) {
_ = r.Close()
r = nil
}
var opts WriterOptions
opts := WriterOptions{TableFormat: TableFormatPebblev2}
for _, cmd := range td.CmdArgs {
switch cmd.Key {
case "block-size":
Expand Down
4 changes: 2 additions & 2 deletions sstable/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,14 @@ func runBuildCmd(
return meta, r, nil
}

func runBuildRawCmd(td *datadriven.TestData) (*WriterMetadata, *Reader, error) {
func runBuildRawCmd(td *datadriven.TestData, opts *WriterOptions) (*WriterMetadata, *Reader, error) {
mem := vfs.NewMem()
f0, err := mem.Create("test")
if err != nil {
return nil, nil, err
}

w := NewWriter(f0, WriterOptions{})
w := NewWriter(f0, *opts)
for i := range td.CmdArgs {
arg := &td.CmdArgs[i]
if arg.Key == "range-del-v1" {
Expand Down
91 changes: 91 additions & 0 deletions sstable/format.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
)

// TableFormat specifies the format version for sstables. Each value stands
// for a (magic string, version) pair; see ParseTableFormat and AsTuple for the
// conversion in each direction.
//
// The numeric value of a TableFormat is an internal identifier only and is
// distinct from the version half of the pair: TableFormatLevelDB, for example,
// has the internal value 1 while AsTuple reports version 0 for it.
type TableFormat uint32

// The available table formats, representing the tuple (magic number, version
// number). Note that these values are not (and should not be) serialized to
// disk. The ordering should follow the order the versions were introduced to
// Pebble (i.e. the history is linear).
const (
// TableFormatUnspecified is the zero value. ParseTableFormat returns it
// alongside an error.
TableFormatUnspecified TableFormat = iota
// TableFormatLevelDB is the format selected by the LevelDB magic string.
TableFormatLevelDB
// TableFormatRocksDBv2 is the format selected by the RocksDB magic string
// with version 2.
TableFormatRocksDBv2
TableFormatPebblev1 // Block properties.
TableFormatPebblev2 // Range keys.

// TableFormatMax is the newest format declared in this block.
TableFormatMax = TableFormatPebblev2
)

// ParseTableFormat maps a (magic, version) pair to the TableFormat it
// denotes.
//
// The LevelDB magic is accepted with any version. The RocksDB magic is
// accepted only with rocksDBFormatVersion2, and the Pebble magic only with
// versions 1 and 2. Any other combination yields TableFormatUnspecified and a
// corruption error.
func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
	if string(magic) == levelDBMagic {
		return TableFormatLevelDB, nil
	}

	if string(magic) == rocksDBMagic {
		if version == rocksDBFormatVersion2 {
			return TableFormatRocksDBv2, nil
		}
		return TableFormatUnspecified, base.CorruptionErrorf(
			"pebble/table: unsupported rocksdb format version %d", errors.Safe(version),
		)
	}

	if string(magic) != pebbleDBMagic {
		// None of the three recognized magic strings matched.
		return TableFormatUnspecified, base.CorruptionErrorf(
			"pebble/table: invalid table (bad magic number)",
		)
	}

	// Pebble magic: the version alone selects the format.
	switch version {
	case 1:
		return TableFormatPebblev1, nil
	case 2:
		return TableFormatPebblev2, nil
	}
	return TableFormatUnspecified, base.CorruptionErrorf(
		"pebble/table: unsupported pebble format version %d", errors.Safe(version),
	)
}

// AsTuple converts f into its (magic string, version) pair, the inverse of
// ParseTableFormat for the four concrete formats. It panics for any other
// value, including TableFormatUnspecified.
func (f TableFormat) AsTuple() (string, uint32) {
	if f == TableFormatLevelDB {
		return levelDBMagic, 0
	}
	if f == TableFormatRocksDBv2 {
		return rocksDBMagic, 2
	}
	if f == TableFormatPebblev1 {
		return pebbleDBMagic, 1
	}
	if f == TableFormatPebblev2 {
		return pebbleDBMagic, 2
	}
	// No tuple exists for this value.
	panic("sstable: unknown table format version tuple")
}

// String returns a human-readable label for f of the form "(Family,vN)",
// e.g. "(Pebble,v1)".
//
// String never panics. A value that is not one of the four concrete formats
// (this includes the zero value, TableFormatUnspecified) is rendered as
// "(Unknown)". This keeps the method safe to call directly when building
// error or log messages about a format that failed to parse or was never set.
// Code that must reject such values should use AsTuple, which does panic.
func (f TableFormat) String() string {
	switch f {
	case TableFormatLevelDB:
		return "(LevelDB)"
	case TableFormatRocksDBv2:
		return "(RocksDB,v2)"
	case TableFormatPebblev1:
		return "(Pebble,v1)"
	case TableFormatPebblev2:
		return "(Pebble,v2)"
	default:
		// Describe rather than crash: String is a diagnostic aid, not a
		// validity check.
		return "(Unknown)"
	}
}
84 changes: 84 additions & 0 deletions sstable/format_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestTableFormat_RoundTrip verifies that ParseTableFormat maps each valid
// (magic, version) pair to the expected TableFormat and that AsTuple maps it
// back to the same pair. It also verifies that invalid pairs are rejected
// with the expected error text.
func TestTableFormat_RoundTrip(t *testing.T) {
	type testCase struct {
		name    string
		magic   string
		version uint32
		want    TableFormat
		wantErr string
	}

	cases := []testCase{
		// Pairs that must parse and round-trip.
		{name: "LevelDB", magic: levelDBMagic, version: 0, want: TableFormatLevelDB},
		{name: "RocksDBv2", magic: rocksDBMagic, version: 2, want: TableFormatRocksDBv2},
		{name: "PebbleDBv1", magic: pebbleDBMagic, version: 1, want: TableFormatPebblev1},
		{name: "PebbleDBv2", magic: pebbleDBMagic, version: 2, want: TableFormatPebblev2},

		// Pairs that must be rejected.
		{
			name:    "Invalid RocksDB version",
			magic:   rocksDBMagic,
			version: 1,
			wantErr: "pebble/table: unsupported rocksdb format version 1",
		},
		{
			name:    "Invalid PebbleDB version",
			magic:   pebbleDBMagic,
			version: 3,
			wantErr: "pebble/table: unsupported pebble format version 3",
		},
		{
			name:    "Unknown magic string",
			magic:   "foo",
			wantErr: "pebble/table: invalid table (bad magic number)",
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// Tuple -> TableFormat.
			got, err := ParseTableFormat([]byte(c.magic), c.version)
			if c.wantErr != "" {
				// Error cases stop here; there is no format to convert back.
				require.EqualError(t, err, c.wantErr)
				return
			}
			require.NoError(t, err)
			require.Equal(t, c.want, got)

			// TableFormat -> Tuple.
			magic, version := got.AsTuple()
			require.Equal(t, c.magic, magic)
			require.Equal(t, c.version, version)
		})
	}
}
15 changes: 2 additions & 13 deletions sstable/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,6 @@ type FilterWriter = base.FilterWriter
// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy

// TableFormat specifies the format version for sstables. The legacy LevelDB
// format is format version 0.
type TableFormat uint32

// The available table formats, representing the tuple (magic number, version
// number). Note that these values are not (and should not) be serialized to
// disk.
const (
TableFormatUnspecified TableFormat = iota
TableFormatLevelDB
TableFormatRocksDBv2
)

// TablePropertyCollector provides a hook for collecting user-defined
// properties based on the keys and values stored in an sstable. A new
// TablePropertyCollector is created for an sstable when the sstable is being
Expand Down Expand Up @@ -234,6 +221,8 @@ func (o WriterOptions) ensureDefaults() WriterOptions {
if o.Checksum == ChecksumTypeNone {
o.Checksum = ChecksumTypeCRC32c
}
// By default, if the table format is not specified, fall back to using the
// most compatible format.
if o.TableFormat == TableFormatUnspecified {
o.TableFormat = TableFormatRocksDBv2
}
Expand Down
Loading

0 comments on commit 6c9f712

Please sign in to comment.