compaction: don't split outputs within a user key

During a compaction, if the current sstable hits the file size limit, defer finishing the sstable if the next sstable would share a user key. This is the current behavior of flushes, and this change brings parity between the two. This change is motivated by introduction of range keys (see #1339). This ensures we can always cleanly truncate range keys that span range-key boundaries. This commit also removes (keyspan.Fragmenter).FlushTo. Now that we prohibit splitting sstables in the middle of a user key, the Fragmenter's FlushTo function is unnecessary. Compactions and flushes always use the TruncateAndFlushTo variant. This change required a tweak to the way grandparent limits are applied, in order to switch the grandparent splitter's comparison into a >= comparsion. This was necessary due to the shift in interpreting `splitterSuggestion`s as exclusive boundaries. Close #734.
cockroachdb · Feb 3, 2022 · a860bba · a860bba
1 parent 6c9f712
commit a860bba
Show file tree

Hide file tree

Showing 15 changed files with 180 additions and 624 deletions.
diff --git a/compaction.go b/compaction.go
diff --git a/compaction_iter.go b/compaction_iter.go
@@ -770,17 +770,14 @@ func (i *compactionIter) Close() error {
 // exclude specifies if the specified key is exclusive or inclusive.
 // When exclude = true, all returned range tombstones are truncated to the
 // specified key.
-func (i *compactionIter) Tombstones(key []byte, exclude bool) []keyspan.Span {
-	switch {
-	case key == nil:
+func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
+	if key == nil {
 		i.rangeDelFrag.Finish()
-	case exclude:
+	} else {
 		// The specified end key is exclusive; no versions of the specified
 		// user key (including range tombstones covering that key) should
 		// be flushed yet.
 		i.rangeDelFrag.TruncateAndFlushTo(key)
-	default:
-		i.rangeDelFrag.FlushTo(key)
 	}
 	tombstones := i.tombstones
 	i.tombstones = nil

diff --git a/compaction_iter_test.go b/compaction_iter_test.go
@@ -192,7 +192,7 @@ func TestCompactionIter(t *testing.T) {
 						if len(parts) == 2 {
 							key = []byte(parts[1])
 						}
-						for _, v := range iter.Tombstones(key, false) {
+						for _, v := range iter.Tombstones(key) {
 							fmt.Fprintf(&b, "%s-%s#%d\n",
 								v.Start.UserKey, v.End, v.Start.SeqNum())
 						}

diff --git a/compaction_picker.go b/compaction_picker.go
@@ -374,6 +374,11 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo
 // disableIsCompacting is true, isCompacting always returns false. This helps
 // avoid spurious races from being detected when this method is used outside
 // of compaction picking code.
+//
+// TODO(jackson): Compactions and flushes no longer split a user key between two
+// sstables. We could perform a migration, re-compacting any sstables with split
+// user keys, which would allow us to remove atomic compaction unit expansion
+// code.
 func expandToAtomicUnit(
 	cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool,
 ) (slice manifest.LevelSlice, isCompacting bool) {

diff --git a/docs/range_deletions.md b/docs/range_deletions.md
@@ -1,5 +1,9 @@
 # Range Deletions
 
+TODO: The following explanation of range deletions does not take into account
+the recent change to prohibit splitting of a user key between sstables. This
+change simplifies the logic, removing 'improperly truncated range tombstones.'
+
 TODO: The following explanation of range deletions ignores the
 kind/trailer that appears at the end of keys after the sequence
 number. This should be harmless but need to add a justification on why

diff --git a/internal/keyspan/fragmenter.go b/internal/keyspan/fragmenter.go
@@ -88,8 +88,8 @@ type Fragmenter struct {
 	// flushed. All pending spans have the same Start.UserKey.
 	pending []Span
 	// doneBuf is used to buffer completed span fragments when flushing to a
-	// specific key (e.g. FlushTo). It is cached in the Fragmenter to allow
-	// reuse.
+	// specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to
+	// allow reuse.
 	doneBuf []Span
 	// sortBuf is used to sort fragments by end key when flushing.
 	sortBuf spansByEndKey
@@ -266,88 +266,15 @@ func (f *Fragmenter) Empty() bool {
 	return f.finished || len(f.pending) == 0
 }
 
-// FlushTo flushes all of the fragments with a start key <= key. Used during
-// compaction to force emitting of spans which straddle an sstable boundary.
-// Note that the emitted spans are not truncated to the specified key. Consider
+// TruncateAndFlushTo flushes all of the fragments with a start key <= key,
+// truncating spans to the specified end key. Used during compaction to force
+// emitting of spans which straddle an sstable boundary. Consider
 // the scenario:
 //
 //     a---------k#10
 //          f#8
 //          f#7
-//
-// If the compaction logic splits f#8 and f#7 into different sstables, we can't
-// truncate the span [a,k) at f. Doing so could produce an sstable with the
-// records:
-//
-//     a----f#10
-//          f#8
-//
-// The span [a,f) does not cover the key f.
-func (f *Fragmenter) FlushTo(key []byte) {
-	if f.finished {
-		panic("pebble: span fragmenter already finished")
-	}
-
-	if f.flushedKey != nil {
-		switch c := f.Cmp(key, f.flushedKey); {
-		case c < 0:
-			panic(fmt.Sprintf("pebble: flush-to key (%s) < flushed key (%s)",
-				f.Format(key), f.Format(f.flushedKey)))
-		}
-	}
-	f.flushedKey = append(f.flushedKey[:0], key...)
-
-	if len(f.pending) > 0 {
-		// Since all of the pending spans have the same start key, we only need
-		// to compare against the first one.
-		switch c := f.Cmp(f.pending[0].Start.UserKey, key); {
-		case c > 0:
-			panic(fmt.Sprintf("pebble: keys must be in order: %s > %s",
-				f.Format(f.pending[0].Start.UserKey), f.Format(key)))
-		}
-		// Note that we explicitly do not return early here if Start.UserKey ==
-		// key. Similar to the scenario described above, consider:
-		//
-		//          f----k#10
-		//          f#8
-		//          f#7
-		//
-		// If the compaction logic splits f#8 and f#7 into different sstables, we
-		// have to emit the span [f,k) in both sstables.
-	}
-
-	// At this point we know that the new start key is greater than the pending
-	// spans start keys. We flush all span fragments with a start key
-	// <= key.
-	f.flush(f.pending, key)
-
-	// Truncate the pending spans to start with key, filtering any which
-	// would become empty.
-	pending := f.pending
-	f.pending = f.pending[:0]
-	for _, s := range pending {
-		if f.Cmp(key, s.End) < 0 {
-			//   s: a--+--e
-			// new:    c------
-			f.pending = append(f.pending, Span{
-				Start: base.MakeInternalKey(key, s.Start.SeqNum(), s.Start.Kind()),
-				End:   s.End,
-				Value: s.Value,
-			})
-		}
-	}
-}
-
-// TruncateAndFlushTo is similar to FlushTo, except it also truncates spans to
-// the specified end key by calling truncateAndFlush. Only called in compactions
-// where we can guarantee that all versions of UserKeys < key have been written,
-// or in other words, where we can ensure we don't split a user key across two
-// sstables. Going back to the scenario from above:
-//
-//    a---------k#10
-//         f#8
-//         f#7
-//
+///
 // Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will
 // flush this span:
 //

diff --git a/internal/keyspan/fragmenter_test.go b/internal/keyspan/fragmenter_test.go
@@ -45,14 +45,7 @@ func buildSpans(
 		},
 	}
 	for _, line := range strings.Split(s, "\n") {
-		if strings.HasPrefix(line, "flush-to ") {
-			parts := strings.Split(line, " ")
-			if len(parts) != 2 {
-				t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
-			}
-			f.FlushTo([]byte(parts[1]))
-			continue
-		} else if strings.HasPrefix(line, "truncate-and-flush-to ") {
+		if strings.HasPrefix(line, "truncate-and-flush-to ") {
 			parts := strings.Split(line, " ")
 			if len(parts) != 2 {
 				t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
@@ -204,30 +197,6 @@ func TestFragmenterDeleted(t *testing.T) {
 	})
 }
 
-func TestFragmenterFlushTo(t *testing.T) {
-	cmp := base.DefaultComparer.Compare
-	fmtKey := base.DefaultComparer.FormatKey
-
-	datadriven.RunTest(t, "testdata/fragmenter_flush_to", func(d *datadriven.TestData) string {
-		switch d.Cmd {
-		case "build":
-			return func() (result string) {
-				defer func() {
-					if r := recover(); r != nil {
-						result = fmt.Sprint(r)
-					}
-				}()
-
-				spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete)
-				return formatSpans(spans)
-			}()
-
-		default:
-			return fmt.Sprintf("unknown command: %s", d.Cmd)
-		}
-	})
-}
-
 func TestFragmenterTruncateAndFlushTo(t *testing.T) {
 	cmp := base.DefaultComparer.Compare
 	fmtKey := base.DefaultComparer.FormatKey

diff --git a/internal/keyspan/testdata/fragmenter_flush_to b/internal/keyspan/testdata/fragmenter_flush_to
diff --git a/internal/keyspan/testdata/fragmenter_values b/internal/keyspan/testdata/fragmenter_values
@@ -63,26 +63,3 @@ truncate-and-flush-to d
 3:     e-g                    orange
 1:     e-g                    coconut
 3:       g-i                  orange
-
-# NB: Unlike the above truncate-and-flush-to calls, a flush-to does not truncate
-# the end boundary. In this case, the fragments beginning at `c` are not
-# truncated to `d`, they're flushed with the bounadries formed by fragmentation
-# (`e`)
-build
-3: a-c                        apple
-2: a---e                      banana
-1: a-----g                    coconut
-flush-to d
-3:    d----i                  orange
-----
-3: a-c                        apple
-2: a-c                        banana
-1: a-c                        coconut
-2:   c-e                      banana
-1:   c-e                      coconut
-3:    de                      orange
-2:    de                      banana
-1:    de                      coconut
-3:     e-g                    orange
-1:     e-g                    coconut
-3:       g-i                  orange