Check entry empty state to ensure GC eligible #3634

Merged
merged 37 commits on Aug 19, 2021
Changes from 1 commit
37 commits
de7e6b9
WIP - refactor flush state marking to occur after index flush
rallen090 Jul 28, 2021
9fc4934
WIP - remove unused ns marking func
rallen090 Jul 28, 2021
e19a2ac
WIP - do not remove from index if series not empty
rallen090 Jul 29, 2021
ca70206
WIP - remove flushed block checks
rallen090 Jul 29, 2021
6a2eaf1
Cleanup 1
rallen090 Jul 29, 2021
5f1be6c
Mock gen
rallen090 Jul 29, 2021
de4c165
Fix tests 1
rallen090 Jul 29, 2021
f3117c9
Fix TestBlockWriteBackgroundCompact
rallen090 Aug 2, 2021
6568044
Lint
rallen090 Aug 2, 2021
7c1f06c
WIP - fix index flush conditions
rallen090 Aug 2, 2021
75842e3
WIP - fix index flush conditions 2
rallen090 Aug 2, 2021
ee6e6ea
Add test to verify warm flush ordering
rallen090 Aug 2, 2021
db5552f
Lint
rallen090 Aug 2, 2021
18984ca
Merge remote-tracking branch 'origin/r/index-active-block' into ra/in…
rallen090 Aug 3, 2021
1e81687
Experimental index flush matching
rallen090 Aug 5, 2021
8344c17
Use maps for shard flushes
rallen090 Aug 10, 2021
c6c3b4f
Mark flushed shards based on block size
rallen090 Aug 17, 2021
0b28fe7
Fixup shard marking logic
rallen090 Aug 17, 2021
3bf7412
Mock
rallen090 Aug 17, 2021
dd4dd0a
Fix test
rallen090 Aug 17, 2021
4084517
Fix test TestFlushManagerNamespaceIndexingEnabled
rallen090 Aug 17, 2021
f83f864
Lint
rallen090 Aug 17, 2021
e80e383
Add RelookupAndCheckIsEmpty
rallen090 Aug 18, 2021
db170c5
Mock
rallen090 Aug 18, 2021
db0b9c1
Fix OnIndexSeries ref type
rallen090 Aug 18, 2021
b2a016c
Cleanup feedback
rallen090 Aug 18, 2021
5c5d6e0
Fixing tests 1
rallen090 Aug 18, 2021
a4e1f0a
Fixing tests 2
rallen090 Aug 18, 2021
b5b3713
Mock
rallen090 Aug 18, 2021
cdcab19
Lint
rallen090 Aug 18, 2021
cad7fcd
More fixing tests 1
rallen090 Aug 18, 2021
7070278
Lint
rallen090 Aug 18, 2021
45e9a92
Split warm flush status into data and index
rallen090 Aug 19, 2021
3e58b36
Fix tests
rallen090 Aug 19, 2021
548591f
Fixing tests
rallen090 Aug 19, 2021
c68120b
Fixing tests 2
rallen090 Aug 19, 2021
6651096
For bootstrapping just use presence of datafiles
rallen090 Aug 19, 2021
116 changes: 18 additions & 98 deletions src/dbnode/storage/flush.go
@@ -54,20 +54,6 @@ const (
flushManagerIndexFlushInProgress
)

type namespaceFlushes map[string]namespaceFlush

type namespaceFlush struct {
namespace databaseNamespace
shardFlushes shardFlushes
}

type shardFlushes map[shardFlushKey]databaseShard

type shardFlushKey struct {
shardID uint32
blockStart xtime.UnixNano
}

type flushManagerMetrics struct {
isFlushing tally.Gauge
isSnapshotting tally.Gauge
@@ -158,8 +144,7 @@ func (m *flushManager) Flush(startTime xtime.UnixNano) error {
// will attempt to snapshot blocks w/ unflushed data which would be wasteful if
// the block is already flushable.
multiErr := xerrors.NewMultiError()
dataFlushes, err := m.dataWarmFlush(namespaces, startTime)
if err != nil {
if err := m.dataWarmFlush(namespaces, startTime); err != nil {
multiErr = multiErr.Add(err)
}

@@ -174,69 +159,26 @@ func (m *flushManager) Flush(startTime xtime.UnixNano) error {
multiErr = multiErr.Add(fmt.Errorf("error rotating commitlog in mediator tick: %v", err))
}

indexFlushes, err := m.indexFlush(namespaces)
if err != nil {
if err := m.indexFlush(namespaces); err != nil {
multiErr = multiErr.Add(err)
}

err = multiErr.FinalError()

// Mark all flushed shards as such.
// If index is not enabled, then a shard+blockStart is "flushed" if the data has been flushed.
// If index is enabled, then a shard+blockStart is "flushed" if the data AND index has been flushed.
for _, n := range namespaces {
var (
indexEnabled = n.Options().IndexOptions().Enabled()
flushed shardFlushes
)
if indexEnabled {
flushesForNs, ok := indexFlushes[n.ID().String()]
if !ok {
continue
}
flushed = flushesForNs.shardFlushes
} else {
flushesForNs, ok := dataFlushes[n.ID().String()]
if !ok {
continue
}
flushed = flushesForNs.shardFlushes
}

for k, v := range flushed {
// Block sizes for data and index can differ and so if we are driving the flushing by
// the index blockStarts, we must expand them to mark all containing data blockStarts.
// E.g. if blockSize == 2h and indexBlockSize == 4h and the flushed index time is 6:00pm,
// we should mark as flushed [6:00pm, 8:00pm].
if indexEnabled {
blockSize := n.Options().RetentionOptions().BlockSize()
indexBlockSize := n.Options().IndexOptions().BlockSize()
for start := k.blockStart; start < k.blockStart.Add(indexBlockSize); start = start.Add(blockSize) {
v.MarkWarmFlushStateSuccessOrError(start, err)
}
} else {
v.MarkWarmFlushStateSuccessOrError(k.blockStart, err)
}
}
}

return err
return multiErr.FinalError()
}

func (m *flushManager) dataWarmFlush(
namespaces []databaseNamespace,
startTime xtime.UnixNano,
) (namespaceFlushes, error) {
) error {
flushPersist, err := m.pm.StartFlushPersist()
if err != nil {
return nil, err
return err
}

m.setState(flushManagerFlushInProgress)
var (
start = m.nowFn()
multiErr = xerrors.NewMultiError()
allFlushes = make(map[string]namespaceFlush)
start = m.nowFn()
multiErr = xerrors.NewMultiError()
)
for _, ns := range namespaces {
// Flush first because we will only snapshot if there are no outstanding flushes.
@@ -245,11 +187,9 @@ func (m *flushManager) dataWarmFlush(
multiErr = multiErr.Add(err)
continue
}
flush, err := m.flushNamespaceWithTimes(ns, flushTimes, flushPersist)
if err != nil {
if err := m.flushNamespaceWithTimes(ns, flushTimes, flushPersist); err != nil {
multiErr = multiErr.Add(err)
}
allFlushes[ns.ID().String()] = flush
}

err = flushPersist.DoneFlush()
@@ -258,7 +198,7 @@
}

m.metrics.dataWarmFlushDuration.Record(m.nowFn().Sub(start))
return allFlushes, multiErr.FinalError()
return multiErr.FinalError()
}

func (m *flushManager) dataSnapshot(
@@ -312,17 +252,16 @@ func (m *flushManager) dataSnapshot(

func (m *flushManager) indexFlush(
namespaces []databaseNamespace,
) (namespaceFlushes, error) {
) error {
indexFlush, err := m.pm.StartIndexPersist()
if err != nil {
return nil, err
return err
}

m.setState(flushManagerIndexFlushInProgress)
var (
start = m.nowFn()
multiErr = xerrors.NewMultiError()
namespaceFlushes = make(map[string]namespaceFlush)
start = m.nowFn()
multiErr = xerrors.NewMultiError()
)
for _, ns := range namespaces {
var (
@@ -333,20 +272,14 @@ func (m *flushManager) indexFlush(
continue
}

flushes, err := ns.FlushIndex(indexFlush)
if err != nil {
if err := ns.FlushIndex(indexFlush); err != nil {
multiErr = multiErr.Add(err)
} else {
namespaceFlushes[ns.ID().String()] = namespaceFlush{
namespace: ns,
shardFlushes: flushes,
}
}
}
multiErr = multiErr.Add(indexFlush.DoneIndex())

m.metrics.indexFlushDuration.Record(m.nowFn().Sub(start))
return namespaceFlushes, multiErr.FinalError()
return multiErr.FinalError()
}

func (m *flushManager) Report() {
@@ -430,31 +363,18 @@ func (m *flushManager) flushNamespaceWithTimes(
ns databaseNamespace,
times []xtime.UnixNano,
flushPreparer persist.FlushPreparer,
) (namespaceFlush, error) {
flushes := make(shardFlushes)
) error {
multiErr := xerrors.NewMultiError()
for _, t := range times {
// NB(xichen): we still want to proceed if a namespace fails to flush its data.
// Probably want to emit a counter here, but for now just log it.
shards, err := ns.WarmFlush(t, flushPreparer)
if err != nil {
if err := ns.WarmFlush(t, flushPreparer); err != nil {
detailedErr := fmt.Errorf("namespace %s failed to flush data: %v",
ns.ID().String(), err)
multiErr = multiErr.Add(detailedErr)
continue
}

for _, s := range shards {
flushes[shardFlushKey{
shardID: s.ID(),
blockStart: t,
}] = s
}
}
return namespaceFlush{
namespace: ns,
shardFlushes: flushes,
}, multiErr.FinalError()
return multiErr.FinalError()
}

func (m *flushManager) LastSuccessfulSnapshotStartTime() (xtime.UnixNano, bool) {
16 changes: 5 additions & 11 deletions src/dbnode/storage/flush_test.go
@@ -321,12 +321,10 @@ func TestFlushManagerSkipNamespaceIndexingDisabled(t *testing.T) {
ns.EXPECT().Options().Return(nsOpts).AnyTimes()
ns.EXPECT().ID().Return(defaultTestNs1ID).AnyTimes()
ns.EXPECT().NeedsFlush(gomock.Any(), gomock.Any()).Return(true, nil).AnyTimes()
ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return([]databaseShard{s1, s2}, nil).AnyTimes()
ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
s1.EXPECT().ID().Return(uint32(1)).AnyTimes()
s2.EXPECT().ID().Return(uint32(2)).AnyTimes()
s1.EXPECT().MarkWarmFlushStateSuccessOrError(gomock.Any(), nil).AnyTimes()
s2.EXPECT().MarkWarmFlushStateSuccessOrError(gomock.Any(), nil).AnyTimes()

var (
mockFlushPersist = persist.NewMockFlushPreparer(ctrl)
@@ -377,17 +375,13 @@ func TestFlushManagerNamespaceIndexingEnabled(t *testing.T) {
// Validate that the flush state is marked as successful only AFTER all prerequisite steps have been run.
// Order is important to avoid any edge case where data is GCed from memory without all flushing operations
// being completed.
mockFlushedShards := shardFlushes{
shardFlushKey{shardID: s1.ID(), blockStart: xtime.Now().Add(time.Minute * 1)}: s1,
shardFlushKey{shardID: s2.ID(), blockStart: xtime.Now().Add(time.Minute * 1)}: s2,
}
steps := make([]*gomock.Call, 0)
steps = append(steps,
ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return([]databaseShard{s1, s2}, nil).Times(blocks),
ns.EXPECT().WarmFlush(gomock.Any(), gomock.Any()).Return(nil).Times(blocks),
ns.EXPECT().Snapshot(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes(),
ns.EXPECT().FlushIndex(gomock.Any()).Return(mockFlushedShards, nil),
s1.EXPECT().MarkWarmFlushStateSuccessOrError(gomock.Any(), nil),
s2.EXPECT().MarkWarmFlushStateSuccessOrError(gomock.Any(), nil),
ns.EXPECT().FlushIndex(gomock.Any()).Return(nil),
s1.EXPECT().MarkWarmIndexFlushStateSuccessOrError(gomock.Any(), nil),
s2.EXPECT().MarkWarmIndexFlushStateSuccessOrError(gomock.Any(), nil),
)
gomock.InOrder(steps...)

7 changes: 6 additions & 1 deletion src/dbnode/storage/fs.go
@@ -39,12 +39,17 @@ const (
fileOpFailed
)

type warmStatus struct {
DataFlushed fileOpStatus
IndexFlushed fileOpStatus
}

type fileOpState struct {
// WarmStatus is the status of data persistence for WarmWrites only.
// Each block will only be warm-flushed once, so not keeping track of a
// version here is okay. This is used in the buffer Tick to determine when
// a warm bucket is evictable from memory.
WarmStatus fileOpStatus
WarmStatus warmStatus
// ColdVersionRetrievable keeps track of data persistence for ColdWrites only.
// Each block can be cold-flushed multiple times, so this tracks which
// version of the flush completed successfully. This is ultimately used in
61 changes: 40 additions & 21 deletions src/dbnode/storage/index.go
@@ -1007,15 +1007,15 @@ func (i *nsIndex) tickingBlocks(
func (i *nsIndex) WarmFlush(
flush persist.IndexFlush,
shards []databaseShard,
) (shardFlushes, error) {
) error {
if len(shards) == 0 {
// No-op if no shards currently owned.
return nil, nil
return nil
}

flushable, err := i.flushableBlocks(shards, series.WarmWrite)
if err != nil {
return nil, err
return err
}

// Determine the current flush indexing concurrency.
@@ -1029,7 +1029,7 @@ func (i *nsIndex) WarmFlush(

builder, err := builder.NewBuilderFromDocuments(builderOpts)
if err != nil {
return nil, err
return err
}
defer builder.Close()

@@ -1039,11 +1039,10 @@ func (i *nsIndex) WarmFlush(
defer i.metrics.flushIndexingConcurrency.Update(0)

var evicted int
flushes := make(shardFlushes)
for _, block := range flushable {
immutableSegments, err := i.flushBlock(flush, block, shards, builder)
if err != nil {
return nil, err
return err
}
// Make a result that covers the entire time ranges for the
// block for each shard
@@ -1060,7 +1059,7 @@ func (i *nsIndex) WarmFlush(
results := result.NewIndexBlockByVolumeType(block.StartTime())
results.SetBlock(idxpersist.DefaultIndexVolumeType, blockResult)
if err := block.AddResults(results); err != nil {
return nil, err
return err
}

evicted++
@@ -1074,18 +1073,16 @@ func (i *nsIndex) WarmFlush(
zap.Error(err),
zap.Time("blockStart", block.StartTime().ToTime()),
)
continue
}

for _, s := range shards {
flushes[shardFlushKey{
shardID: s.ID(),
blockStart: block.StartTime(),
}] = s
for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) {
for _, s := range shards {
s.MarkWarmIndexFlushStateSuccessOrError(t, err)
Collaborator:
So, IIUC, blockStartsFromIndexBlockStart returns data block starts between index block start and index block start + index block size. This is important because index block size >= data block size. If that's true, why do we need to MarkWarmIndexFlushStateSuccessOrError for each data block start time?

Collaborator (Author):
That's correct. The reason is that MarkWarmIndexFlushStateSuccessOrError still marks data blockStarts (not index blockStarts), but each block now carries a flag for both data and index. The state we are tracking is always at the data block size, not the index block size. E.g. with a 1h block and a 2h indexBlock, in-mem we now have:

1pm: {dataFlushed: bool, indexFlushed: bool}
2pm: {dataFlushed: bool, indexFlushed: bool}
3pm: {dataFlushed: bool, indexFlushed: bool}
4pm: {dataFlushed: bool, indexFlushed: bool}
...

When determining whether a given block is eligible for GC, we check both dataFlushed && indexFlushed. So when an index flush occurs, it actually covers 2 data blocks in this case.

This is a somewhat confusing nuance, though, so if you have any suggestions for making this easier to understand, let me know.
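
To illustrate the idea, here is a minimal, self-contained sketch: the warmStatus struct mirrors the one this PR adds to fs.go, while the fileOpStatus constants and the isWarmEvictable helper are hypothetical stand-ins (not the actual shard code), shown only to make the dataFlushed && indexFlushed check concrete.

```go
// Sketch only: warmStatus mirrors the struct added to fs.go in this PR;
// fileOpStatus values and isWarmEvictable are re-declared locally for the example.
package main

import "fmt"

type fileOpStatus int

const (
	fileOpNotStarted fileOpStatus = iota
	fileOpSuccess
	fileOpFailed
)

// warmStatus tracks warm-flush state per data blockStart, split into data and index.
type warmStatus struct {
	DataFlushed  fileOpStatus
	IndexFlushed fileOpStatus
}

// isWarmEvictable: a data block is GC-eligible only once both its data and the
// index block covering it have been warm-flushed successfully.
func isWarmEvictable(s warmStatus) bool {
	return s.DataFlushed == fileOpSuccess && s.IndexFlushed == fileOpSuccess
}

func main() {
	// 1h data blocks, 2h index blocks: flushing one index block marks
	// IndexFlushed on both data blockStarts it covers (e.g. 1pm and 2pm).
	blocks := map[string]warmStatus{
		"1pm": {DataFlushed: fileOpSuccess, IndexFlushed: fileOpSuccess},
		"2pm": {DataFlushed: fileOpSuccess, IndexFlushed: fileOpSuccess},
		"3pm": {DataFlushed: fileOpSuccess, IndexFlushed: fileOpNotStarted},
	}
	for t, s := range blocks {
		fmt.Printf("%s evictable: %v\n", t, isWarmEvictable(s))
	}
}
```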

Collaborator:
Yeah, it may be worth adding a comment here just to clarify.

}
}
}
i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted))
return flushes, nil
return nil
}

func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) {
@@ -1115,6 +1112,18 @@ func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) {
}, nil
}

// WarmFlushedBlockStarts returns all index blockStarts which have been flushed to disk.
func (i *nsIndex) WarmFlushedBlockStarts() []xtime.UnixNano {
flushed := make([]xtime.UnixNano, 0)
infoFiles := i.readInfoFilesAsMap()
for blockStart := range infoFiles {
if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) {
flushed = append(flushed, blockStart)
}
}
return flushed
}

func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano]fs.ReadIndexInfoFileResult {
fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
@@ -1198,18 +1207,15 @@ func (i *nsIndex) canFlushBlockWithRLock(
Debug("skipping index cold flush due to shard not bootstrapped yet")
continue
}
start := blockStart
end := blockStart.Add(i.blockSize)
dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize()
for t := start; t.Before(end); t = t.Add(dataBlockSize) {

for _, t := range i.blockStartsFromIndexBlockStart(blockStart) {
flushState, err := shard.FlushState(t)
if err != nil {
return false, err
}

// Skip if the data flushing failed. We mark as "success" only once both
// data and index are flushed.
if flushState.WarmStatus == fileOpFailed {
// Skip if the data flushing failed. Data flushing precedes index flushing.
if flushState.WarmStatus.DataFlushed != fileOpSuccess {
return false, nil
}
}
Expand All @@ -1218,6 +1224,19 @@ func (i *nsIndex) canFlushBlockWithRLock(
return true, nil
}

// blockStartsFromIndexBlockStart returns the possibly many blockStarts that exist within
// a given index block (since index block size >= data block size).
func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano {
start := blockStart
end := blockStart.Add(i.blockSize)
dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize()
blockStarts := make([]xtime.UnixNano, 0)
for t := start; t.Before(end); t = t.Add(dataBlockSize) {
blockStarts = append(blockStarts, t)
}
return blockStarts
}

func (i *nsIndex) hasIndexWarmFlushedToDisk(
infoFiles map[xtime.UnixNano]fs.ReadIndexInfoFileResult,
blockStart xtime.UnixNano,