cmd/collector/app/span_processor.go

// Copyright (c) 2019 The Jaeger Authors.
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package app

import (
	"sync/atomic"
	"time"

	"github.com/OneOfOne/go-utils/memory"
	tchannel "github.com/uber/tchannel-go"
	"go.uber.org/zap"

	"github.com/jaegertracing/jaeger/cmd/collector/app/sanitizer"
	"github.com/jaegertracing/jaeger/model"
	"github.com/jaegertracing/jaeger/pkg/queue"
	"github.com/jaegertracing/jaeger/storage/spanstore"
)

// if this proves to be too low, we can increase it
const MaxQueueSize = 1_000_000

// ProcessSpansOptions additional options passed to processor along with the spans.
type ProcessSpansOptions struct {
	SpanFormat       SpanFormat
	InboundTransport InboundTransport
}

// SpanProcessor handles model spans
type SpanProcessor interface {
	// ProcessSpans processes model spans and return with either a list of true/false success or an error
	ProcessSpans(mSpans []*model.Span, options ProcessSpansOptions) ([]bool, error)
}

type spanProcessor struct {
	queue              *queue.BoundedQueue
	metrics            *SpanProcessorMetrics
	preProcessSpans    ProcessSpans
	filterSpan         FilterSpan             // filter is called before the sanitizer but after preProcessSpans
	sanitizer          sanitizer.SanitizeSpan // sanitizer is called before processSpan
	processSpan        ProcessSpan
	logger             *zap.Logger
	spanWriter         spanstore.Writer
	reportBusy         bool
	numWorkers         int
	dynQueueSizeWarmup uint
	dynQueueSizeMemory uint
	bytesProcessed     uint64
	spansProcessed     uint64
	stopCh             chan struct{}
}

type queueItem struct {
	queuedTime time.Time
	span       *model.Span
}

// NewSpanProcessor returns a SpanProcessor that preProcesses, filters, queues, sanitizes, and processes spans
func NewSpanProcessor(
	spanWriter spanstore.Writer,
	opts ...Option,
) SpanProcessor {
	sp := newSpanProcessor(spanWriter, opts...)

	sp.queue.StartConsumers(sp.numWorkers, func(item interface{}) {
		value := item.(*queueItem)
		sp.processItemFromQueue(value)
	})

	sp.StartStateReporting(1 * time.Second)

	if sp.dynQueueSizeWarmup > 0 {
		sp.StartDynQueueSizeUpdater(time.Second)
	}

	return sp
}

func newSpanProcessor(spanWriter spanstore.Writer, opts ...Option) *spanProcessor {
	options := Options.apply(opts...)
	handlerMetrics := NewSpanProcessorMetrics(
		options.serviceMetrics,
		options.hostMetrics,
		options.extraFormatTypes)
	droppedItemHandler := func(item interface{}) {
		handlerMetrics.SpansDropped.Inc(1)
	}
	boundedQueue := queue.NewBoundedQueue(options.queueSize, droppedItemHandler)

	sp := spanProcessor{
		queue:              boundedQueue,
		metrics:            handlerMetrics,
		logger:             options.logger,
		preProcessSpans:    options.preProcessSpans,
		filterSpan:         options.spanFilter,
		sanitizer:          options.sanitizer,
		reportBusy:         options.reportBusy,
		numWorkers:         options.numWorkers,
		spanWriter:         spanWriter,
		dynQueueSizeWarmup: options.dynQueueSizeWarmup,
		stopCh:             make(chan struct{}),
		dynQueueSizeMemory: options.dynQueueSizeMemory,
	}

	processSpanFuncs := []ProcessSpan{}
	if options.dynQueueSizeWarmup > 0 {
		// add to processSpanFuncs
		options.logger.Info("Dynamically adjusting the queue size at runtime.",
			zap.Uint("memory-mib", options.dynQueueSizeMemory/1024/1024),
			zap.Uint("dyn-queue-size-warmup", options.dynQueueSizeWarmup))
		processSpanFuncs = append(processSpanFuncs, sp.countSpan)
	}
	processSpanFuncs = append(processSpanFuncs, options.preSave, sp.saveSpan)

	sp.processSpan = ChainedProcessSpan(processSpanFuncs...)
	return &sp
}

// Stop halts the span processor and all its go-routines.
func (sp *spanProcessor) Stop() {
	close(sp.stopCh)
	sp.queue.Stop()
}

func (sp *spanProcessor) saveSpan(span *model.Span) {
	if nil == span.Process {
		sp.logger.Error("process is empty for the span")
		sp.metrics.SavedErrBySvc.ReportServiceNameForSpan(span)
		return
	}

	startTime := time.Now()
	if err := sp.spanWriter.WriteSpan(span); err != nil {
		sp.logger.Error("Failed to save span", zap.Error(err))
		sp.metrics.SavedErrBySvc.ReportServiceNameForSpan(span)
	} else {
		sp.logger.Debug("Span written to the storage by the collector",
			zap.Stringer("trace-id", span.TraceID), zap.Stringer("span-id", span.SpanID))
		sp.metrics.SavedOkBySvc.ReportServiceNameForSpan(span)
	}
	sp.metrics.SaveLatency.Record(time.Since(startTime))
}

func (sp *spanProcessor) countSpan(span *model.Span) {
	atomic.AddUint64(&sp.bytesProcessed, memory.Sizeof(span))
	atomic.AddUint64(&sp.spansProcessed, 1)
}

func (sp *spanProcessor) ProcessSpans(mSpans []*model.Span, options ProcessSpansOptions) ([]bool, error) {
	sp.preProcessSpans(mSpans)
	sp.metrics.BatchSize.Update(int64(len(mSpans)))
	retMe := make([]bool, len(mSpans))
	for i, mSpan := range mSpans {
		ok := sp.enqueueSpan(mSpan, options.SpanFormat, options.InboundTransport)
		if !ok && sp.reportBusy {
			return nil, tchannel.ErrServerBusy
		}
		retMe[i] = ok
	}
	return retMe, nil
}

func (sp *spanProcessor) processItemFromQueue(item *queueItem) {
	sp.processSpan(sp.sanitizer(item.span))
	sp.metrics.InQueueLatency.Record(time.Since(item.queuedTime))
}

func (sp *spanProcessor) enqueueSpan(span *model.Span, originalFormat SpanFormat, transport InboundTransport) bool {
	spanCounts := sp.metrics.GetCountsForFormat(originalFormat, transport)
	spanCounts.ReceivedBySvc.ReportServiceNameForSpan(span)

	if !sp.filterSpan(span) {
		spanCounts.RejectedBySvc.ReportServiceNameForSpan(span)
		return true // as in "not dropped", because it's actively rejected
	}

	//add format tag
	span.Tags = append(span.Tags, model.String("internal.span.format", string(originalFormat)))

	item := &queueItem{
		queuedTime: time.Now(),
		span:       span,
	}
	return sp.queue.Produce(item)
}

func (sp *spanProcessor) StartStateReporting(reportPeriod time.Duration) {
	sp.background(reportPeriod, func() {
		sp.updateGauges()
	})
}

func (sp *spanProcessor) StartDynQueueSizeUpdater(reportPeriod time.Duration) {
	sp.background(reportPeriod, func() {
		sp.updateQueueSize()
	})
}

func (sp *spanProcessor) background(reportPeriod time.Duration, callback func()) {
	ticker := time.NewTicker(reportPeriod)
	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				callback()
			case <-sp.stopCh:
				return
			}
		}
	}()
}

func (sp *spanProcessor) updateQueueSize() {
	if sp.spansProcessed == 0 {
		return
	}

	if sp.dynQueueSizeWarmup == 0 {
		return
	}

	if sp.spansProcessed < uint64(sp.dynQueueSizeWarmup) {
		return
	}

	// first, we get the average size of a span, by dividing the bytes processed by num of spans
	average := sp.bytesProcessed / sp.spansProcessed

	// finally, we divide the available memory by the average size of a span
	idealQueueSize := float64(sp.dynQueueSizeMemory / uint(average))

	// cap the queue size, just to be safe...
	if idealQueueSize > MaxQueueSize {
		idealQueueSize = MaxQueueSize
	}

	var diff float64
	current := float64(sp.queue.Capacity())
	if idealQueueSize > current {
		diff = idealQueueSize / current
	} else {
		diff = current / idealQueueSize
	}

	// resizing is a costly operation, we only perform it if we are at least 10% apart from the desired value
	if diff > 1.1 {
		s := int(idealQueueSize)
		sp.logger.Info("Resizing the internal span queue", zap.Int("new-size", s), zap.Uint64("average-span-size-bytes", average))
		sp.queue.Resize(s)
	}
}

func (sp *spanProcessor) updateGauges() {
	sp.metrics.SpansBytes.Update(int64(atomic.LoadUint64(&sp.bytesProcessed)))
	sp.metrics.QueueLength.Update(int64(sp.queue.Size()))
	sp.metrics.QueueCapacity.Update(int64(sp.queue.Capacity()))
}