diff --git a/OpenTelemetry.sln b/OpenTelemetry.sln index bd7a7fbe8e1..19b4f346c93 100644 --- a/OpenTelemetry.sln +++ b/OpenTelemetry.sln @@ -249,6 +249,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-console", " EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "getting-started-jaeger", "docs\trace\getting-started-jaeger\getting-started-jaeger.csproj", "{A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "tail-based-sampling-example", "docs\trace\tail-based-sampling-span-level\tail-based-sampling-example.csproj", "{800DB925-6014-4136-AC01-3356CF7CADD3}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "stratified-sampling-example", "docs\trace\stratified-sampling-example\stratified-sampling-example.csproj", "{9C99621C-343E-479C-A943-332DB6129B71}" EndProject Global @@ -525,6 +527,10 @@ Global {A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Debug|Any CPU.Build.0 = Debug|Any CPU {A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.ActiveCfg = Release|Any CPU {A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9}.Release|Any CPU.Build.0 = Release|Any CPU + {800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {800DB925-6014-4136-AC01-3356CF7CADD3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {800DB925-6014-4136-AC01-3356CF7CADD3}.Release|Any CPU.Build.0 = Release|Any CPU {9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9C99621C-343E-479C-A943-332DB6129B71}.Debug|Any CPU.Build.0 = Debug|Any CPU {9C99621C-343E-479C-A943-332DB6129B71}.Release|Any CPU.ActiveCfg = Release|Any CPU @@ -568,6 +574,7 @@ Global {DEDE8442-03CA-48CF-99B9-EA224D89D148} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818} {EF4F6280-14D1-49D4-8095-1AC36E169AA8} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818} {A0C0B77C-6C7B-4EC2-AC61-EA1F489811B9} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818} + 
{800DB925-6014-4136-AC01-3356CF7CADD3} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818} {9C99621C-343E-479C-A943-332DB6129B71} = {5B7FB835-3FFF-4BC2-99C5-A5B5FAE3C818} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution diff --git a/docs/trace/tail-based-sampling-span-level/ParentBasedElseAlwaysRecordSampler.cs b/docs/trace/tail-based-sampling-span-level/ParentBasedElseAlwaysRecordSampler.cs new file mode 100644 index 00000000000..ae15a40d370 --- /dev/null +++ b/docs/trace/tail-based-sampling-span-level/ParentBasedElseAlwaysRecordSampler.cs @@ -0,0 +1,61 @@ +// +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +using OpenTelemetry.Trace; + +namespace SDKBasedSpanLevelTailSamplingSample; + +/// +/// Note: This is a proof-of-concept and is not meant to be used directly in production. +/// This is a composite sampler used to achieve a combination of parent-based sampling +/// and SDK-side "span-level" tail-based sampling. +/// It first invokes a head-sampling mechanism using the parent based sampling approach. +/// If the parent based sampler's decision is to sample it (i.e., record and export the span), +/// it retains that decision. If not, it returns a "record-only" sampling result that can be +/// changed later by a span processor based on span attributes (e.g., failure) that become +/// available only by the end of the span. 
/// <summary>
/// Proof-of-concept composite sampler; it is not meant to be used directly in production.
/// A parent-based head sampler is consulted first. Any result other than
/// <see cref="SamplingDecision.Drop"/> is returned unchanged. A drop is replaced by a
/// <see cref="SamplingDecision.RecordOnly"/> result so that a span processor can still
/// decide, at the end of the span, to export it (for example because it failed).
/// </summary>
internal class ParentBasedElseAlwaysRecordSampler : Sampler
{
    private const double DefaultSamplingProbabilityForRootSpan = 0.1;

    // Head sampler: parent-based, with a trace-id-ratio sampler for root spans.
    private readonly ParentBasedSampler headSampler;

    /// <summary>
    /// Initializes a new instance of the <see cref="ParentBasedElseAlwaysRecordSampler"/> class.
    /// </summary>
    /// <param name="samplingProbabilityForRootSpan">
    /// Probability handed to the trace-id-ratio sampler used for root spans.
    /// </param>
    public ParentBasedElseAlwaysRecordSampler(double samplingProbabilityForRootSpan = DefaultSamplingProbabilityForRootSpan)
    {
        var rootSampler = new TraceIdRatioBasedSampler(samplingProbabilityForRootSpan);
        this.headSampler = new ParentBasedSampler(rootSampler);
    }

    /// <inheritdoc/>
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
    {
        // Head-based decision comes first.
        var headResult = this.headSampler.ShouldSample(samplingParameters);

        // A non-drop head decision is kept as is. A drop becomes RecordOnly, which
        // matters for two reasons:
        //   1. The processor pipeline is invoked for the span.
        //   2. activity.IsAllDataRequested returns true, so most instrumentations
        //      populate the attributes the tail-sampling rules need.
        return headResult.Decision == SamplingDecision.Drop
            ? new SamplingResult(SamplingDecision.RecordOnly)
            : headResult;
    }
}
/// <summary>
/// Console entry point for the span-level tail-sampling sample. It wires the custom
/// sampler and processor into a tracer provider and emits a batch of activities with
/// a mix of Error and Ok statuses.
/// </summary>
internal class Program
{
    private const string ActivitySourceName = "SDK.TailSampling.POC";
    private const int ActivityCount = 50;

    private static readonly ActivitySource MyActivitySource = new(ActivitySourceName);

    /// <summary>
    /// Builds the tracer provider and generates the sample activities.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(string[] args)
    {
        using var tracerProvider = Sdk.CreateTracerProviderBuilder()
            .SetSampler(new ParentBasedElseAlwaysRecordSampler())
            .AddSource(ActivitySourceName)
            .AddProcessor(new TailSamplingProcessor())
            .AddConsoleExporter()
            .Build();

        // Fixed seed keeps the sequence of failed/successful activities the same on every run.
        var random = new Random(2357);

        // Generate some spans.
        for (var remaining = ActivityCount; remaining > 0; remaining--)
        {
            using var activity = MyActivitySource.StartActivity("SayHello");
            activity?.SetTag("foo", "bar");

            // Simulate a mix of failed and successful spans: a draw of 0 out of 0..4 is a failure.
            var failed = random.Next(5) == 0;
            activity?.SetStatus(failed ? ActivityStatusCode.Error : ActivityStatusCode.Ok);
        }
    }
}
It uses a combination of a custom sampler and an ActivityProcessor +(span processor). + +This is a way to achieve a combination of: + +- Head-based sampling (probabilistic/unbiased sampling), and +- Tail-based sampling (a non-probabilistic/biased sampling). + +## How does this sampling example work? + +We use a hybrid approach: we do head-based sampling to get a +probabilistic subset of all activities which includes both successful activities +and failure activities. In addition, we want to capture all failure activities. +To do this, if the parent-based sampler's decision is to drop it, we return +a "Record-Only" sampling result. This ensures that the activity processor +receives that activity. In the activity processor, at the end of an activity, +we check if it is a failure activity. If so, we change the decision from +"Record-Only" to set the sampled flag so that the exporter receives the +activity. In this example, each activity is filtered individually without +consideration of any other activities. + +This is a basic form of tail-based sampling at an activity level. If an +activity failed, we always sample it in addition to all head-sampled +activities. + +## When should you consider such an option? + +This is a good option if you want to get all failure activities in addition to +head-based sampling. With this, you get basic activity-level tail-based sampling +at an SDK level without having to install any additional components. + +## Tradeoffs + +Tail-sampling this way involves many tradeoffs such as: + +1. Additional performance cost: Unlike head-based sampling where the sampling +decision is made at activity creation time, in tail sampling the decision is made +only at the end, so there is additional memory/processing cost. + +2. Partial traces: Since this sampling is at an activity level, the generated trace +will be partial. For example, if another part of the call tree is successful, +those activities may not be exported, leading to an incomplete trace.
+ +3. If multiple exporters are used, this decision will impact all of them: +[Issue 3861](https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861). + +## Sample Output + +You should see output such as the below when you run this example. + +```text +Including error activity with id +00-404ddff248b8f9a9b21e347d68d2640e-035858bc3c168885-01 and status Error +Activity.TraceId: 404ddff248b8f9a9b21e347d68d2640e +Activity.SpanId: 035858bc3c168885 +Activity.TraceFlags: Recorded +Activity.ActivitySourceName: SDK.TailSampling.POC +Activity.DisplayName: SayHello +Activity.Kind: Internal +Activity.StartTime: 2023-02-09T19:05:32.5563112Z +Activity.Duration: 00:00:00.0028144 +Activity.Tags: + foo: bar +StatusCode: Error +Resource associated with Activity: + service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel + +Dropping activity with id 00-ea861bda268c58d328ab7cbe49851499-daba29055de80a53-00 +and status Ok + +Including error activity with id +00-802dea991247e2d699d943167eb546de-cc120b0bd1741b52-01 and status Error +Activity.TraceId: 802dea991247e2d699d943167eb546de +Activity.SpanId: cc120b0bd1741b52 +Activity.TraceFlags: Recorded +Activity.ActivitySourceName: SDK.TailSampling.POC +Activity.DisplayName: SayHello +Activity.Kind: Internal +Activity.StartTime: 2023-02-09T19:05:32.7021138Z +Activity.Duration: 00:00:00.0000012 +Activity.Tags: + foo: bar +StatusCode: Error +Resource associated with Activity: + service.name: unknown_service:Examples.TailBasedSamplingAtSpanLevel + +Including head-sampled activity with id +00-f3c88010615e285c8f3cb3e2bcd70c7f-f9316215f12437c3-01 and status Ok +Activity.TraceId: f3c88010615e285c8f3cb3e2bcd70c7f +Activity.SpanId: f9316215f12437c3 +Activity.TraceFlags: Recorded +Activity.ActivitySourceName: SDK.TailSampling.POC +Activity.DisplayName: SayHello +Activity.Kind: Internal +Activity.StartTime: 2023-02-09T19:05:32.8519346Z +Activity.Duration: 00:00:00.0000034 +Activity.Tags: + foo: bar +StatusCode: Ok +Resource associated 
/// <summary>
/// A custom processor for filtering <see cref="Activity"/> instances.
/// </summary>
/// <remarks>
/// This is a basic form of tail-based sampling at a span level: a failed span is always
/// sampled, in addition to all head-sampled spans. Each span is filtered individually,
/// without consideration of any other span. Tail-sampling this way involves tradeoffs,
/// for example:
/// 1. Performance: unlike head-based sampling, where the decision is made at span creation
///    time, here the decision is made only at the end, so there is additional memory cost.
/// 2. Traces will not be complete: because sampling is per span, the generated trace will be
///    partial. For example, if another part of the call tree is successful, those spans may
///    not be sampled, leading to a partial trace.
/// 3. If multiple exporters are used, this decision will impact all of them:
///    https://github.com/open-telemetry/opentelemetry-dotnet/issues/3861.
/// </remarks>
internal sealed class TailSamplingProcessor : BaseProcessor<Activity>
{
    /// <summary>
    /// Called when an activity ends. Head-sampled activities are left untouched;
    /// all others are checked for failure and, if failed, marked for export.
    /// </summary>
    /// <param name="activity">The activity that has just ended.</param>
    public override void OnEnd(Activity activity)
    {
        if (activity.Recorded)
        {
            // This activity was included based on head-based sampling;
            // we continue with that decision and no further change is needed.
            Console.WriteLine($"Including head-sampled activity with id {activity.Id} and status {activity.Status}");
        }
        else
        {
            this.IncludeForExportIfFailedActivity(activity);
        }

        base.OnEnd(activity);
    }

    /// <summary>
    /// Applies the tail-sampling rule to an activity that was not head-sampled:
    /// an activity whose status is <see cref="ActivityStatusCode.Error"/> gets the
    /// recorded trace flag set; any other activity is left as is.
    /// </summary>
    /// <param name="activity">The ended activity to evaluate.</param>
    private void IncludeForExportIfFailedActivity(Activity activity)
    {
        if (activity.Status == ActivityStatusCode.Error)
        {
            // We always include failed spans.
            // Set the recorded flag so that this will be exported.
            activity.ActivityTraceFlags |= ActivityTraceFlags.Recorded;
            Console.WriteLine($"Including error activity with id {activity.Id} and status {activity.Status}");
        }
        else
        {
            // This span is not sampled and exporters won't see this span.
            Console.WriteLine($"Dropping activity with id {activity.Id} and status {activity.Status}");
        }
    }
}