diff --git a/tracer/src/Datadog.Trace/Configuration/ConfigurationKeys.Debugger.cs b/tracer/src/Datadog.Trace/Configuration/ConfigurationKeys.Debugger.cs index 68b9ce2e712e..701d8ca576e6 100644 --- a/tracer/src/Datadog.Trace/Configuration/ConfigurationKeys.Debugger.cs +++ b/tracer/src/Datadog.Trace/Configuration/ConfigurationKeys.Debugger.cs @@ -111,6 +111,13 @@ internal static class Debugger /// /// public const string ExceptionDebuggingCaptureFullCallStack = "DD_EXCEPTION_DEBUGGING_CAPTURE_FULL_CALLSTACK"; + + /// + /// Configuration key for the interval used to track exceptions + /// Default value is 1h. + /// + /// + public const string RateLimitSeconds = "DD_EXCEPTION_DEBUGGING_RATE_LIMIT_SECONDS"; } } } diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/CachedDoneExceptions.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/CachedDoneExceptions.cs new file mode 100644 index 000000000000..03af093d4308 --- /dev/null +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/CachedDoneExceptions.cs @@ -0,0 +1,59 @@ +// +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License. +// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc. +// + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation +{ + internal class CachedDoneExceptions + { + private static readonly HashSet DoneExceptions = new(); + private static readonly ReaderWriterLockSlim DoneExceptionsLocker = new(); + + internal static void Add(Exception exception) + { + DoneExceptionsLocker.EnterWriteLock(); + try + { + DoneExceptions.Add(exception.ToString()); + } + finally + { + DoneExceptionsLocker.ExitWriteLock(); + } + } + + internal static bool Remove(string exceptionToString) + { + DoneExceptionsLocker.EnterWriteLock(); + try + { + return DoneExceptions.Remove(exceptionToString); + } + finally + { + DoneExceptionsLocker.ExitWriteLock(); + } + } + + internal static bool Contains(Exception exception) + { + DoneExceptionsLocker.EnterReadLock(); + try + { + return DoneExceptions.Contains(exception.ToString()); + } + finally + { + DoneExceptionsLocker.ExitReadLock(); + } + } + } +} diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseInstrumentationManager.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseInstrumentationManager.cs new file mode 100644 index 000000000000..4676d464f47e --- /dev/null +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseInstrumentationManager.cs @@ -0,0 +1,191 @@ +// +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License. +// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc. +// + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Reflection; +using System.Threading; +using System.Threading.Tasks; +using Datadog.Trace.Debugger.Configurations.Models; +using Datadog.Trace.Debugger.Expressions; +using Datadog.Trace.Debugger.Helpers; +using Datadog.Trace.Debugger.PInvoke; +using Datadog.Trace.Debugger.RateLimiting; +using Datadog.Trace.Debugger.Sink.Models; +using Datadog.Trace.Debugger.Symbols; +using Datadog.Trace.Logging; +using Datadog.Trace.Telemetry.Metrics; +using Datadog.Trace.Util; +using Datadog.Trace.VendoredMicrosoftCode.System.Buffers; +using Datadog.Trace.Vendors.Serilog.Events; + +namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation +{ + internal class ExceptionCaseInstrumentationManager + { + private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor(); + private static readonly ConcurrentDictionary MethodToProbe = new(); + + private static int _maxFramesToCapture = ExceptionDebuggingSettings.DefaultMaxFramesToCapture; + + public static void Initialize(int maxFramesToCapture) + { + _maxFramesToCapture = maxFramesToCapture; + } + + internal static ExceptionCase Instrument(ExceptionIdentifier exceptionId) + { + Log.Information("Instrumenting {ExceptionId}", exceptionId); + + var participatingUserMethods = GetMethodsToRejit(exceptionId.StackTrace); + + var uniqueMethods = participatingUserMethods + .Distinct(EqualityComparer.Default) + .ToArray(); + + var neverSeenBeforeMethods = uniqueMethods + .Where(frame => !MethodToProbe.ContainsKey(frame)) + .ToArray(); + + foreach (var frame in neverSeenBeforeMethods) + { + MethodToProbe.TryAdd(frame, new ExceptionDebuggingProbe(frame)); + } + + var probes = participatingUserMethods.Select((m, frameIndex) => MethodToProbe[m]).ToArray(); + + var thresholdIndex = participatingUserMethods.Count - _maxFramesToCapture; + var targetMethods = new HashSet(); + + for (var index = 0; index < probes.Length; index++) + { + if (ShouldInstrumentFrameAtIndex(index)) + { + targetMethods.Add(probes[index].Method); + } + } + + var newCase = new ExceptionCase(exceptionId, probes); + + foreach (var method in uniqueMethods) + { + var probe = MethodToProbe[method]; + probe.AddExceptionCase(newCase, targetMethods.Contains(method)); + } + + // TODO decide if a sampler is needed, ExceptionProbeProcessor does not use any sampler for now. + // TODO InnerExceptions poses struggle in ExceptionProbeProcessor leaving logic. + // TODO Capture arguments on exit upon first leave, collect lightweight snapshot for subsequent re-entrances. + // TODO AsyncLocal cleansing when done dealing with exception from the Exception Debugging instrumentation (ShadowStack cleansing) + // TODO In ExceptionProbeProcessor.ShouldProcess, maybe negotiate with the ShadowStack to determine if the top of the stack + // TODO is relevant for the specific exception case it manages. Maybe instead of ShouldProcess we can do that + // TODO in the Process method, in the branch where the exception type is checked to see if the previous method is relevant. + // TODO there's a gotcha in doing it - it might be the next method has not been instrumented (failed to instrument) + // TODO so it won't be there because it should. We will have to accommodate for that by checking the probe status and cache it. + // TODO When leaving with an exception, we can negotiate with the ShadowStack to determine if the previous frame + // TODO Is holding the same exception instance (either as inner / itself) to better decide if we should keep on collecting + // TODO or not. + // TODO Multiple AppDomains issue. The ProbeProcessor might not be there. Also relevant for DI probes. To assess how big + // TODO the issue is, we should determine how many people are using .NET Framework .VS. .NET Core. + // TODO For Exception Debugging we can possibly choose to ditch this altogether since if the same exception will + // TODO happen multiple times in different AppDomains, then they will all capture the exception. The only problem is + // TODO over-instrumenting which is not ideal. + // TODO In AsyncMethodProbe Invoker, is it always MultiProbe even when there is only one? + // TODO What do you do with empty shadow stack? meaning, all the participating methods has failed in the instrumentation process OR they are all 3rd party code? + // TODO There might be two different exceptions, that yield the same snapshots. Consider A -> B -> C with exception "InvalidOperationException" + // TODO and K -> B -> D with exception "InvalidOperationException". If we fail to instrument: A, B, K, D then there will be the same causality chain for both exceptions. + // TODO That's why ExceptionTrackManager is the only place where snapshots are uploaded, based on the exception in hand, to be able to stop tracking an exception + // TODO and keep on tracking the other. + // TODO For Lightweight/Full snapshot capturing: + // TODO Consider keeping a cache in ShadowStackTree's AsyncLocal (in ShadowStackContainer), where the cached key + // TODO will be the hash of parents & children (Enter/Leave) and the MethodToken of the method. This way, + // TODO the method that is leaving with an interesting exception can ask this AsyncLocal (top-thread-tree) cache + // TODO if it's hash (EnterHash+LeaveHash+MethodToken) is in there. If it is, collect lightweight snapshot. + // TODO if it's not, collect full snapshot. + // TODO In this technique we will have to verify AsyncLocal safety in terms of memory leaking and the cleansing timing. + // TODO we don't want this cache to be alive for a longer time than is needed or being reused by another execution + // TODO context in a later time. This cache will have to be thread-safe since many threads may access it at the same + // TODO time. Consider using Readers/Writer lock pattern or another one that is prioritizing readings than writings. + // TODO Or any other lock-free pattern that may be suitable in this case. + // TODO Better handle multiple exceptions related to concurrency - AggregateException. It's InnerException & + // TODO InnerExceptions properties. + + return newCase; + + bool ShouldInstrumentFrameAtIndex(int i) + { + return i == 0 || i >= thresholdIndex || participatingUserMethods.Count <= _maxFramesToCapture + 1; + } + } + + private static List GetMethodsToRejit(ParticipatingFrame[] allFrames) + { + var methodsToRejit = new List(); + + foreach (var frame in allFrames) + { + try + { + // HasMethod? + + if (frame.State == ParticipatingFrameState.Blacklist) + { + continue; + } + + var frameMethod = frame.Method; + if (frameMethod.IsAbstract) + { + continue; + } + + methodsToRejit.Add(frame.MethodIdentifier); + } + catch (Exception ex) + { + Log.Error(ex, "Failed to instrument frame the frame: {FrameToRejit}", frame); + } + } + + return methodsToRejit; + } + + internal static void Revert(ExceptionCase @case) + { + Log.Information("Reverting {ExceptionCase}", @case); + + foreach (var probe in @case.Probes) + { + probe.RemoveExceptionCase(@case); + } + + var revertProbeIds = new HashSet(); + + foreach (var processor in @case.Processors.Keys) + { + if (processor.ExceptionDebuggingProcessor.RemoveProbeProcessor(processor) == 0) + { + MethodToProbe.TryRemove(processor.ExceptionDebuggingProcessor.Method, out _); + revertProbeIds.Add(processor.ExceptionDebuggingProcessor.ProbeId); + } + } + + if (revertProbeIds.Count > 0) + { + Log.Information("ExceptionTrackManager: Reverting {RevertCount} Probes.", revertProbeIds.Count.ToString()); + + var removeProbesRequests = revertProbeIds.Select(p => new NativeRemoveProbeRequest(p)).ToArray(); + DebuggerNativeMethods.InstrumentProbes( + Array.Empty(), + Array.Empty(), + Array.Empty(), + removeProbesRequests); + } + } + } +} diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseScheduler.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseScheduler.cs new file mode 100644 index 000000000000..5cc971c5c913 --- /dev/null +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionCaseScheduler.cs @@ -0,0 +1,98 @@ +// +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License. +// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc. +// + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Datadog.Trace.Logging; + +namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation +{ + internal class ExceptionCaseScheduler + { + private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor(); + private static readonly List ScheduledExceptions = new(); + private static readonly object Lock = new(); + private static Timer _timer; + + public ExceptionCaseScheduler() + { + _timer = new Timer(TimerCallback, null, Timeout.Infinite, Timeout.Infinite); + } + + public void Schedule(TrackedExceptionCase doneCase, TimeSpan delay) + { + var dueTime = DateTime.UtcNow.Add(delay); + var scheduledTask = new ScheduledException { Case = doneCase, DueTime = dueTime }; + + lock (Lock) + { + ScheduledExceptions.Add(scheduledTask); + ScheduledExceptions.Sort(); + if (ScheduledExceptions[0] == scheduledTask) + { + SetNextTimer(dueTime); + } + } + } + + private void TimerCallback(object state) + { + try + { + SafeTimerCallback(state); + } + catch (Exception ex) + { + Log.Error(ex, "There was an error while processing the Exception Cases scheduler."); + } + } + + private void SafeTimerCallback(object state) + { + var casesToInstrument = new List(); + + lock (Lock) + { + var now = DateTime.UtcNow; + var dueTasks = ScheduledExceptions.TakeWhile(e => e.DueTime <= now).ToList(); + foreach (var task in dueTasks) + { + casesToInstrument.Add(task.Case); + ScheduledExceptions.Remove(task); + } + + if (ScheduledExceptions.Any()) + { + SetNextTimer(ScheduledExceptions[0].DueTime); + } + } + + foreach (var @case in casesToInstrument) + { + @case.Instrument(); + } + } + + private void SetNextTimer(DateTime dueTime) + { + var delay = Math.Max((dueTime - DateTime.UtcNow).TotalMilliseconds, 0); + _timer.Change((int)delay, Timeout.Infinite); + } + + private class ScheduledException : IComparable + { + public TrackedExceptionCase Case { get; set; } + + public DateTime DueTime { get; set; } + + public int CompareTo(ScheduledException other) + { + return DueTime.CompareTo(other?.DueTime); + } + } + } +} diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionDebuggingSettings.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionDebuggingSettings.cs index 5266509e35f3..7a6e18e701aa 100644 --- a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionDebuggingSettings.cs +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionDebuggingSettings.cs @@ -18,6 +18,7 @@ namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation internal class ExceptionDebuggingSettings { public const int DefaultMaxFramesToCapture = 3; + public const int DefaultRateLimitSeconds = 60 * 60; // 1 hour public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTelemetry telemetry) { @@ -34,6 +35,13 @@ public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTe .Value; MaximumFramesToCapture = CaptureFullCallStack ? int.MaxValue : maximumFramesToCapture; + + var seconds = config + .WithKeys(ConfigurationKeys.Debugger.RateLimitSeconds) + .AsInt32(DefaultRateLimitSeconds, x => x > 0) + .Value; + + RateLimit = TimeSpan.FromSeconds(seconds); } public bool Enabled { get; } @@ -42,6 +50,8 @@ public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTe public bool CaptureFullCallStack { get; } + public TimeSpan RateLimit { get; } + public static ExceptionDebuggingSettings FromSource(IConfigurationSource source, IConfigurationTelemetry telemetry) { return new ExceptionDebuggingSettings(source, telemetry); diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionTrackManager.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionTrackManager.cs index f6d02e12cd10..e28cdbc00b5a 100644 --- a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionTrackManager.cs +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionTrackManager.cs @@ -31,14 +31,13 @@ internal class ExceptionTrackManager { private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor(); private static readonly ConcurrentDictionary TrackedExceptionCases = new(); - private static readonly ConcurrentDictionary MethodToProbe = new(); private static readonly ConcurrentQueue ExceptionProcessQueue = new(); private static readonly SemaphoreSlim WorkAvailable = new(0, int.MaxValue); private static readonly CancellationTokenSource Cts = new(); - private static readonly HashSet CachedDoneExceptions = new(); - private static readonly ReaderWriterLockSlim DoneExceptionsLocker = new(); + private static readonly ExceptionCaseScheduler ExceptionsScheduler = new(); + private static readonly int MaxFramesToCapture = ExceptionDebugging.Settings.MaximumFramesToCapture; + private static readonly TimeSpan RateLimit = ExceptionDebugging.Settings.RateLimit; private static Task _exceptionProcessorTask; - private static int _maxFramesToCapture; private static async Task StartExceptionProcessingAsync(CancellationToken cancellationToken) { @@ -75,7 +74,7 @@ public static void Report(Span span, Exception exception) private static void ReportInternal(Exception exception, ErrorOriginType errorOrigin, Span rootSpan) { - if (IsInDoneState(exception)) + if (CachedDoneExceptions.Contains(exception)) { // Quick exit. return; @@ -93,22 +92,6 @@ private static void ReportInternal(Exception exception, ErrorOriginType errorOri } } - /// - /// Quick lookup for early exit. - /// - private static bool IsInDoneState(Exception exception) - { - DoneExceptionsLocker.EnterReadLock(); - try - { - return CachedDoneExceptions.Contains(exception.ToString()); - } - finally - { - DoneExceptionsLocker.ExitReadLock(); - } - } - private static void ProcessException(Exception exception, ErrorOriginType errorOrigin, Span rootSpan) { var allParticipatingFrames = GetAllExceptionRelatedStackFrames(exception); @@ -136,19 +119,11 @@ private static void ProcessException(Exception exception, ErrorOriginType errorO var exceptionId = new ExceptionIdentifier(exceptionTypes, allParticipatingFramesFlattened, errorOrigin); - var trackedExceptionCase = TrackedExceptionCases.GetOrAdd(exceptionId, _ => new TrackedExceptionCase(exceptionId, TimeSpan.FromSeconds(1))); + var trackedExceptionCase = TrackedExceptionCases.GetOrAdd(exceptionId, _ => new TrackedExceptionCase(exceptionId, exception.ToString())); if (trackedExceptionCase.IsDone) { - DoneExceptionsLocker.EnterWriteLock(); - try - { - CachedDoneExceptions.Add(exception.ToString()); - } - finally - { - DoneExceptionsLocker.ExitWriteLock(); - } + CachedDoneExceptions.Add(exception); } else if (trackedExceptionCase.IsCollecting) { @@ -170,7 +145,13 @@ private static void ProcessException(Exception exception, ErrorOriginType errorO if (resultCallStackTree == null || !resultCallStackTree.Frames.Any()) { Log.Error("ExceptionTrackManager: Received an empty tree from the shadow stack for exception: {Exception}.", exception.ToString()); - System.Diagnostics.Debugger.Break(); + + // If we failed to instrument all the probes. + if (trackedExceptionCase.ExceptionCase.Probes.All(p => p.IsInstrumented && (p.ProbeStatus == Status.ERROR || p.ProbeStatus == Status.BLOCKED))) + { + Log.Information("Reverting the exception case of the empty stack tree since none of the methods were instrumented, for exception: {Name}, Message: {Message}, StackTrace: {StackTrace}", exception.GetType().Name, exception.Message, exception.StackTrace); + trackedExceptionCase.Revert(); + } } else { @@ -190,7 +171,7 @@ private static void ProcessException(Exception exception, ErrorOriginType errorO var assignIndex = 0; // Upload tail frames - while (frameIndex >= 0 && capturedFrameIndex >= 0 && assignIndex < _maxFramesToCapture) + while (frameIndex >= 0 && capturedFrameIndex >= 0 && assignIndex < MaxFramesToCapture) { var frame = capturedFrames[capturedFrameIndex]; @@ -206,7 +187,7 @@ private static void ProcessException(Exception exception, ErrorOriginType errorO } // Upload head frame - if (capturedFrames.Count > _maxFramesToCapture) + if (capturedFrames.Count > MaxFramesToCapture) { var frame = capturedFrames[0]; @@ -222,53 +203,15 @@ private static void ProcessException(Exception exception, ErrorOriginType errorO TagAndUpload(rootSpan, prefix, frame); } - if (trackedExceptionCase.BeginTeardown()) - { - foreach (var probe in trackedExceptionCase.ExceptionCase.Probes) - { - probe.RemoveExceptionCase(trackedExceptionCase.ExceptionCase); - } - - var revertProbeIds = new HashSet(); - - foreach (var processor in trackedExceptionCase.ExceptionCase.Processors.Keys) - { - if (processor.ExceptionDebuggingProcessor.RemoveProbeProcessor(processor) == 0) - { - MethodToProbe.TryRemove(processor.ExceptionDebuggingProcessor.Method, out _); - revertProbeIds.Add(processor.ExceptionDebuggingProcessor.ProbeId); - } - } - - if (revertProbeIds.Count > 0) - { - Log.Information("ExceptionTrackManager: Reverting {RevertCount} Probes.", revertProbeIds.Count.ToString()); - - var removeProbesRequests = revertProbeIds.Select(p => new NativeRemoveProbeRequest(p)).ToArray(); - DebuggerNativeMethods.InstrumentProbes( - Array.Empty(), - Array.Empty(), - Array.Empty(), - removeProbesRequests); - } - - trackedExceptionCase.EndTeardown(); - } + Log.Information("Reverting an exception case for exception: {Name}, Message: {Message}, StackTrace: {StackTrace}", exception.GetType().Name, exception.Message, exception.StackTrace); + trackedExceptionCase.Revert(); + ExceptionsScheduler.Schedule(trackedExceptionCase, RateLimit); } } else { - // else - If there is a concurrent initialization or tearing down, ignore this case - if (!trackedExceptionCase.Initialized()) - { - return; - } - Log.Information("New exception case occurred, initiating data collection for exception: {Name}, Message: {Message}, StackTrace: {StackTrace}", exception.GetType().Name, exception.Message, exception.StackTrace); - - trackedExceptionCase.ExceptionCase = InstrumentFrames(trackedExceptionCase.ExceptionIdentifier); - - trackedExceptionCase.BeginCollect(); + trackedExceptionCase.Instrument(); } } @@ -317,7 +260,7 @@ private static bool IsSupportedExceptionType(Exception ex) => public static void Initialize() { - _maxFramesToCapture = ExceptionDebugging.Settings.MaximumFramesToCapture; + ExceptionCaseInstrumentationManager.Initialize(MaxFramesToCapture); _exceptionProcessorTask = Task.Factory.StartNew( async () => await StartExceptionProcessingAsync(Cts.Token).ConfigureAwait(false), TaskCreationOptions.LongRunning) @@ -333,121 +276,6 @@ public static bool IsSupportedExceptionType(Type ex) => ex != typeof(TypeLoadException) && ex != typeof(OutOfMemoryException); - private static List GetMethodsToRejit(ParticipatingFrame[] allFrames) - { - var methodsToRejit = new List(); - - foreach (var frame in allFrames) - { - try - { - // HasMethod? - - if (frame.State == ParticipatingFrameState.Blacklist) - { - continue; - } - - var frameMethod = frame.Method; - if (frameMethod.IsAbstract) - { - continue; - } - - methodsToRejit.Add(frame.MethodIdentifier); - } - catch (Exception ex) - { - Log.Error(ex, "Failed to instrument frame the frame: {FrameToRejit}", frame); - } - } - - return methodsToRejit; - } - - private static ExceptionCase InstrumentFrames(ExceptionIdentifier exceptionIdentifier) - { - var participatingUserMethods = GetMethodsToRejit(exceptionIdentifier.StackTrace); - - var uniqueMethods = participatingUserMethods - .Distinct(EqualityComparer.Default) - .ToArray(); - - var neverSeenBeforeMethods = uniqueMethods - .Where(frame => !MethodToProbe.ContainsKey(frame)) - .ToArray(); - - foreach (var frame in neverSeenBeforeMethods) - { - MethodToProbe.TryAdd(frame, new ExceptionDebuggingProbe(frame)); - } - - var probes = participatingUserMethods.Select((m, frameIndex) => MethodToProbe[m]).ToArray(); - - var thresholdIndex = participatingUserMethods.Count - _maxFramesToCapture; - var targetMethods = new HashSet(); - - for (var index = 0; index < probes.Length; index++) - { - if (ShouldInstrumentFrameAtIndex(index)) - { - targetMethods.Add(probes[index].Method); - } - } - - var newCase = new ExceptionCase(exceptionIdentifier, probes); - - foreach (var method in uniqueMethods) - { - var probe = MethodToProbe[method]; - probe.AddExceptionCase(newCase, targetMethods.Contains(method)); - } - - // TODO decide if a sampler is needed, ExceptionProbeProcessor does not use any sampler for now. - // TODO InnerExceptions poses struggle in ExceptionProbeProcessor leaving logic. - // TODO Capture arguments on exit upon first leave, collect lightweight snapshot for subsequent re-entrances. - // TODO AsyncLocal cleansing when done dealing with exception from the Exception Debugging instrumentation (ShadowStack cleansing) - // TODO In ExceptionProbeProcessor.ShouldProcess, maybe negotiate with the ShadowStack to determine if the top of the stack - // TODO is relevant for the specific exception case it manages. Maybe instead of ShouldProcess we can do that - // TODO in the Process method, in the branch where the exception type is checked to see if the previous method is relevant. - // TODO there's a gotcha in doing it - it might be the next method has not been instrumented (failed to instrument) - // TODO so it won't be there because it should. We will have to accommodate for that by checking the probe status and cache it. - // TODO When leaving with an exception, we can negotiate with the ShadowStack to determine if the previous frame - // TODO Is holding the same exception instance (either as inner / itself) to better decide if we should keep on collecting - // TODO or not. - // TODO Multiple AppDomains issue. The ProbeProcessor might not be there. Also relevant for DI probes. To assess how big - // TODO the issue is, we should determine how many people are using .NET Framework .VS. .NET Core. - // TODO For Exception Debugging we can possibly choose to ditch this altogether since if the same exception will - // TODO happen multiple times in different AppDomains, then they will all capture the exception. The only problem is - // TODO over-instrumenting which is not ideal. - // TODO In AsyncMethodProbe Invoker, is it always MultiProbe even when there is only one? - // TODO What do you do with empty shadow stack? meaning, all the participating methods has failed in the instrumentation process OR they are all 3rd party code? - // TODO There might be two different exceptions, that yield the same snapshots. Consider A -> B -> C with exception "InvalidOperationException" - // TODO and K -> B -> D with exception "InvalidOperationException". If we fail to instrument: A, B, K, D then there will be the same causality chain for both exceptions. - // TODO That's why ExceptionTrackManager is the only place where snapshots are uploaded, based on the exception in hand, to be able to stop tracking an exception - // TODO and keep on tracking the other. - // TODO For Lightweight/Full snapshot capturing: - // TODO Consider keeping a cache in ShadowStackTree's AsyncLocal (in ShadowStackContainer), where the cached key - // TODO will be the hash of parents & children (Enter/Leave) and the MethodToken of the method. This way, - // TODO the method that is leaving with an interesting exception can ask this AsyncLocal (top-thread-tree) cache - // TODO if it's hash (EnterHash+LeaveHash+MethodToken) is in there. If it is, collect lightweight snapshot. - // TODO if it's not, collect full snapshot. - // TODO In this technique we will have to verify AsyncLocal safety in terms of memory leaking and the cleansing timing. - // TODO we don't want this cache to be alive for a longer time than is needed or being reused by another execution - // TODO context in a later time. This cache will have to be thread-safe since many threads may access it at the same - // TODO time. Consider using Readers/Writer lock pattern or another one that is prioritizing readings than writings. - // TODO Or any other lock-free pattern that may be suitable in this case. - // TODO Better handle multiple exceptions related to concurrency - AggregateException. It's InnerException & - // TODO InnerExceptions properties. - - return newCase; - - bool ShouldInstrumentFrameAtIndex(int i) - { - return i == 0 || i >= thresholdIndex || participatingUserMethods.Count <= _maxFramesToCapture + 1; - } - } - private static void Stop() { Cts.Cancel(); diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/MethodUniqueIdentifier.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/MethodUniqueIdentifier.cs index 23f8872e3629..ab773d01ec3f 100644 --- a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/MethodUniqueIdentifier.cs +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/MethodUniqueIdentifier.cs @@ -76,6 +76,13 @@ public override int GetHashCode() { return _hashCode; } + + public override string ToString() + { + var probesInfo = Probes == null ? "null" : $"{Probes.Length} probes"; + var processorsCount = Processors?.Count ?? 0; + return $"ExceptionCase: ExceptionId={ExceptionId}, Probes=[{probesInfo}], Processors={processorsCount}"; + } } internal readonly struct ExceptionIdentifier : IEquatable @@ -135,6 +142,13 @@ public override int GetHashCode() { return _hashCode; } + + public override string ToString() + { + var exceptionTypes = string.Join(", ", ExceptionTypes.Select(t => t.FullName)); + var stackTrace = string.Join("; ", StackTrace.Select(frame => frame.ToString())); + return $"ErrorOrigin: {ErrorOrigin}, ExceptionTypes: [{exceptionTypes}], StackTrace: [{stackTrace}]"; + } } internal class ExceptionDebuggingProbe diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ParticipatingFrame.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ParticipatingFrame.cs index 58af16323d44..154093c51c74 100644 --- a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ParticipatingFrame.cs +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ParticipatingFrame.cs @@ -62,5 +62,13 @@ public override bool Equals(object obj) return Equals((ParticipatingFrame)obj); } + + public override string ToString() + { + var methodName = Method?.DeclaringType?.FullName + "." + Method?.Name ?? "Unknown Method"; + var blacklistStatus = IsInBlackList ? "Blacklisted" : "Not Blacklisted"; + var ilOffsetInfo = ILOffset != UndefinedIlOffset ? $"IL Offset: {ILOffset}" : "IL Offset: Undefined"; + return $"ParticipatingFrame: Method={methodName}, State={State}, {ilOffsetInfo}, {blacklistStatus}"; + } } } diff --git a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/TrackedExceptionCase.cs b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/TrackedExceptionCase.cs index cbb611792dbb..05f2b9a462b6 100644 --- a/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/TrackedExceptionCase.cs +++ b/tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/TrackedExceptionCase.cs @@ -5,7 +5,6 @@ using System; using System.Threading; -using Datadog.Trace.Debugger.RateLimiting; namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation { @@ -21,12 +20,12 @@ internal class TrackedExceptionCase { private volatile int _initializationOrTearDownInProgress; - public TrackedExceptionCase(ExceptionIdentifier exceptionId, TimeSpan windowDuration) + public TrackedExceptionCase(ExceptionIdentifier exceptionId, string exceptionToString) { ExceptionIdentifier = exceptionId; ErrorHash = MD5HashProvider.GetHash(exceptionId); + ExceptionToString = exceptionToString; StartCollectingTime = DateTime.MaxValue; - Sampler = new AdaptiveSampler(windowDuration, 1, 180, 16, null); } public bool IsCollecting => TrackingExceptionCollectionState == ExceptionCollectionState.Collecting; @@ -35,15 +34,15 @@ public TrackedExceptionCase(ExceptionIdentifier exceptionId, TimeSpan windowDura public string ErrorHash { get; } + public string ExceptionToString { get; } + public ExceptionCollectionState TrackingExceptionCollectionState { get; private set; } = ExceptionCollectionState.None; public bool IsDone => TrackingExceptionCollectionState == ExceptionCollectionState.Finalizing || TrackingExceptionCollectionState == ExceptionCollectionState.Done; public DateTime StartCollectingTime { get; private set; } - public ExceptionCase ExceptionCase { get; set; } - - public AdaptiveSampler Sampler { get; } + public ExceptionCase ExceptionCase { get; private set; } private bool BeginInProgressState(ExceptionCollectionState exceptionCollectionState) { @@ -62,25 +61,47 @@ private void EndInProgressState(ExceptionCollectionState exceptionCollectionStat _initializationOrTearDownInProgress = 0; } - public bool Initialized() + private bool Initialized() { return BeginInProgressState(ExceptionCollectionState.Initializing); } - public void BeginCollect() + public void Instrument() + { + // else - If there is a concurrent initialization or tearing down, ignore this case + if (Initialized()) + { + var @case = ExceptionCaseInstrumentationManager.Instrument(ExceptionIdentifier); + BeginCollect(@case); + CachedDoneExceptions.Remove(ExceptionToString); + } + } + + private void BeginCollect(ExceptionCase @case) { + ExceptionCase = @case; EndInProgressState(ExceptionCollectionState.Collecting); StartCollectingTime = DateTime.UtcNow; } - public bool BeginTeardown() + public void Revert() + { + if (BeginTeardown()) + { + ExceptionCaseInstrumentationManager.Revert(ExceptionCase); + EndTeardown(); + } + } + + private bool BeginTeardown() { return BeginInProgressState(ExceptionCollectionState.Finalizing); } - public void EndTeardown() + private void EndTeardown() { EndInProgressState(ExceptionCollectionState.Done); + ExceptionCase = default; } public override int GetHashCode()