Skip to content

Commit

Permalink
Added scheduling capability to Exception Cases that turn to Done
Browse files Browse the repository at this point in the history
  • Loading branch information
GreenMatan committed Feb 12, 2024
1 parent 48068e5 commit 615f65d
Show file tree
Hide file tree
Showing 9 changed files with 438 additions and 202 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ internal static class Debugger
/// </summary>
/// <seealso cref="ExceptionDebuggingSettings.CaptureFullCallStack"/>
public const string ExceptionDebuggingCaptureFullCallStack = "DD_EXCEPTION_DEBUGGING_CAPTURE_FULL_CALLSTACK";

/// <summary>
/// Configuration key for the interval, in seconds, used to track exceptions.
/// Default value is <c>3600</c> (1 hour).
/// </summary>
/// <seealso cref="ExceptionDebuggingSettings.RateLimit"/>
public const string RateLimitSeconds = "DD_EXCEPTION_DEBUGGING_RATE_LIMIT_SECONDS";
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// <copyright file="CachedDoneExceptions.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation
{
/// <summary>
/// Process-wide set of exceptions, identified by the text returned from <see cref="Exception.ToString"/>.
/// All access to the underlying set is guarded by a reader/writer lock, so the members may be called
/// concurrently from multiple threads.
/// </summary>
internal class CachedDoneExceptions
{
    private static readonly HashSet<string> DoneExceptions = new();
    private static readonly ReaderWriterLockSlim DoneExceptionsLocker = new();

    /// <summary>
    /// Adds the textual representation of <paramref name="exception"/> to the set.
    /// Adding the same representation more than once has no additional effect.
    /// </summary>
    /// <param name="exception">The exception to remember.</param>
    internal static void Add(Exception exception)
    {
        // Rendering an exception (message, stack trace, inner exceptions) can be expensive,
        // so it is done before entering the exclusive lock to keep the critical section short.
        var key = exception.ToString();

        DoneExceptionsLocker.EnterWriteLock();
        try
        {
            DoneExceptions.Add(key);
        }
        finally
        {
            DoneExceptionsLocker.ExitWriteLock();
        }
    }

    /// <summary>
    /// Removes a previously added exception representation from the set.
    /// </summary>
    /// <param name="exceptionToString">The <see cref="Exception.ToString"/> text of the exception to remove.</param>
    /// <returns><c>true</c> if the entry was present and has been removed; otherwise <c>false</c>.</returns>
    internal static bool Remove(string exceptionToString)
    {
        DoneExceptionsLocker.EnterWriteLock();
        try
        {
            return DoneExceptions.Remove(exceptionToString);
        }
        finally
        {
            DoneExceptionsLocker.ExitWriteLock();
        }
    }

    /// <summary>
    /// Checks whether the textual representation of <paramref name="exception"/> is in the set.
    /// </summary>
    /// <param name="exception">The exception to look up.</param>
    /// <returns><c>true</c> if an equal representation was added and not removed since; otherwise <c>false</c>.</returns>
    internal static bool Contains(Exception exception)
    {
        // Same reasoning as in Add: build the key outside the lock so readers hold it only for the lookup.
        var key = exception.ToString();

        DoneExceptionsLocker.EnterReadLock();
        try
        {
            return DoneExceptions.Contains(key);
        }
        finally
        {
            DoneExceptionsLocker.ExitReadLock();
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// <copyright file="ExceptionCaseInstrumentationManager.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Reflection;
using System.Threading;
using System.Threading.Tasks;
using Datadog.Trace.Debugger.Configurations.Models;
using Datadog.Trace.Debugger.Expressions;
using Datadog.Trace.Debugger.Helpers;
using Datadog.Trace.Debugger.PInvoke;
using Datadog.Trace.Debugger.RateLimiting;
using Datadog.Trace.Debugger.Sink.Models;
using Datadog.Trace.Debugger.Symbols;
using Datadog.Trace.Logging;
using Datadog.Trace.Telemetry.Metrics;
using Datadog.Trace.Util;
using Datadog.Trace.VendoredMicrosoftCode.System.Buffers;
using Datadog.Trace.Vendors.Serilog.Events;

namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation
{
/// <summary>
/// Creates and reverts the instrumentation of exception cases. Keeps a single
/// <see cref="ExceptionDebuggingProbe"/> per method in a shared map, so a method that takes part in
/// several exception cases reuses the same probe.
/// </summary>
internal class ExceptionCaseInstrumentationManager
{
    private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ExceptionCaseInstrumentationManager>();

    // One probe per method, shared between all the exception cases that method takes part in.
    private static readonly ConcurrentDictionary<MethodUniqueIdentifier, ExceptionDebuggingProbe> MethodToProbe = new();

    private static int _maxFramesToCapture = ExceptionDebuggingSettings.DefaultMaxFramesToCapture;

    /// <summary>
    /// Overrides the number of frames to capture used by subsequent <see cref="Instrument"/> calls.
    /// </summary>
    /// <param name="maxFramesToCapture">The maximum number of frames to capture.</param>
    public static void Initialize(int maxFramesToCapture)
    {
        _maxFramesToCapture = maxFramesToCapture;
    }

    /// <summary>
    /// Builds an <see cref="ExceptionCase"/> for the given exception: resolves (or creates) a probe for every
    /// eligible method of the exception's stack trace and registers the new case on each of those probes.
    /// </summary>
    /// <param name="exceptionId">Identifier of the exception whose stack trace frames should be instrumented.</param>
    /// <returns>The newly created exception case, holding one probe per eligible frame (in frame order).</returns>
    internal static ExceptionCase Instrument(ExceptionIdentifier exceptionId)
    {
        Log.Information("Instrumenting {ExceptionId}", exceptionId);

        var participatingUserMethods = GetMethodsToRejit(exceptionId.StackTrace);

        var uniqueMethods = participatingUserMethods
                           .Distinct(EqualityComparer<MethodUniqueIdentifier>.Default)
                           .ToArray();

        // Resolve each distinct method to its probe exactly once. GetOrAdd creates the probe atomically when
        // it is missing, and the local snapshot keeps the rest of this method independent of concurrent
        // removals from MethodToProbe (a ContainsKey/TryAdd/indexer sequence could throw KeyNotFoundException
        // if Revert removed the entry in between).
        var probeByMethod = new Dictionary<MethodUniqueIdentifier, ExceptionDebuggingProbe>(uniqueMethods.Length);
        foreach (var method in uniqueMethods)
        {
            probeByMethod[method] = MethodToProbe.GetOrAdd(method, m => new ExceptionDebuggingProbe(m));
        }

        var probes = participatingUserMethods.Select(m => probeByMethod[m]).ToArray();

        // Read the limit once so that every frame of this case is judged against the same value.
        var maxFramesToCapture = _maxFramesToCapture;
        var thresholdIndex = participatingUserMethods.Count - maxFramesToCapture;
        var targetMethods = new HashSet<MethodUniqueIdentifier>();

        for (var index = 0; index < probes.Length; index++)
        {
            if (ShouldInstrumentFrameAtIndex(index))
            {
                targetMethods.Add(probes[index].Method);
            }
        }

        var newCase = new ExceptionCase(exceptionId, probes);

        foreach (var method in uniqueMethods)
        {
            // The flag tells the probe whether this method was selected by ShouldInstrumentFrameAtIndex.
            probeByMethod[method].AddExceptionCase(newCase, targetMethods.Contains(method));
        }

        // TODO decide if a sampler is needed, ExceptionProbeProcessor does not use any sampler for now.
        // TODO InnerExceptions poses struggle in ExceptionProbeProcessor leaving logic.
        // TODO Capture arguments on exit upon first leave, collect lightweight snapshot for subsequent re-entrances.
        // TODO AsyncLocal cleansing when done dealing with exception from the Exception Debugging instrumentation (ShadowStack cleansing)
        // TODO In ExceptionProbeProcessor.ShouldProcess, maybe negotiate with the ShadowStack to determine if the top of the stack
        // TODO is relevant for the specific exception case it manages. Maybe instead of ShouldProcess we can do that
        // TODO in the Process method, in the branch where the exception type is checked to see if the previous method is relevant.
        // TODO there's a gotcha in doing it - it might be the next method has not been instrumented (failed to instrument)
        // TODO so it won't be there because it should. We will have to accommodate for that by checking the probe status and cache it.
        // TODO When leaving with an exception, we can negotiate with the ShadowStack to determine if the previous frame
        // TODO Is holding the same exception instance (either as inner / itself) to better decide if we should keep on collecting
        // TODO or not.
        // TODO Multiple AppDomains issue. The ProbeProcessor might not be there. Also relevant for DI probes. To assess how big
        // TODO the issue is, we should determine how many people are using .NET Framework .VS. .NET Core.
        // TODO For Exception Debugging we can possibly choose to ditch this altogether since if the same exception will
        // TODO happen multiple times in different AppDomains, then they will all capture the exception. The only problem is
        // TODO over-instrumenting which is not ideal.
        // TODO In AsyncMethodProbe Invoker, is it always MultiProbe even when there is only one?
        // TODO What do you do with empty shadow stack? meaning, all the participating methods has failed in the instrumentation process OR they are all 3rd party code?
        // TODO There might be two different exceptions, that yield the same snapshots. Consider A -> B -> C with exception "InvalidOperationException"
        // TODO and K -> B -> D with exception "InvalidOperationException". If we fail to instrument: A, B, K, D then there will be the same causality chain for both exceptions.
        // TODO That's why ExceptionTrackManager is the only place where snapshots are uploaded, based on the exception in hand, to be able to stop tracking an exception
        // TODO and keep on tracking the other.
        // TODO For Lightweight/Full snapshot capturing:
        // TODO Consider keeping a cache in ShadowStackTree's AsyncLocal (in ShadowStackContainer), where the cached key
        // TODO will be the hash of parents & children (Enter/Leave) and the MethodToken of the method. This way,
        // TODO the method that is leaving with an interesting exception can ask this AsyncLocal (top-thread-tree) cache
        // TODO if it's hash (EnterHash+LeaveHash+MethodToken) is in there. If it is, collect lightweight snapshot.
        // TODO if it's not, collect full snapshot.
        // TODO In this technique we will have to verify AsyncLocal safety in terms of memory leaking and the cleansing timing.
        // TODO we don't want this cache to be alive for a longer time than is needed or being reused by another execution
        // TODO context in a later time. This cache will have to be thread-safe since many threads may access it at the same
        // TODO time. Consider using Readers/Writer lock pattern or another one that is prioritizing readings than writings.
        // TODO Or any other lock-free pattern that may be suitable in this case.
        // TODO Better handle multiple exceptions related to concurrency - AggregateException. It's InnerException &
        // TODO InnerExceptions properties.

        return newCase;

        // Selects the first frame, the trailing `maxFramesToCapture` frames, or every frame when the list
        // has at most `maxFramesToCapture + 1` entries.
        bool ShouldInstrumentFrameAtIndex(int i)
        {
            // Written as `Count - 1 <= max` rather than `Count <= max + 1`: the limit is int.MaxValue when
            // the full call stack is captured, and `max + 1` would overflow.
            return i == 0 || i >= thresholdIndex || participatingUserMethods.Count - 1 <= maxFramesToCapture;
        }
    }

    /// <summary>
    /// Collects the method identifiers of the frames that are eligible for instrumentation,
    /// preserving the order (and duplicates) of <paramref name="allFrames"/>.
    /// </summary>
    /// <param name="allFrames">The frames of the exception's stack trace.</param>
    /// <returns>Identifiers of all frames that are neither blacklisted nor abstract.</returns>
    private static List<MethodUniqueIdentifier> GetMethodsToRejit(ParticipatingFrame[] allFrames)
    {
        var methodsToRejit = new List<MethodUniqueIdentifier>();

        foreach (var frame in allFrames)
        {
            try
            {
                // HasMethod?

                if (frame.State == ParticipatingFrameState.Blacklist)
                {
                    continue;
                }

                var frameMethod = frame.Method;
                if (frameMethod.IsAbstract)
                {
                    continue;
                }

                methodsToRejit.Add(frame.MethodIdentifier);
            }
            catch (Exception ex)
            {
                // A single problematic frame is logged and skipped so the remaining frames are still processed.
                Log.Error(ex, "Failed to instrument frame the frame: {FrameToRejit}", frame);
            }
        }

        return methodsToRejit;
    }

    /// <summary>
    /// Detaches <paramref name="case"/> from all of its probes and asks the native side to remove the probes
    /// whose processors report nothing left after the removal.
    /// </summary>
    /// <param name="case">The exception case to revert.</param>
    internal static void Revert(ExceptionCase @case)
    {
        Log.Information("Reverting {ExceptionCase}", @case);

        foreach (var probe in @case.Probes)
        {
            probe.RemoveExceptionCase(@case);
        }

        var revertProbeIds = new HashSet<string>();

        foreach (var processor in @case.Processors.Keys)
        {
            // NOTE(review): a return value of 0 presumably means no probe processors remain - confirm
            // against RemoveProbeProcessor. In that case the probe is dropped from the map and removed natively.
            if (processor.ExceptionDebuggingProcessor.RemoveProbeProcessor(processor) == 0)
            {
                MethodToProbe.TryRemove(processor.ExceptionDebuggingProcessor.Method, out _);
                revertProbeIds.Add(processor.ExceptionDebuggingProcessor.ProbeId);
            }
        }

        if (revertProbeIds.Count > 0)
        {
            Log.Information("ExceptionTrackManager: Reverting {RevertCount} Probes.", revertProbeIds.Count.ToString());

            // Only removals are requested; the three "add" definition arrays are intentionally empty.
            var removeProbesRequests = revertProbeIds.Select(p => new NativeRemoveProbeRequest(p)).ToArray();
            DebuggerNativeMethods.InstrumentProbes(
                Array.Empty<NativeMethodProbeDefinition>(),
                Array.Empty<NativeLineProbeDefinition>(),
                Array.Empty<NativeSpanProbeDefinition>(),
                removeProbesRequests);
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// <copyright file="ExceptionCaseScheduler.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using Datadog.Trace.Logging;

namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation
{
/// <summary>
/// Schedules exception cases to be instrumented again once a delay has elapsed. All pending cases are kept
/// in one queue ordered by due time, served by a single one-shot timer armed for the earliest entry.
/// The queue and the timer are static, so every instance of this class operates on the same schedule.
/// </summary>
internal class ExceptionCaseScheduler
{
    private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ExceptionCaseScheduler>();

    // Pending entries, always ordered by ascending DueTime. Guarded by Lock.
    private static readonly List<ScheduledException> ScheduledExceptions = new();
    private static readonly object Lock = new();
    private static Timer _timer;

    /// <summary>
    /// Initializes a new instance of the <see cref="ExceptionCaseScheduler"/> class and makes sure the
    /// shared timer exists.
    /// </summary>
    public ExceptionCaseScheduler()
    {
        lock (Lock)
        {
            // The timer is shared static state: create it only once. Replacing it on every construction
            // would leave a previously armed timer unreferenced (eligible for collection before it fires),
            // stranding the entries that were waiting on it.
            if (_timer == null)
            {
                _timer = new Timer(TimerCallback, null, Timeout.Infinite, Timeout.Infinite);
            }
        }
    }

    /// <summary>
    /// Queues <paramref name="doneCase"/> so that its <c>Instrument</c> method is invoked after
    /// <paramref name="delay"/> has elapsed.
    /// </summary>
    /// <param name="doneCase">The case to instrument once the delay has passed.</param>
    /// <param name="delay">How long to wait, measured from now (UTC).</param>
    public void Schedule(TrackedExceptionCase doneCase, TimeSpan delay)
    {
        var dueTime = DateTime.UtcNow.Add(delay);
        var scheduledTask = new ScheduledException { Case = doneCase, DueTime = dueTime };

        lock (Lock)
        {
            // Ordered insert instead of Add + Sort. Walking from the tail is cheap when new entries are
            // due no earlier than the existing ones, and entries with equal due times keep insertion order.
            var index = ScheduledExceptions.Count;
            while (index > 0 && ScheduledExceptions[index - 1].DueTime > dueTime)
            {
                index--;
            }

            ScheduledExceptions.Insert(index, scheduledTask);

            // The timer only needs re-arming when the new entry became the earliest one.
            if (index == 0)
            {
                SetNextTimer(dueTime);
            }
        }
    }

    /// <summary>
    /// Timer entry point. Never lets an exception escape to the timer infrastructure.
    /// </summary>
    /// <param name="state">Unused timer state.</param>
    private void TimerCallback(object state)
    {
        try
        {
            SafeTimerCallback(state);
        }
        catch (Exception ex)
        {
            Log.Error(ex, "There was an error while processing the Exception Cases scheduler.");
        }
    }

    /// <summary>
    /// Dequeues every entry that is due, re-arms the timer for the next pending entry (if any) and then
    /// instruments the dequeued cases outside of the lock.
    /// </summary>
    /// <param name="state">Unused timer state.</param>
    private void SafeTimerCallback(object state)
    {
        List<TrackedExceptionCase> casesToInstrument;

        lock (Lock)
        {
            var now = DateTime.UtcNow;

            // The list is ordered by due time, so the due entries form a prefix of it.
            var dueCount = 0;
            while (dueCount < ScheduledExceptions.Count && ScheduledExceptions[dueCount].DueTime <= now)
            {
                dueCount++;
            }

            casesToInstrument = new List<TrackedExceptionCase>(dueCount);
            for (var i = 0; i < dueCount; i++)
            {
                casesToInstrument.Add(ScheduledExceptions[i].Case);
            }

            ScheduledExceptions.RemoveRange(0, dueCount);

            if (ScheduledExceptions.Count > 0)
            {
                SetNextTimer(ScheduledExceptions[0].DueTime);
            }
        }

        foreach (var @case in casesToInstrument)
        {
            try
            {
                @case.Instrument();
            }
            catch (Exception ex)
            {
                // The due cases were already removed from the schedule, so a failure in one of them must
                // not prevent the remaining ones from being instrumented.
                Log.Error(ex, "There was an error while instrumenting a scheduled Exception Case.");
            }
        }
    }

    /// <summary>
    /// Arms the one-shot timer to fire at <paramref name="dueTime"/> (immediately if it is already in the past).
    /// </summary>
    /// <param name="dueTime">The UTC time at which the timer should fire.</param>
    private void SetNextTimer(DateTime dueTime)
    {
        var delayMs = (dueTime - DateTime.UtcNow).TotalMilliseconds;

        // Clamp into the range accepted by Timer.Change(int, int). A delay longer than int.MaxValue
        // milliseconds (about 24.8 days) simply fires early: the callback finds nothing due and re-arms.
        var clampedDelayMs = (int)Math.Min(Math.Max(delayMs, 0), int.MaxValue);
        _timer.Change(clampedDelayMs, Timeout.Infinite);
    }

    /// <summary>
    /// A case together with the UTC time at which it becomes due. Ordered by due time.
    /// </summary>
    private class ScheduledException : IComparable<ScheduledException>
    {
        public TrackedExceptionCase Case { get; set; }

        public DateTime DueTime { get; set; }

        /// <summary>
        /// Compares by <see cref="DueTime"/>; any instance sorts after <c>null</c>.
        /// </summary>
        /// <param name="other">The entry to compare with.</param>
        /// <returns>A negative value, zero or a positive value, following the <see cref="IComparable{T}"/> contract.</returns>
        public int CompareTo(ScheduledException other)
        {
            // Explicit null handling avoids boxing a DateTime? into the CompareTo(object) overload.
            return other == null ? 1 : DueTime.CompareTo(other.DueTime);
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation
internal class ExceptionDebuggingSettings
{
public const int DefaultMaxFramesToCapture = 3;
public const int DefaultRateLimitSeconds = 60 * 60; // 1 hour

public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTelemetry telemetry)
{
Expand All @@ -34,6 +35,13 @@ public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTe
.Value;

MaximumFramesToCapture = CaptureFullCallStack ? int.MaxValue : maximumFramesToCapture;

var seconds = config
.WithKeys(ConfigurationKeys.Debugger.RateLimitSeconds)
.AsInt32(DefaultRateLimitSeconds, x => x > 0)
.Value;

RateLimit = TimeSpan.FromSeconds(seconds);
}

public bool Enabled { get; }
Expand All @@ -42,6 +50,8 @@ public ExceptionDebuggingSettings(IConfigurationSource? source, IConfigurationTe

public bool CaptureFullCallStack { get; }

public TimeSpan RateLimit { get; }

public static ExceptionDebuggingSettings FromSource(IConfigurationSource source, IConfigurationTelemetry telemetry)
{
return new ExceptionDebuggingSettings(source, telemetry);
Expand Down
Loading

0 comments on commit 615f65d

Please sign in to comment.