From ca919f7fdbcfe588f667a2b0980b7a052706a3f8 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:43:04 -0400 Subject: [PATCH 01/19] Add some DMAPI debug logging --- src/DMAPI/tgs/v5/api.dm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/DMAPI/tgs/v5/api.dm b/src/DMAPI/tgs/v5/api.dm index 95b8edd3ee5..05d0dee25b3 100644 --- a/src/DMAPI/tgs/v5/api.dm +++ b/src/DMAPI/tgs/v5/api.dm @@ -50,7 +50,9 @@ version = null // we want this to be the TGS version, not the interop version // sleep once to prevent an issue where world.Export on the first tick can hang indefinitely + TGS_DEBUG_LOG("Starting Export bug prevention sleep tick. time:[world.time] sleep_offline:[world.sleep_offline]") sleep(world.tick_lag) + TGS_DEBUG_LOG("Export bug prevention sleep complete") var/list/bridge_response = Bridge(DMAPI5_BRIDGE_COMMAND_STARTUP, list(DMAPI5_BRIDGE_PARAMETER_MINIMUM_SECURITY_LEVEL = minimum_required_security_level, DMAPI5_BRIDGE_PARAMETER_VERSION = api_version.raw_parameter, DMAPI5_PARAMETER_CUSTOM_COMMANDS = ListCustomCommands(), DMAPI5_PARAMETER_TOPIC_PORT = GetTopicPort())) if(!istype(bridge_response)) From 73bbf001369a110e73857b7e065236e878f608ac Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:43:57 -0400 Subject: [PATCH 02/19] Fix a linter message --- src/Tgstation.Server.Api/Models/EngineVersion.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Tgstation.Server.Api/Models/EngineVersion.cs b/src/Tgstation.Server.Api/Models/EngineVersion.cs index adaa90bcbf0..61966105973 100644 --- a/src/Tgstation.Server.Api/Models/EngineVersion.cs +++ b/src/Tgstation.Server.Api/Models/EngineVersion.cs @@ -12,6 +12,11 @@ namespace Tgstation.Server.Api.Models /// public sealed class EngineVersion : IEquatable { + /// + /// An array of a single '-' . + /// + static readonly char[] DashChar = ['-']; + /// /// The . /// @@ -48,7 +53,7 @@ public static bool TryParse(string input, out EngineVersion? engineVersion) if (input == null) throw new ArgumentNullException(nameof(input)); - var splits = input.Split(new char[] { '-' }, StringSplitOptions.RemoveEmptyEntries); + var splits = input.Split(DashChar, StringSplitOptions.RemoveEmptyEntries); engineVersion = null; if (splits.Length > 3) From ff9c5c241a8d048a2c4d36e07eca9aed7f43db78 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:44:16 -0400 Subject: [PATCH 03/19] Respect `LogTopicRequests` here --- .../Components/Session/SessionController.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Tgstation.Server.Host/Components/Session/SessionController.cs b/src/Tgstation.Server.Host/Components/Session/SessionController.cs index e32c251eb55..10556ca9fc2 100644 --- a/src/Tgstation.Server.Host/Components/Session/SessionController.cs +++ b/src/Tgstation.Server.Host/Components/Session/SessionController.cs @@ -7,6 +7,7 @@ using System.Threading.Tasks; using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using Newtonsoft.Json; @@ -942,7 +943,9 @@ string GenerateQueryString(TopicParameters parameters, out string json) using (await TopicSendSemaphore.Lock(cancellationToken)) byondResponse = await byondTopicSender.SendWithOptionalPriority( asyncDelayer, - Logger, + LogTopicRequests + ? Logger + : NullLogger.Instance, queryString, targetPort, priority, From 3eb96f34d0772d01760e73413956a98d29a2528c Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:44:40 -0400 Subject: [PATCH 04/19] Lockdown `WatchdogBase.Status`. Improve logging --- .../Components/Watchdog/WatchdogBase.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs b/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs index 211249e0dfb..ec3ae853469 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs @@ -42,10 +42,11 @@ abstract class WatchdogBase : IWatchdog, ICustomCommandHandler, IRestartHandler public WatchdogStatus Status { get => status; - set + protected set { + var oldStatus = status; status = value; - Logger.LogTrace("Status set to {status}", status); + Logger.LogTrace("Status set from {oldStatus} to {status}", oldStatus, status); } } From 3cf2cd7d207c11a01557811a08252fe1dbddbc87 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:44:56 -0400 Subject: [PATCH 05/19] Improve `TopicClientExtensions` logging --- .../Extensions/TopicClientExtensions.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Tgstation.Server.Host/Extensions/TopicClientExtensions.cs b/src/Tgstation.Server.Host/Extensions/TopicClientExtensions.cs index 3a7fb564036..55ead2144a5 100644 --- a/src/Tgstation.Server.Host/Extensions/TopicClientExtensions.cs +++ b/src/Tgstation.Server.Host/Extensions/TopicClientExtensions.cs @@ -16,6 +16,11 @@ namespace Tgstation.Server.Host.Extensions /// static class TopicClientExtensions { + /// + /// Counter for topic request logging. + /// + static ulong topicRequestId; + /// /// Send a with optional repeated priority. /// @@ -45,13 +50,14 @@ static class TopicClientExtensions { firstSend = false; - logger.LogTrace("Begin topic request"); + var localRequestId = Interlocked.Increment(ref topicRequestId); + logger.LogTrace("Begin topic request #{requestId}: {query}", localRequestId, queryString); var byondResponse = await topicClient.SendTopic( endpoint, queryString, cancellationToken); - logger.LogTrace("End topic request"); + logger.LogTrace("End topic request #{requestId}", localRequestId); return byondResponse; } catch (Exception ex) when (ex is not OperationCanceledException) From 27b6e1aed26dbf95a1c3d4141a4930b870643fd6 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:45:07 -0400 Subject: [PATCH 06/19] Improve DMAPI test logging --- tests/DMAPI/LongRunning/Test.dm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/DMAPI/LongRunning/Test.dm b/tests/DMAPI/LongRunning/Test.dm index 2735c8e0ab8..d99513a6efc 100644 --- a/tests/DMAPI/LongRunning/Test.dm +++ b/tests/DMAPI/LongRunning/Test.dm @@ -3,7 +3,7 @@ loop_checks = FALSE /world/proc/RunTest() - log << "Initial value of sleep_offline: [sleep_offline]" + log << "Initial value of sleep_offline: [sleep_offline], setting to FALSE" sleep_offline = FALSE if(params["slow_start"]) @@ -215,6 +215,7 @@ var/run_bridge_test var/kajigger_test = FALSE /world/Reboot(reason) + log << "Reboot Start" TgsChatBroadcast("World Rebooting") if(kajigger_test && !fexists("kajigger.txt")) @@ -222,6 +223,7 @@ var/kajigger_test = FALSE TgsReboot() + log << "Calling base reboot" ..() var/received_health_check = FALSE From a0d3ae380a93ffd2b981adf714042adfe57a8cac Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:46:16 -0400 Subject: [PATCH 07/19] Set appropriate BYOND version for `-D` dm.exe usage --- .../Live/Instance/DeploymentTest.cs | 11 ++++++----- .../Live/Instance/InstanceTest.cs | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/Tgstation.Server.Tests/Live/Instance/DeploymentTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/DeploymentTest.cs index a158ed4edf0..fe32e9e1969 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/DeploymentTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/DeploymentTest.cs @@ -24,7 +24,7 @@ sealed class DeploymentTest : JobsRequiredTest readonly ushort dmPort; readonly ushort ddPort; readonly bool lowPriorityDeployments; - readonly EngineType testEngine; + readonly EngineVersion testEngine; Task vpTest; @@ -34,7 +34,7 @@ public DeploymentTest( ushort dmPort, ushort ddPort, bool lowPriorityDeployments, - EngineType testEngine) : base(jobsClient) + EngineVersion testEngine) : base(jobsClient) { this.instanceClient = instanceClient ?? throw new ArgumentNullException(nameof(instanceClient)); dreamMakerClient = instanceClient.DreamMaker; @@ -65,7 +65,7 @@ async ValueTask CheckDreamDaemonPriority(Task deploymentJobWaitTask, Cancellatio // this doesn't check dm's priority, but it really should while (!deploymentJobWaitTask.IsCompleted) { - var allProcesses = TestLiveServer.GetEngineServerProcessesOnPort(testEngine, dmPort); + var allProcesses = TestLiveServer.GetEngineServerProcessesOnPort(testEngine.Engine.Value, dmPort); if (allProcesses.Count == 0) continue; @@ -133,13 +133,14 @@ public async Task RunPostRepoClone(Task byondTask, CancellationToken cancellatio } else { + var canUseDashD = testEngine.Engine == EngineType.Byond && testEngine.Version >= new Version(515, 1597); var updatedDM = await dreamMakerClient.Update(new DreamMakerRequest { ApiValidationPort = dmPort, - CompilerAdditionalArguments = testEngine == EngineType.Byond ? " -DBABABOOEY" : " ", + CompilerAdditionalArguments = canUseDashD ? " -DBABABOOEY" : " ", }, cancellationToken); Assert.AreEqual(dmPort, updatedDM.ApiValidationPort); - if (testEngine == EngineType.Byond) + if (canUseDashD) Assert.AreEqual("-DBABABOOEY", updatedDM.CompilerAdditionalArguments); else Assert.IsNull(updatedDM.CompilerAdditionalArguments); diff --git a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs index a37ceb26e15..de12715043f 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs @@ -51,7 +51,7 @@ public async Task RunTests( await using var chatTest = new ChatTest(instanceClient.ChatBots, instanceManagerClient, instanceClient.Jobs, instanceClient.Metadata); var configTest = new ConfigurationTest(instanceClient.Configuration, instanceClient.Metadata); await using var repoTest = new RepositoryTest(instanceClient.Repository, instanceClient.Jobs); - await using var dmTest = new DeploymentTest(instanceClient, instanceClient.Jobs, dmPort, ddPort, lowPrioDeployment, testVersion.Engine.Value); + await using var dmTest = new DeploymentTest(instanceClient, instanceClient.Jobs, dmPort, ddPort, lowPrioDeployment, testVersion); var byondTask = engineTest.Run(cancellationToken, out var firstInstall); var chatTask = chatTest.RunPreWatchdog(cancellationToken); From b7db2fc51e66a195ab3f6475eb993db21dd7269e Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:47:27 -0400 Subject: [PATCH 08/19] Improve test `IGitHubClient` naming --- .../Tgstation.Server.Tests/Live/Instance/EngineTest.cs | 2 +- .../Live/TestingGitHubService.cs | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/Tgstation.Server.Tests/Live/Instance/EngineTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/EngineTest.cs index a2f82495448..ffdaeb63e4a 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/EngineTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/EngineTest.cs @@ -93,7 +93,7 @@ public static async ValueTask GetEdgeVersion(EngineType engineTyp } else { - var masterBranch = await TestingGitHubService.RealTestClient.Repository.Branch.Get("OpenDreamProject", "OpenDream", "master"); + var masterBranch = await TestingGitHubService.RealClient.Repository.Branch.Get("OpenDreamProject", "OpenDream", "master"); engineVersion = new EngineVersion { diff --git a/tests/Tgstation.Server.Tests/Live/TestingGitHubService.cs b/tests/Tgstation.Server.Tests/Live/TestingGitHubService.cs index f1b3c8d95a1..d104dbd6d3a 100644 --- a/tests/Tgstation.Server.Tests/Live/TestingGitHubService.cs +++ b/tests/Tgstation.Server.Tests/Live/TestingGitHubService.cs @@ -28,7 +28,7 @@ sealed class TestingGitHubService : IAuthenticatedGitHubService readonly ICryptographySuite cryptographySuite; readonly ILogger logger; - public static readonly IGitHubClient RealTestClient; + public static readonly IGitHubClient RealClient; static TestingGitHubService() { @@ -39,7 +39,7 @@ static TestingGitHubService() }); var gitHubClientFactory = new GitHubClientFactory(new AssemblyInformationProvider(), Mock.Of>(), mockOptions.Object); - RealTestClient = gitHubClientFactory.CreateClient(); + RealClient = gitHubClientFactory.CreateClient(); } public static async Task InitializeAndInject(CancellationToken cancellationToken) @@ -47,7 +47,7 @@ public static async Task InitializeAndInject(CancellationToken cancellationToken Release targetRelease; do { - var releases = await RealTestClient + var releases = await RealClient .Repository .Release .GetAll("tgstation", "tgstation-server") @@ -62,12 +62,12 @@ public static async Task InitializeAndInject(CancellationToken cancellationToken { TestLiveServer.TestUpdateVersion, targetRelease } }; - var testCommitTask = RealTestClient + var testCommitTask = RealClient .Repository .Commit .Get("Cyberboss", "common_core", "4b4926dfaf6295f19f8ae7abf03cb357dbb05b29") .WaitAsync(cancellationToken); - testPr = await RealTestClient + testPr = await RealClient .PullRequest .Get("Cyberboss", "common_core", 2) .WaitAsync(cancellationToken); From bd731bf51ae84f2a4340606dc31eaa41f1ff87da Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:48:31 -0400 Subject: [PATCH 09/19] DMAPI enabled deployments now set to `WatchdogStatus.Restoring` on Reboot --- .../Components/Watchdog/BasicWatchdog.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs b/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs index 8dfb10e412e..edc50518d45 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs @@ -181,7 +181,8 @@ protected override async ValueTask HandleMonitorWakeup(MonitorAct await HandleEventImpl(EventType.WorldPrime, Enumerable.Empty(), false, cancellationToken); break; case MonitorActivationReason.ActiveServerStartup: - break; // unused in BasicWatchdog + Status = Api.Models.WatchdogStatus.Online; + break; case MonitorActivationReason.HealthCheck: default: throw new InvalidOperationException($"Invalid activation reason: {reason}"); @@ -296,7 +297,12 @@ protected virtual ValueTask SessionStartupPersist(CancellationToken cancellation /// The for the operation. /// A resulting in the to take. protected virtual ValueTask HandleNormalReboot(CancellationToken cancellationToken) - => ValueTask.FromResult(MonitorAction.Continue); + { + if (Server!.CompileJob.DMApiVersion != null) + Status = Api.Models.WatchdogStatus.Restoring; + + return ValueTask.FromResult(MonitorAction.Continue); + } /// /// Handler for . From caaded3d1b28bf53212b6358e9468c99056c2ea2 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 14:48:44 -0400 Subject: [PATCH 10/19] Fix live tests race condition --- .../Live/Instance/WatchdogTest.cs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs index 48d8a5faee6..cf1dad38f3a 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs @@ -1069,6 +1069,15 @@ ushort FindTopicPort() // - Injects a custom bridge handler into the bridge registrar and makes the test hack into the DMAPI and change its access_identifier async Task WhiteBoxChatCommandTest(CancellationToken cancellationToken) { + var ddInfo = await instanceClient.DreamDaemon.Read(cancellationToken); + for (int i = 0; ddInfo.Status != WatchdogStatus.Online && i < 15; ++i) + { + await Task.Delay(TimeSpan.FromSeconds(1), cancellationToken); + ddInfo = await instanceClient.DreamDaemon.Read(cancellationToken); + } + + Assert.AreEqual(WatchdogStatus.Online, ddInfo.Status); + MessageContent embedsResponse, overloadResponse, overloadResponse2, embedsResponse2; var startTime = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(5); using (var instanceReference = instanceManager.GetInstanceReference(instanceClient.Metadata)) @@ -1115,7 +1124,7 @@ async Task WhiteBoxChatCommandTest(CancellationToken cancellationToken) var endTime = DateTimeOffset.UtcNow + TimeSpan.FromSeconds(5); - var ddInfo = await instanceClient.DreamDaemon.Read(cancellationToken); + ddInfo = await instanceClient.DreamDaemon.Read(cancellationToken); await CheckDMApiFail(ddInfo.ActiveCompileJob, cancellationToken); CheckEmbedsTest(embedsResponse, startTime, endTime); From bbbbe6caa61563166f19c38214e8035ed7aea76b Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 15:38:12 -0400 Subject: [PATCH 11/19] Assert expected game directory counts after watchdog tests --- .../Live/Instance/InstanceTest.cs | 8 +++-- .../Live/Instance/WatchdogTest.cs | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs index de12715043f..d84ac0b6c47 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs @@ -1,8 +1,6 @@ using System; using System.Collections.Generic; -using System.IO; using System.Linq; -using System.Reflection; using System.Threading; using System.Threading.Tasks; @@ -24,7 +22,6 @@ using Tgstation.Server.Host.Components.Repository; using Tgstation.Server.Host.Configuration; using Tgstation.Server.Host.IO; -using Tgstation.Server.Host.Jobs; using Tgstation.Server.Host.System; using Tgstation.Server.Host.Utils; @@ -77,6 +74,7 @@ public async Task RunTests( ddPort, usingBasicWatchdog); await wdt.Run(cancellationToken); + await wdt.ExpectGameDirectoryCount(2, cancellationToken); } public static async ValueTask DownloadEngineVersion( @@ -285,6 +283,10 @@ await instanceClient.Repository.Update(new RepositoryUpdateRequest await using var wdt = new WatchdogTest(compatVersion, instanceClient, instanceManager, serverPort, highPrioDD, ddPort, usingBasicWatchdog); await wdt.Run(cancellationToken); + await instanceClient.DreamDaemon.Shutdown(cancellationToken); + + await wdt.ExpectGameDirectoryCount(1, cancellationToken); + await instanceManagerClient.Update(new InstanceUpdateRequest { Id = instanceClient.Metadata.Id, diff --git a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs index cf1dad38f3a..9227e491ae6 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs @@ -595,6 +595,8 @@ async Task TestDMApiFreeDeploy(CancellationToken cancellationToken) Assert.IsNull(daemonStatus.ActiveCompileJob.DMApiVersion); Assert.AreEqual(DreamDaemonSecurity.Ultrasafe, daemonStatus.ActiveCompileJob.MinimumSecurityLevel); + await ExpectGameDirectoryCount(1, cancellationToken); + var startJob = await StartDD(cancellationToken); await WaitForJob(startJob, 40, false, null, cancellationToken); @@ -618,11 +620,38 @@ async Task TestDMApiFreeDeploy(CancellationToken cancellationToken) Assert.AreEqual(initialCompileJob.Id, daemonStatus.ActiveCompileJob.Id); var newerCompileJob = daemonStatus.StagedCompileJob; + await ExpectGameDirectoryCount(2, cancellationToken); + Assert.IsNotNull(newerCompileJob); Assert.AreNotEqual(initialCompileJob.Id, newerCompileJob.Id); Assert.AreEqual(DreamDaemonSecurity.Trusted, newerCompileJob.MinimumSecurityLevel); Assert.AreEqual(DMApiConstants.InteropVersion, daemonStatus.StagedCompileJob.DMApiVersion); await instanceClient.DreamDaemon.Shutdown(cancellationToken); + + await ExpectGameDirectoryCount(1, cancellationToken); + } + + public async ValueTask ExpectGameDirectoryCount(int expected, CancellationToken cancellationToken) + { + int CountNonLiveDirs() + { + var directories = Directory.GetDirectories(Path.Combine(instanceClient.Metadata.Path, "Game")); + return directories.Where(directory => Path.GetFileName(directory) != "Live").Count(); + } + + int nonLiveDirs = 0; + // cleanup task is async + for(var i = 0; i < 15; ++i) + { + nonLiveDirs = CountNonLiveDirs(); + if (expected == nonLiveDirs) + return; + + await Task.Delay(TimeSpan.FromSeconds(1), cancellationToken); + } + + nonLiveDirs = CountNonLiveDirs(); + Assert.AreEqual(expected, nonLiveDirs); } async Task RunBasicTest(CancellationToken cancellationToken) @@ -643,6 +672,8 @@ async Task RunBasicTest(CancellationToken cancellationToken) Assert.AreEqual(DMApiConstants.InteropVersion, daemonStatus.ActiveCompileJob.DMApiVersion); Assert.AreEqual(DreamDaemonSecurity.Trusted, daemonStatus.ActiveCompileJob.MinimumSecurityLevel); + await ExpectGameDirectoryCount(1, cancellationToken); + JobResponse startJob; if (new PlatformIdentifier().IsWindows) // Can't get address reuse to trigger on linux for some reason using (var blockSocket = new Socket( @@ -683,6 +714,7 @@ async Task RunBasicTest(CancellationToken cancellationToken) daemonStatus = await instanceClient.DreamDaemon.Read(cancellationToken); Assert.AreEqual(WatchdogStatus.Offline, daemonStatus.Status.Value); Assert.IsFalse(daemonStatus.SessionId.HasValue); + await ExpectGameDirectoryCount(1, cancellationToken); await CheckDMApiFail(daemonStatus.ActiveCompileJob, cancellationToken, false); From 7d3eed888edd8db20df49b95e6509dbc6bca6dba Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 16:25:10 -0400 Subject: [PATCH 12/19] Short-circuit Robust.Server exiting if possible --- .../Components/Engine/OpenDreamInstallation.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Tgstation.Server.Host/Components/Engine/OpenDreamInstallation.cs b/src/Tgstation.Server.Host/Components/Engine/OpenDreamInstallation.cs index 49c5ed1470b..13c07a2d35f 100644 --- a/src/Tgstation.Server.Host/Components/Engine/OpenDreamInstallation.cs +++ b/src/Tgstation.Server.Host/Components/Engine/OpenDreamInstallation.cs @@ -137,6 +137,8 @@ public override async ValueTask StopServerProcess( logger.LogTrace("Attempting Robust.Server graceful exit (Timeout: {seconds}s)...", MaximumTerminationSeconds); var timeout = asyncDelayer.Delay(TimeSpan.FromSeconds(MaximumTerminationSeconds), cancellationToken); var lifetime = process.Lifetime; + if (lifetime.IsCompleted) + logger.LogTrace("Robust.Server already exited"); var stopwatch = Stopwatch.StartNew(); try From 21a8f7cbe1f43b79b299db7df80dcd3c75e9536e Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Sun, 16 Jun 2024 11:35:00 -0400 Subject: [PATCH 13/19] Rewrite IDmbProvider locking so that locks are tracked and reasons are given --- .../Chat/Commands/PullRequestsCommand.cs | 2 +- .../Deployment/DeploymentLockManager.cs | 149 +++++++++ .../Components/Deployment/DmbFactory.cs | 294 ++++++++---------- .../Components/Deployment/DmbLock.cs | 82 +++++ .../Components/Deployment/DreamMaker.cs | 2 +- .../Components/Deployment/IDmbFactory.cs | 12 +- .../Deployment/ILatestCompileJobProvider.cs | 8 +- .../Components/Instance.cs | 4 +- .../Components/InstanceWrapper.cs | 2 +- .../Components/Session/SessionPersistor.cs | 4 +- .../Components/Watchdog/AdvancedWatchdog.cs | 2 +- .../Components/Watchdog/BasicWatchdog.cs | 2 +- .../Components/Watchdog/WatchdogBase.cs | 13 +- .../Components/Watchdog/WindowsWatchdog.cs | 2 +- .../Controllers/DreamDaemonController.cs | 2 +- 15 files changed, 397 insertions(+), 183 deletions(-) create mode 100644 src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs create mode 100644 src/Tgstation.Server.Host/Components/Deployment/DmbLock.cs diff --git a/src/Tgstation.Server.Host/Components/Chat/Commands/PullRequestsCommand.cs b/src/Tgstation.Server.Host/Components/Chat/Commands/PullRequestsCommand.cs index 3fd8c9e64bf..5cc52d2b838 100644 --- a/src/Tgstation.Server.Host/Components/Chat/Commands/PullRequestsCommand.cs +++ b/src/Tgstation.Server.Host/Components/Chat/Commands/PullRequestsCommand.cs @@ -137,7 +137,7 @@ await databaseContextFactory.UseContext( var compileJobToUse = watchdog.ActiveCompileJob; if (hasStaged) { - var latestCompileJob = compileJobProvider.LatestCompileJob(); + var latestCompileJob = await compileJobProvider.LatestCompileJob(); if (latestCompileJob?.Id != compileJobToUse?.Id) compileJobToUse = latestCompileJob; else diff --git a/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs b/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs new file mode 100644 index 00000000000..552fda6c8de --- /dev/null +++ b/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs @@ -0,0 +1,149 @@ +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Threading.Tasks; + +using Microsoft.Extensions.Logging; + +using Tgstation.Server.Host.Models; + +namespace Tgstation.Server.Host.Components.Deployment +{ + /// + /// Manages locks on a given . + /// + sealed class DeploymentLockManager : IAsyncDisposable + { + /// + /// The represented by the . + /// + public CompileJob CompileJob => dmbProvider.CompileJob; + + /// + /// The for the . + /// + readonly ILogger logger; + + /// + /// The the is managing. + /// + readonly IDmbProvider dmbProvider; + + /// + /// The s on the . + /// + readonly HashSet locks; + + /// + /// The first lock acquired by the . + /// + readonly DmbLock firstLock; + + /// + /// Create a . + /// + /// The value of . + /// The value of . + /// The reason for the first lock. + /// The that represents the first lock. + /// The file path of the calling function. + /// The line number of the call invocation. + /// A new . + public static DeploymentLockManager Create(IDmbProvider dmbProvider, ILogger logger, string initialLockReason, out IDmbProvider firstLock, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default) + { + var manager = new DeploymentLockManager(dmbProvider, logger, initialLockReason, callerFile!, callerLine); + firstLock = manager.firstLock; + return manager; + } + + /// + /// Generates a verbose description of a given . + /// + /// The to get a description of. + /// A verbose description of . + static string GetFullLockDescriptor(DmbLock dmbLock) => $"{dmbLock.LockID} {dmbLock.Descriptor} (Created at {dmbLock.LockTime}){(dmbLock.KeptAlive ? " (RELEASED)" : String.Empty)}"; + + /// + /// Initializes a new instance of the class. + /// + /// The value of . + /// The value of . + /// The reason for the first lock. + /// The file path of the calling function. + /// The line number of the call invocation. + /// A new . + DeploymentLockManager(IDmbProvider dmbProvider, ILogger logger, string initialLockReason, string callerFile, int callerLine) + { + this.dmbProvider = dmbProvider ?? throw new ArgumentNullException(nameof(dmbProvider)); + this.logger = logger ?? throw new ArgumentNullException(nameof(logger)); + + logger.LogTrace("Initializing lock manager for compile job {id}", dmbProvider.CompileJob.Id); + locks = new HashSet(); + firstLock = CreateLock(initialLockReason, callerFile, callerLine); + } + + /// + public ValueTask DisposeAsync() + => firstLock.DisposeAsync(); + + /// + /// Add a lock to the managed . + /// + /// The reason for the lock. + /// The file path of the calling function. + /// The line number of the call invocation. + /// A whose lifetime represents the lock. + public IDmbProvider AddLock(string reason, [CallerFilePath] string? callerFile = null, [CallerLineNumber]int callerLine = default) + { + lock (locks) + { + if (locks.Count == 0) + throw new InvalidOperationException($"No locks exist on the DmbProvider for CompileJob {dmbProvider.CompileJob.Id}!"); + + return CreateLock(reason, callerFile!, callerLine); + } + } + + /// + /// Creates a and adds it to . + /// + /// The reason for the lock. + /// The file path of the calling function. + /// The line number of the call invocation. + /// A new . + /// Requires exclusive write access to be held by the caller. + DmbLock CreateLock(string reason, string callerFile, int callerLine) + { + DmbLock? newLock = null; + string? descriptor = null; + ValueTask LockCleanupAction() + { + ValueTask disposeTask = ValueTask.CompletedTask; + lock (locks) + { + logger.LogTrace("Removing .dmb Lock: {descriptor}", descriptor); + + if (locks.Remove(newLock!)) + logger.LogTrace("Lock was removed from list successfully"); + else + logger.LogError("A .dmb lock was disposed more than once: {descriptor}", descriptor); + + if (locks.Count == 0) + disposeTask = dmbProvider.DisposeAsync(); + else if (newLock == firstLock) + logger.LogDebug("First lock on CompileJob #{compileJobId} removed, it must cleanup {remaining} remaining locks to be cleaned", CompileJob.Id, locks.Count); + } + + return disposeTask; + } + + newLock = new DmbLock(LockCleanupAction, dmbProvider, $"{callerFile}#{callerLine}: {reason}"); + locks.Add(newLock); + + descriptor = GetFullLockDescriptor(newLock!); + logger.LogTrace("Created .dmb Lock: {descriptor}", descriptor); + + return newLock; + } + } +} diff --git a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs index fbeb7d3f920..de885df1725 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs @@ -3,7 +3,7 @@ using System.Diagnostics.CodeAnalysis; using System.IO; using System.Linq; -using System.Text; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; @@ -31,14 +31,14 @@ public Task OnNewerDmb { get { - lock (jobLockCounts) + lock (jobLockManagers) return newerDmbTcs.Task; } } /// - [MemberNotNullWhen(true, nameof(nextDmbProvider))] - public bool DmbAvailable => nextDmbProvider != null; + [MemberNotNullWhen(true, nameof(nextLockManager))] + public bool DmbAvailable => nextLockManager != null; /// /// The for the . @@ -78,7 +78,7 @@ public Task OnNewerDmb /// /// Map of s to locks on them. /// - readonly Dictionary jobLockCounts; + readonly Dictionary jobLockManagers; /// /// resulting in the latest yet to exist. @@ -91,9 +91,9 @@ public Task OnNewerDmb Task cleanupTask; /// - /// The latest . + /// The for the latest . /// - IDmbProvider? nextDmbProvider; + DeploymentLockManager? nextLockManager; /// /// If the is "started" via . @@ -127,7 +127,7 @@ public DmbFactory( cleanupTask = Task.CompletedTask; newerDmbTcs = new TaskCompletionSource(); cleanupCts = new CancellationTokenSource(); - jobLockCounts = new Dictionary(); + jobLockManagers = new Dictionary(); } /// @@ -138,10 +138,13 @@ public async ValueTask LoadCompileJob(CompileJob job, Action? activationAc { ArgumentNullException.ThrowIfNull(job); - var newProvider = await FromCompileJob(job, cancellationToken); - if (newProvider == null) + var (dmbProvider, lockManager) = await FromCompileJobInternal(job, "Compile job loading", cancellationToken); + if (dmbProvider == null) return; + if (lockManager == null) + throw new InvalidOperationException($"We did not acquire the first lock for compile job {job.Id}!"); + // Do this first, because it's entirely possible when we set the tcs it will immediately need to be applied if (started) { @@ -149,16 +152,16 @@ public async ValueTask LoadCompileJob(CompileJob job, Action? activationAc metadata, job); await remoteDeploymentManager.StageDeployment( - newProvider.CompileJob, + lockManager.CompileJob, activationAction, cancellationToken); } ValueTask dmbDisposeTask; - lock (jobLockCounts) + lock (jobLockManagers) { - dmbDisposeTask = nextDmbProvider?.DisposeAsync() ?? ValueTask.CompletedTask; - nextDmbProvider = newProvider; + dmbDisposeTask = nextLockManager?.DisposeAsync() ?? ValueTask.CompletedTask; + nextLockManager = lockManager; // Oh god dammit var temp = Interlocked.Exchange(ref newerDmbTcs, new TaskCompletionSource()); @@ -169,20 +172,12 @@ await remoteDeploymentManager.StageDeployment( } /// - public IDmbProvider LockNextDmb(int lockCount) + public IDmbProvider LockNextDmb(string reason, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default) { if (!DmbAvailable) throw new InvalidOperationException("No .dmb available!"); - if (lockCount < 0) - throw new ArgumentOutOfRangeException(nameof(lockCount), lockCount, "lockCount must be greater than or equal to 0!"); - lock (jobLockCounts) - { - var jobId = nextDmbProvider.CompileJob.Require(x => x.Id); - var incremented = jobLockCounts[jobId] += lockCount; - logger.LogTrace("Compile job {jobId} lock increased by: {increment}", jobId, lockCount); - LogLockCounts(); - return nextDmbProvider; - } + + return nextLockManager.AddLock(reason, callerFile, callerLine); } /// @@ -217,8 +212,8 @@ public async Task StopAsync(CancellationToken cancellationToken) { try { - lock (jobLockCounts) - remoteDeploymentManagerFactory.ForgetLocalStateForCompileJobs(jobLockCounts.Keys); + lock (jobLockManagers) + remoteDeploymentManagerFactory.ForgetLocalStateForCompileJobs(jobLockManagers.Keys); using (cancellationToken.Register(() => cleanupCts.Cancel())) await cleanupTask; @@ -231,12 +226,106 @@ public async Task StopAsync(CancellationToken cancellationToken) /// #pragma warning disable CA1506 // TODO: Decomplexify - public async ValueTask FromCompileJob(CompileJob compileJob, CancellationToken cancellationToken) + public async ValueTask FromCompileJob(CompileJob compileJob, string reason, CancellationToken cancellationToken, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default) { ArgumentNullException.ThrowIfNull(compileJob); + ArgumentNullException.ThrowIfNull(reason); + + var (dmb, _) = await FromCompileJobInternal(compileJob, reason, cancellationToken, callerFile, callerLine); + + return dmb; + } + + /// +#pragma warning disable CA1506 // TODO: Decomplexify + public async ValueTask CleanUnusedCompileJobs(CancellationToken cancellationToken) + { + List jobIdsToSkip; + + // don't clean locked directories + lock (jobLockManagers) + jobIdsToSkip = jobLockManagers.Keys.ToList(); + + List? jobUidsToNotErase = null; + // find the uids of locked directories + if (jobIdsToSkip.Count > 0) + { + await databaseContextFactory.UseContext(async db => + { + jobUidsToNotErase = (await db + .CompileJobs + .AsQueryable() + .Where( + x => x.Job.Instance!.Id == metadata.Id + && jobIdsToSkip.Contains(x.Id!.Value)) + .Select(x => x.DirectoryName!.Value) + .ToListAsync(cancellationToken)) + .Select(x => x.ToString()) + .ToList(); + }); + } + else + jobUidsToNotErase = new List(); + + jobUidsToNotErase!.Add(SwappableDmbProvider.LiveGameDirectory); + + logger.LogTrace("We will not clean the following directories: {directoriesToNotClean}", String.Join(", ", jobUidsToNotErase)); + + // cleanup + var gameDirectory = ioManager.ResolvePath(); + await ioManager.CreateDirectory(gameDirectory, cancellationToken); + var directories = await ioManager.GetDirectories(gameDirectory, cancellationToken); + int deleting = 0; + var tasks = directories.Select(async x => + { + var nameOnly = ioManager.GetFileName(x); + if (jobUidsToNotErase.Contains(nameOnly)) + return; + logger.LogDebug("Cleaning unused game folder: {dirName}...", nameOnly); + try + { + ++deleting; + await DeleteCompileJobContent(x, cancellationToken); + } + catch (Exception e) when (e is not OperationCanceledException) + { + logger.LogWarning(e, "Error deleting directory {dirName}!", x); + } + }).ToList(); + if (deleting > 0) + await ValueTaskExtensions.WhenAll(tasks); + } +#pragma warning restore CA1506 + + /// + public async ValueTask LatestCompileJob() + { + if (!DmbAvailable) + return null; + + await using IDmbProvider provider = LockNextDmb("Checking latest CompileJob"); + + return provider.CompileJob; + } + + /// + /// Gets a and potentially the for a given . + /// + /// The to make the for. + /// The reason the compile job needed to be loaded. + /// The for the operation. + /// The file path of the calling function. + /// The line number of the call invocation. + /// A resulting in, on success, a tuple containing new representing the . If the first lock on was acquired, the will also be returned. On failure, Will be returned. + async ValueTask<(IDmbProvider? DmbProvider, DeploymentLockManager? LockManager)> FromCompileJobInternal(CompileJob compileJob, string reason, CancellationToken cancellationToken, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default) + { // ensure we have the entire metadata tree var compileJobId = compileJob.Require(x => x.Id); + lock (jobLockManagers) + if (jobLockManagers.TryGetValue(compileJobId, out var lockManager)) + return (DmbProvider: lockManager.AddLock(reason, callerFile, callerLine), LockManager: null); // fast path + logger.LogTrace("Loading compile job {id}...", compileJobId); await databaseContextFactory.UseContext( async db => compileJob = await db @@ -260,7 +349,7 @@ await databaseContextFactory.UseContext( if (!EngineVersion.TryParse(compileJob.EngineVersion, out var engineVersionNullable)) { logger.LogWarning("Error loading compile job, bad engine version: {engineVersion}", compileJob.EngineVersion); - return null; // omae wa mou shinderu + return (null, null); // omae wa mou shinderu } else engineVersion = engineVersionNullable!; @@ -275,7 +364,6 @@ await databaseContextFactory.UseContext( } var providerSubmitted = false; - void CleanupAction() { if (providerSubmitted) @@ -313,33 +401,34 @@ void CleanupAction() if (!(await primaryCheckTask && await secondaryCheckTask)) { logger.LogWarning("Error loading compile job, .dmb missing!"); - return null; // omae wa mou shinderu + return (null, null); // omae wa mou shinderu } // rebuild the provider because it's using the legacy style directories // Don't dispose it logger.LogDebug("Creating legacy two folder .dmb provider targeting {aDirName} directory...", LegacyADirectoryName); +#pragma warning disable CA2000 // Dispose objects before losing scope (false positive) newProvider = new DmbProvider(compileJob, engineVersion, ioManager, new DisposeInvoker(CleanupAction), Path.DirectorySeparatorChar + LegacyADirectoryName); +#pragma warning restore CA2000 // Dispose objects before losing scope } - lock (jobLockCounts) + lock (jobLockManagers) { - if (!jobLockCounts.TryGetValue(compileJobId, out int value)) + IDmbProvider lockedProvider; + if (!jobLockManagers.TryGetValue(compileJobId, out var lockManager)) { - value = 1; - logger.LogTrace("Initializing lock count for compile job {id}", compileJobId); - jobLockCounts.Add(compileJobId, 1); + lockManager = DeploymentLockManager.Create(newProvider, logger, reason, out lockedProvider); + jobLockManagers.Add(compileJobId, lockManager); + + providerSubmitted = true; } else { - logger.LogTrace("FromCompileJob already had a jobLockCounts entry for {id}. Incrementing lock count to {value}.", compileJobId, value); - jobLockCounts[compileJobId] = ++value; + lockedProvider = lockManager.AddLock(reason, callerFile, callerLine); // race condition + lockManager = null; } - providerSubmitted = true; - - LogLockCounts(); - return newProvider; + return (DmbProvider: lockedProvider, LockManager: lockManager); } } finally @@ -348,77 +437,6 @@ void CleanupAction() await newProvider.DisposeAsync(); } } -#pragma warning restore CA1506 - - /// -#pragma warning disable CA1506 // TODO: Decomplexify - public async ValueTask CleanUnusedCompileJobs(CancellationToken cancellationToken) - { - List jobIdsToSkip; - - // don't clean locked directories - lock (jobLockCounts) - jobIdsToSkip = jobLockCounts.Keys.ToList(); - - List? jobUidsToNotErase = null; - - // find the uids of locked directories - if (jobIdsToSkip.Count > 0) - { - await databaseContextFactory.UseContext(async db => - { - jobUidsToNotErase = (await db - .CompileJobs - .AsQueryable() - .Where( - x => x.Job.Instance!.Id == metadata.Id - && jobIdsToSkip.Contains(x.Id!.Value)) - .Select(x => x.DirectoryName!.Value) - .ToListAsync(cancellationToken)) - .Select(x => x.ToString()) - .ToList(); - }); - } - else - jobUidsToNotErase = new List(); - - jobUidsToNotErase!.Add(SwappableDmbProvider.LiveGameDirectory); - - logger.LogTrace("We will not clean the following directories: {directoriesToNotClean}", String.Join(", ", jobUidsToNotErase)); - - // cleanup - var gameDirectory = ioManager.ResolvePath(); - await ioManager.CreateDirectory(gameDirectory, cancellationToken); - var directories = await ioManager.GetDirectories(gameDirectory, cancellationToken); - int deleting = 0; - var tasks = directories.Select(async x => - { - var nameOnly = ioManager.GetFileName(x); - if (jobUidsToNotErase.Contains(nameOnly)) - return; - logger.LogDebug("Cleaning unused game folder: {dirName}...", nameOnly); - try - { - ++deleting; - await DeleteCompileJobContent(x, cancellationToken); - } - catch (Exception e) when (e is not OperationCanceledException) - { - logger.LogWarning(e, "Error deleting directory {dirName}!", x); - } - }).ToList(); - if (deleting > 0) - await ValueTaskExtensions.WhenAll(tasks); - } -#pragma warning restore CA1506 - - /// - public CompileJob? LatestCompileJob() - { - if (!DmbAvailable) - return null; - return LockNextDmb(0)?.CompileJob; - } /// /// Delete the of . @@ -428,6 +446,9 @@ void CleanRegisteredCompileJob(CompileJob job) { Task HandleCleanup() { + lock (jobLockManagers) + jobLockManagers.Remove(job.Require(x => x.Id)); + var otherTask = cleanupTask; async Task WrapThrowableTasks() @@ -453,26 +474,8 @@ async Task WrapThrowableTasks() return Task.WhenAll(otherTask, WrapThrowableTasks()); } - lock (jobLockCounts) - { - var jobId = job.Require(x => x.Id); - if (jobLockCounts.TryGetValue(jobId, out var currentVal)) - if (currentVal == 1) - { - jobLockCounts.Remove(jobId); - logger.LogDebug("Cleaning lock-free compile job {id} => {dirName}", jobId, job.DirectoryName); - cleanupTask = HandleCleanup(); - } - else - { - var decremented = --jobLockCounts[jobId]; - logger.LogTrace("Compile job {id} lock count now: {lockCount}", jobId, decremented); - } - else - logger.LogError("Extra Dispose of DmbProvider for CompileJob {compileJobId}!", jobId); - - LogLockCounts(); - } + lock (cleanupCts) + cleanupTask = HandleCleanup(); } /// @@ -487,30 +490,5 @@ async ValueTask DeleteCompileJobContent(string directory, CancellationToken canc await eventConsumer.HandleEvent(EventType.DeploymentCleanup, new List { ioManager.ResolvePath(directory) }, true, cancellationToken); await ioManager.DeleteDirectory(directory, cancellationToken); } - - /// - /// Log out the current lock counts to Trace. - /// - /// must be locked before calling this function. - void LogLockCounts() - { - if (jobLockCounts.Count == 0) - { - logger.LogWarning("No compile jobs registered!"); - return; - } - - var builder = new StringBuilder(); - foreach (var jobId in jobLockCounts.Keys) - { - builder.AppendLine(); - builder.Append("\t- "); - builder.Append(jobId); - builder.Append(": "); - builder.Append(jobLockCounts[jobId]); - } - - logger.LogTrace("Compile Job Lock Counts:{details}", builder.ToString()); - } } } diff --git a/src/Tgstation.Server.Host/Components/Deployment/DmbLock.cs b/src/Tgstation.Server.Host/Components/Deployment/DmbLock.cs new file mode 100644 index 00000000000..677da1d915a --- /dev/null +++ b/src/Tgstation.Server.Host/Components/Deployment/DmbLock.cs @@ -0,0 +1,82 @@ +using System; +using System.Threading.Tasks; + +using Tgstation.Server.Api.Models; +using Tgstation.Server.Host.Models; + +namespace Tgstation.Server.Host.Components.Deployment +{ + /// + /// Represents a lock on a given . + /// + sealed class DmbLock : IDmbProvider + { + /// + public string DmbName => baseProvider.DmbName; + + /// + public string Directory => baseProvider.Directory; + + /// + public CompileJob CompileJob => baseProvider.CompileJob; + + /// + public EngineVersion EngineVersion => baseProvider.EngineVersion; + + /// + /// Unique ID of the lock. + /// + public Guid LockID { get; } + + /// + /// The of when the lock was acquired. + /// + public DateTimeOffset LockTime { get; } + + /// + /// A description of the 's purpose. + /// + public string Descriptor { get; } + + /// + /// If was called on the . + /// + public bool KeptAlive { get; private set; } + + /// + /// The being wrapped. + /// + readonly IDmbProvider baseProvider; + + /// + /// A to use as the implementation of . + /// + readonly Func disposeAction; + + /// + /// Initializes a new instance of the class. + /// + /// The value of . + /// The value of . + /// The value of . + public DmbLock(Func disposeAction, IDmbProvider baseProvider, string descriptor) + { + this.disposeAction = disposeAction ?? throw new ArgumentNullException(nameof(disposeAction)); + this.baseProvider = baseProvider ?? throw new ArgumentNullException(nameof(baseProvider)); + Descriptor = descriptor ?? throw new ArgumentNullException(nameof(descriptor)); + + LockID = Guid.NewGuid(); + LockTime = DateTimeOffset.UtcNow; + } + + /// + public ValueTask DisposeAsync() => disposeAction(); + + /// + public void KeepAlive() + { + KeptAlive = true; + baseProvider.KeepAlive(); + } + } +} diff --git a/src/Tgstation.Server.Host/Components/Deployment/DreamMaker.cs b/src/Tgstation.Server.Host/Components/Deployment/DreamMaker.cs index 76c415bac45..1b8eed488d4 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/DreamMaker.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/DreamMaker.cs @@ -326,7 +326,7 @@ await databaseContextFactory.UseContext( likelyPushedTestMergeCommit, cancellationToken); - var activeCompileJob = compileJobConsumer.LatestCompileJob(); + var activeCompileJob = await compileJobConsumer.LatestCompileJob(); try { await databaseContextFactory.UseContext( diff --git a/src/Tgstation.Server.Host/Components/Deployment/IDmbFactory.cs b/src/Tgstation.Server.Host/Components/Deployment/IDmbFactory.cs index 08d1a782b54..c737b8b3585 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/IDmbFactory.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/IDmbFactory.cs @@ -1,4 +1,5 @@ using System; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; @@ -25,17 +26,22 @@ public interface IDmbFactory : ILatestCompileJobProvider, IComponentService, IDi /// /// Gets the next . is a precondition. /// - /// The amount of locks to give the resulting . It's must be called this many times to properly clean the job. + /// The reason the lock is being acquired. + /// The file path of the calling function. + /// The line number of the call invocation. /// A new . - IDmbProvider LockNextDmb(int lockCount); + IDmbProvider LockNextDmb(string reason, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default); /// /// Gets a for a given . /// /// The to make the for. + /// The reason the compile job needed to be loaded. /// The for the operation. + /// The file path of the calling function. + /// The line number of the call invocation. /// A resulting in a new representing the on success, on failure. - ValueTask FromCompileJob(CompileJob compileJob, CancellationToken cancellationToken); + ValueTask FromCompileJob(CompileJob compileJob, string reason, CancellationToken cancellationToken, [CallerFilePath] string? callerFile = null, [CallerLineNumber] int callerLine = default); /// /// Deletes all compile jobs that are inactive in the Game folder. diff --git a/src/Tgstation.Server.Host/Components/Deployment/ILatestCompileJobProvider.cs b/src/Tgstation.Server.Host/Components/Deployment/ILatestCompileJobProvider.cs index 021f62602d7..253fe939faa 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/ILatestCompileJobProvider.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/ILatestCompileJobProvider.cs @@ -1,4 +1,6 @@ -using Tgstation.Server.Host.Models; +using System.Threading.Tasks; + +using Tgstation.Server.Host.Models; namespace Tgstation.Server.Host.Components.Deployment { @@ -10,7 +12,7 @@ public interface ILatestCompileJobProvider /// /// Gets the latest . /// - /// The latest or if none are available. - CompileJob? LatestCompileJob(); + /// A resulting in the latest or if none are available. + ValueTask LatestCompileJob(); } } diff --git a/src/Tgstation.Server.Host/Components/Instance.cs b/src/Tgstation.Server.Host/Components/Instance.cs index f6d417eb7a3..d178eb94e0f 100644 --- a/src/Tgstation.Server.Host/Components/Instance.cs +++ b/src/Tgstation.Server.Host/Components/Instance.cs @@ -258,7 +258,7 @@ public async ValueTask ScheduleAutoUpdate(uint newInterval, string? newCron) } /// - public CompileJob? LatestCompileJob() => dmbFactory.LatestCompileJob(); + public ValueTask LatestCompileJob() => dmbFactory.LatestCompileJob(); /// /// The for updating the repository. @@ -576,7 +576,7 @@ await jobManager.RegisterOperation( continue; } - if (deploySha == LatestCompileJob()?.RevisionInformation.CommitSha) + if (deploySha == (await LatestCompileJob())?.RevisionInformation.CommitSha) { logger.LogTrace("Aborting auto update, same revision as latest CompileJob"); continue; diff --git a/src/Tgstation.Server.Host/Components/InstanceWrapper.cs b/src/Tgstation.Server.Host/Components/InstanceWrapper.cs index 249342fc379..b5a835420d9 100644 --- a/src/Tgstation.Server.Host/Components/InstanceWrapper.cs +++ b/src/Tgstation.Server.Host/Components/InstanceWrapper.cs @@ -58,6 +58,6 @@ public InstanceWrapper() public ValueTask ScheduleAutoUpdate(uint newInterval, string? newCron) => Instance.ScheduleAutoUpdate(newInterval, newCron); /// - public CompileJob? LatestCompileJob() => Instance.LatestCompileJob(); + public ValueTask LatestCompileJob() => Instance.LatestCompileJob(); } } diff --git a/src/Tgstation.Server.Host/Components/Session/SessionPersistor.cs b/src/Tgstation.Server.Host/Components/Session/SessionPersistor.cs index d6615647231..5c87bfda5f0 100644 --- a/src/Tgstation.Server.Host/Components/Session/SessionPersistor.cs +++ b/src/Tgstation.Server.Host/Components/Session/SessionPersistor.cs @@ -208,7 +208,7 @@ await databaseContextFactory.UseContext(async (db) => return null; } - var dmb = await dmbFactory.FromCompileJob(result!.CompileJob!, cancellationToken); + var dmb = await dmbFactory.FromCompileJob(result!.CompileJob!, "Session Loading Main Deployment", cancellationToken); if (dmb == null) { logger.LogError("Unable to reattach! Could not load .dmb!"); @@ -230,7 +230,7 @@ await db if (result.InitialCompileJob != null) { logger.LogTrace("Loading initial compile job..."); - initialDmb = await dmbFactory.FromCompileJob(result.InitialCompileJob, cancellationToken); + initialDmb = await dmbFactory.FromCompileJob(result.InitialCompileJob, "Session Loading Initial Deployment", cancellationToken); } logger.LogTrace("Retrieved ReattachInformation"); diff --git a/src/Tgstation.Server.Host/Components/Watchdog/AdvancedWatchdog.cs b/src/Tgstation.Server.Host/Components/Watchdog/AdvancedWatchdog.cs index 610fe25bba3..b58db536015 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/AdvancedWatchdog.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/AdvancedWatchdog.cs @@ -232,7 +232,7 @@ async Task CleanupLingeringDeployment() /// protected sealed override async ValueTask HandleNewDmbAvailable(CancellationToken cancellationToken) { - IDmbProvider compileJobProvider = DmbFactory.LockNextDmb(1); + IDmbProvider compileJobProvider = DmbFactory.LockNextDmb("AdvancedWatchdog next compile job preload"); bool canSeamlesslySwap = CanUseSwappableDmbProvider(compileJobProvider); if (canSeamlesslySwap) if (compileJobProvider.CompileJob.EngineVersion != ActiveCompileJob!.EngineVersion) diff --git a/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs b/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs index edc50518d45..5bc8089824d 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/BasicWatchdog.cs @@ -214,7 +214,7 @@ protected override async ValueTask InitController( { // don't need a new dmb if reattaching var reattachInProgress = reattachInfo != null; - var dmbToUse = reattachInProgress ? null : DmbFactory.LockNextDmb(1); + var dmbToUse = reattachInProgress ? null : DmbFactory.LockNextDmb("Watchdog initialization"); // if this try catches something, both servers are killed try diff --git a/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs b/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs index ec3ae853469..e8d8290aa09 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/WatchdogBase.cs @@ -815,21 +815,18 @@ await AsyncDelayer.Delay( /// /// The session's current . /// A that completes if and when a newer is available. - Task InitialCheckDmbUpdated(CompileJob currentCompileJob) + async Task InitialCheckDmbUpdated(CompileJob currentCompileJob) { var factoryTask = DmbFactory.OnNewerDmb; - var latestCompileJob = DmbFactory.LatestCompileJob(); - if (latestCompileJob == null) - return factoryTask; - - if (latestCompileJob.Id != currentCompileJob.Id) + var latestCompileJob = await DmbFactory.LatestCompileJob(); + if (latestCompileJob != null && latestCompileJob.Id != currentCompileJob.Id) { Logger.LogDebug("Found new CompileJob without waiting"); - return Task.CompletedTask; + return; } - return factoryTask; + await factoryTask; } /// diff --git a/src/Tgstation.Server.Host/Components/Watchdog/WindowsWatchdog.cs b/src/Tgstation.Server.Host/Components/Watchdog/WindowsWatchdog.cs index e75ba745119..b03cad9bae3 100644 --- a/src/Tgstation.Server.Host/Components/Watchdog/WindowsWatchdog.cs +++ b/src/Tgstation.Server.Host/Components/Watchdog/WindowsWatchdog.cs @@ -87,7 +87,7 @@ protected override async ValueTask ApplyInitialDmb(CancellationToken cancellatio return; } - Server.ReattachInformation.InitialDmb = await DmbFactory.FromCompileJob(Server.CompileJob, cancellationToken); + Server.ReattachInformation.InitialDmb = await DmbFactory.FromCompileJob(Server.CompileJob, "WindowsWatchdog Initial Deployment", cancellationToken); } /// diff --git a/src/Tgstation.Server.Host/Controllers/DreamDaemonController.cs b/src/Tgstation.Server.Host/Controllers/DreamDaemonController.cs index 17006bfa2e2..11e8fc1d165 100644 --- a/src/Tgstation.Server.Host/Controllers/DreamDaemonController.cs +++ b/src/Tgstation.Server.Host/Controllers/DreamDaemonController.cs @@ -386,7 +386,7 @@ ValueTask ReadImpl(DreamDaemonSettings? settings, bool knownForce if (revision) { - var latestCompileJob = instance.LatestCompileJob(); + var latestCompileJob = await instance.LatestCompileJob(); result.ActiveCompileJob = ((instance.Watchdog.Status != WatchdogStatus.Offline ? dd.ActiveCompileJob : latestCompileJob) ?? latestCompileJob) From cbb1c27b19fe5d3ee811c89b2d082bda2330e609 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Sun, 16 Jun 2024 12:13:05 -0400 Subject: [PATCH 14/19] Add periodic deployment lock state logging --- .../Deployment/DeploymentLockManager.cs | 17 ++++++ .../Components/Deployment/DmbFactory.cs | 56 ++++++++++++++++++- .../Components/InstanceFactory.cs | 1 + 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs b/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs index 552fda6c8de..46efbfdbc90 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/DeploymentLockManager.cs @@ -1,6 +1,8 @@ using System; using System.Collections.Generic; +using System.Globalization; using System.Runtime.CompilerServices; +using System.Text; using System.Threading.Tasks; using Microsoft.Extensions.Logging; @@ -95,6 +97,7 @@ public ValueTask DisposeAsync() /// A whose lifetime represents the lock. public IDmbProvider AddLock(string reason, [CallerFilePath] string? callerFile = null, [CallerLineNumber]int callerLine = default) { + ArgumentNullException.ThrowIfNull(reason); lock (locks) { if (locks.Count == 0) @@ -104,6 +107,20 @@ public IDmbProvider AddLock(string reason, [CallerFilePath] string? callerFile = } } + /// + /// Add lock stats to a given . + /// + /// The to append to. + public void LogLockStats(StringBuilder stringBuilder) + { + ArgumentNullException.ThrowIfNull(stringBuilder); + + stringBuilder.AppendLine(CultureInfo.InvariantCulture, $"Compile Job #{CompileJob.Id}: {CompileJob.DirectoryName}"); + lock (locks) + foreach (var dmbLock in locks) + stringBuilder.AppendLine(CultureInfo.InvariantCulture, $"\t-{GetFullLockDescriptor(dmbLock)}"); + } + /// /// Creates a and adds it to . /// diff --git a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs index de885df1725..ab2349d3a23 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs @@ -4,6 +4,7 @@ using System.IO; using System.Linq; using System.Runtime.CompilerServices; +using System.Text; using System.Threading; using System.Threading.Tasks; @@ -61,10 +62,15 @@ public Task OnNewerDmb readonly ILogger logger; /// - /// The for . + /// The for the . /// readonly IEventConsumer eventConsumer; + /// + /// The for the . + /// + readonly IAsyncDelayer asyncDelayer; + /// /// The for the . /// @@ -75,6 +81,11 @@ public Task OnNewerDmb /// readonly CancellationTokenSource cleanupCts; + /// + /// The for . + /// + readonly CancellationTokenSource lockLogCts; + /// /// Map of s to locks on them. /// @@ -107,6 +118,7 @@ public Task OnNewerDmb /// The value of . /// The value of . /// The value of . + /// The value of . /// The value of . /// The value of . public DmbFactory( @@ -114,6 +126,7 @@ public DmbFactory( IIOManager ioManager, IRemoteDeploymentManagerFactory remoteDeploymentManagerFactory, IEventConsumer eventConsumer, + IAsyncDelayer asyncDelayer, ILogger logger, Api.Models.Instance metadata) { @@ -121,17 +134,24 @@ public DmbFactory( this.ioManager = ioManager ?? throw new ArgumentNullException(nameof(ioManager)); this.remoteDeploymentManagerFactory = remoteDeploymentManagerFactory ?? throw new ArgumentNullException(nameof(remoteDeploymentManagerFactory)); this.eventConsumer = eventConsumer ?? throw new ArgumentNullException(nameof(eventConsumer)); + this.asyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer)); this.logger = logger ?? throw new ArgumentNullException(nameof(logger)); this.metadata = metadata ?? throw new ArgumentNullException(nameof(metadata)); cleanupTask = Task.CompletedTask; newerDmbTcs = new TaskCompletionSource(); cleanupCts = new CancellationTokenSource(); + lockLogCts = new CancellationTokenSource(); jobLockManagers = new Dictionary(); } /// - public void Dispose() => cleanupCts.Dispose(); // we don't dispose nextDmbProvider here, since it might be the only thing we have + public void Dispose() + { + // we don't dispose nextDmbProvider here, since it might be the only thing we have + lockLogCts.Dispose(); + cleanupCts.Dispose(); + } /// public async ValueTask LoadCompileJob(CompileJob job, Action? activationAction, CancellationToken cancellationToken) @@ -205,6 +225,7 @@ await databaseContextFactory.UseContext( } // we dont do CleanUnusedCompileJobs here because the watchdog may have plans for them yet + cleanupTask = Task.WhenAll(cleanupTask, LogLockStates()); } /// @@ -212,6 +233,8 @@ public async Task StopAsync(CancellationToken cancellationToken) { try { + lockLogCts.Cancel(); + lock (jobLockManagers) remoteDeploymentManagerFactory.ForgetLocalStateForCompileJobs(jobLockManagers.Keys); @@ -490,5 +513,34 @@ async ValueTask DeleteCompileJobContent(string directory, CancellationToken canc await eventConsumer.HandleEvent(EventType.DeploymentCleanup, new List { ioManager.ResolvePath(directory) }, true, cancellationToken); await ioManager.DeleteDirectory(directory, cancellationToken); } + + /// + /// Lock all s states. + /// + /// A representing the running operation. + async Task LogLockStates() + { + logger.LogTrace("Entering lock logging loop"); + CancellationToken cancellationToken = lockLogCts.Token; + + while (!cancellationToken.IsCancellationRequested) + try + { + var builder = new StringBuilder(); + + lock (jobLockManagers) + foreach (var lockManager in jobLockManagers.Values) + lockManager.LogLockStats(builder); + + logger.LogDebug("Periodic deployment log states report (R.e. Issue #1779):{newLine}{report}", Environment.NewLine, builder); // TODO: Reduce to trace once #1779 is fixed + + await asyncDelayer.Delay(TimeSpan.FromMinutes(10), cancellationToken); + } + catch (OperationCanceledException ex) + { + logger.LogTrace(ex, "Exiting lock logging loop"); + break; + } + } } } diff --git a/src/Tgstation.Server.Host/Components/InstanceFactory.cs b/src/Tgstation.Server.Host/Components/InstanceFactory.cs index f12eb28fa5e..70ef1dd0284 100644 --- a/src/Tgstation.Server.Host/Components/InstanceFactory.cs +++ b/src/Tgstation.Server.Host/Components/InstanceFactory.cs @@ -290,6 +290,7 @@ public async ValueTask CreateInstance(IBridgeRegistrar bridgeRegistra gameIoManager, remoteDeploymentManagerFactory, eventConsumer, + asyncDelayer, loggerFactory.CreateLogger(), metadata); try From 03dfbbaa65c658f801cf362420e80a1c7ed18ae2 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 18:00:08 -0400 Subject: [PATCH 15/19] Promote this warning to an error --- src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs index ab2349d3a23..e71385439d7 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/DmbFactory.cs @@ -371,7 +371,7 @@ await databaseContextFactory.UseContext( EngineVersion engineVersion; if (!EngineVersion.TryParse(compileJob.EngineVersion, out var engineVersionNullable)) { - logger.LogWarning("Error loading compile job, bad engine version: {engineVersion}", compileJob.EngineVersion); + logger.LogError("Error loading compile job, bad engine version: {engineVersion}", compileJob.EngineVersion); return (null, null); // omae wa mou shinderu } else From 6748193fcbb3b3cacc646e10f6c5d23789e41c67 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 18:00:19 -0400 Subject: [PATCH 16/19] Promote this debug message to info --- src/Tgstation.Server.Host/Components/Engine/EngineManager.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tgstation.Server.Host/Components/Engine/EngineManager.cs b/src/Tgstation.Server.Host/Components/Engine/EngineManager.cs index a7cbd10f4f0..f2a60ae5b9f 100644 --- a/src/Tgstation.Server.Host/Components/Engine/EngineManager.cs +++ b/src/Tgstation.Server.Host/Components/Engine/EngineManager.cs @@ -466,7 +466,7 @@ async ValueTask AssertAndLockVersion( logger.LogWarning("The required engine version ({version}) is not readily available! We will have to install it.", version); } else - logger.LogDebug("Requested engine version {version} not currently installed. Doing so now...", version); + logger.LogInformation("Requested engine version {version} not currently installed. Doing so now...", version); if (progressReporter != null) progressReporter.StageName = "Running event"; From d7ecfd0ed1e3e5b18e0ea9e06f4d7842c41f6710 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 18:00:48 -0400 Subject: [PATCH 17/19] Fix a weird overflow error I've seen --- tests/Tgstation.Server.Tests/Live/Instance/JobsRequiredTest.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/Tgstation.Server.Tests/Live/Instance/JobsRequiredTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/JobsRequiredTest.cs index c6501e6766a..d5125e749c9 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/JobsRequiredTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/JobsRequiredTest.cs @@ -162,7 +162,8 @@ protected async Task WaitForJobProgressThenCancel(JobResponse origi await JobsClient.Cancel(job, cancellationToken); - timeout -= (int)Math.Ceiling((DateTimeOffset.UtcNow - start).TotalSeconds); + timeout -= Math.Max(0, (int)Math.Ceiling((DateTimeOffset.UtcNow - start).TotalSeconds)); + timeout = Math.Max(0, timeout); return await WaitForJob(job, timeout, false, null, cancellationToken); } } From d370c2cafba9023d6e762d13f7d40938cba5f9e4 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Fri, 12 Jul 2024 18:01:46 -0400 Subject: [PATCH 18/19] Version bump to 6.6.1 --- build/Version.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/Version.props b/build/Version.props index ae7135aacbf..e8e62c7dc6f 100644 --- a/build/Version.props +++ b/build/Version.props @@ -3,7 +3,7 @@ - 6.6.0 + 6.6.1 5.1.0 10.4.0 7.0.0 From 8401f3c3e18224e8ae377854335571ddae92fc51 Mon Sep 17 00:00:00 2001 From: Jordan Dominion Date: Sat, 13 Jul 2024 19:36:39 -0400 Subject: [PATCH 19/19] Fix game directory leaks in `HardLinkDmbProvider` - Adjust tests to account for expected directories correctly. - Better logging of leftover directories. - Fix reboot not being waited on in a test case. --- .../Deployment/HardLinkDmbProvider.cs | 106 +++++++++++++----- .../Live/Instance/InstanceTest.cs | 11 +- .../Live/Instance/WatchdogTest.cs | 9 +- .../Live/TestLiveServer.cs | 2 +- 4 files changed, 93 insertions(+), 35 deletions(-) diff --git a/src/Tgstation.Server.Host/Components/Deployment/HardLinkDmbProvider.cs b/src/Tgstation.Server.Host/Components/Deployment/HardLinkDmbProvider.cs index 09cdfb31d5e..ed6e72b4b6a 100644 --- a/src/Tgstation.Server.Host/Components/Deployment/HardLinkDmbProvider.cs +++ b/src/Tgstation.Server.Host/Components/Deployment/HardLinkDmbProvider.cs @@ -31,7 +31,7 @@ sealed class HardLinkDmbProvider : SwappableDmbProvider /// /// The representing the base provider mirroring operation. /// - readonly Task mirroringTask; + readonly Task mirroringTask; /// /// The for the . @@ -77,13 +77,27 @@ public override async ValueTask DisposeAsync() { cancellationTokenSource.Cancel(); cancellationTokenSource.Dispose(); - try + var mirroredDir = await mirroringTask; + if (mirroredDir != null && !Swapped) { - await mirroringTask; - } - catch (OperationCanceledException ex) - { - logger.LogDebug(ex, "Mirroring task cancelled!"); + logger.LogDebug("Cancelled mirroring task, we must cleanup!"); + + // We shouldn't be doing long running I/O ops because this could be under an HTTP request (DELETE /api/DreamDaemon) + // dirty shit to follow: + async void AsyncCleanup() + { + try + { + await IOManager.DeleteDirectory(mirroredDir, CancellationToken.None); // DCT: None available + logger.LogTrace("Completed async cleanup of unused mirror directory: {mirroredDir}", mirroredDir); + } + catch (Exception ex) + { + logger.LogError(ex, "Error cleaning up mirrored directory {mirroredDir}!", mirroredDir); + } + } + + AsyncCleanup(); } await base.DisposeAsync(); @@ -103,6 +117,13 @@ protected override async ValueTask DoSwap(CancellationToken cancellationToken) { logger.LogTrace("Begin DoSwap, mirroring task complete: {complete}...", mirroringTask.IsCompleted); var mirroredDir = await mirroringTask.WaitAsync(cancellationToken); + if (mirroredDir == null) + { + // huh, how? + cancellationToken.ThrowIfCancellationRequested(); + throw new InvalidOperationException("mirroringTask was cancelled without us being cancelled?"); + } + var goAheadTcs = new TaskCompletionSource(); // I feel dirty... @@ -119,6 +140,7 @@ async void DisposeOfOldDirectory() goAheadTcs.SetResult(); logger.LogTrace("Deleting old Live directory {path}...", disposePath); await IOManager.DeleteDirectory(disposePath, CancellationToken.None); // DCT: We're detached at this point + logger.LogTrace("Completed async cleanup of old Live directory: {disposePath}", disposePath); } catch (DirectoryNotFoundException ex) { @@ -148,31 +170,59 @@ async void DisposeOfOldDirectory() /// The launch level. /// The for the operation. /// A resulting in the full path to the mirrored directory. - async Task MirrorSourceDirectory(int? taskThrottle, DreamDaemonSecurity securityLevel, CancellationToken cancellationToken) + async Task MirrorSourceDirectory(int? taskThrottle, DreamDaemonSecurity securityLevel, CancellationToken cancellationToken) { - var stopwatch = Stopwatch.StartNew(); - var mirrorGuid = Guid.NewGuid(); - logger.LogDebug("Starting to mirror {sourceDir} as hard links to {mirrorGuid}...", CompileJob.DirectoryName, mirrorGuid); if (taskThrottle.HasValue && taskThrottle < 1) throw new ArgumentOutOfRangeException(nameof(taskThrottle), taskThrottle, "taskThrottle must be at least 1!"); - var src = IOManager.ResolvePath(CompileJob.DirectoryName!.Value.ToString()); - var dest = IOManager.ResolvePath(mirrorGuid.ToString()); - - using var semaphore = taskThrottle.HasValue ? new SemaphoreSlim(taskThrottle.Value) : null; - await Task.WhenAll(MirrorDirectoryImpl( - src, - dest, - semaphore, - securityLevel, - cancellationToken)); - stopwatch.Stop(); - - logger.LogDebug( - "Finished mirror of {sourceDir} to {mirrorGuid} in {seconds}s...", - CompileJob.DirectoryName, - mirrorGuid, - stopwatch.Elapsed.TotalSeconds.ToString("0.##", CultureInfo.InvariantCulture)); + string? dest = null; + try + { + var stopwatch = Stopwatch.StartNew(); + var mirrorGuid = Guid.NewGuid(); + + logger.LogDebug("Starting to mirror {sourceDir} as hard links to {mirrorGuid}...", CompileJob.DirectoryName, mirrorGuid); + + var src = IOManager.ResolvePath(CompileJob.DirectoryName!.Value.ToString()); + dest = IOManager.ResolvePath(mirrorGuid.ToString()); + + using var semaphore = taskThrottle.HasValue ? new SemaphoreSlim(taskThrottle.Value) : null; + await Task.WhenAll(MirrorDirectoryImpl( + src, + dest, + semaphore, + securityLevel, + cancellationToken)); + stopwatch.Stop(); + + logger.LogDebug( + "Finished mirror of {sourceDir} to {mirrorGuid} in {seconds}s...", + CompileJob.DirectoryName, + mirrorGuid, + stopwatch.Elapsed.TotalSeconds.ToString("0.##", CultureInfo.InvariantCulture)); + } + catch (OperationCanceledException ex) + { + logger.LogDebug(ex, "Cancelled while mirroring"); + } + catch (Exception ex) + { + logger.LogError(ex, "Could not mirror!"); + + if (dest != null) + try + { + logger.LogDebug("Cleaning up mirror attempt: {dest}", dest); + await IOManager.DeleteDirectory(dest, cancellationToken); + } + catch (OperationCanceledException ex2) + { + logger.LogDebug(ex2, "Errored cleanup cancellation edge case!"); + return dest; + } + + return null; + } return dest; } diff --git a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs index d84ac0b6c47..9c2b44c9624 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/InstanceTest.cs @@ -74,7 +74,12 @@ public async Task RunTests( ddPort, usingBasicWatchdog); await wdt.Run(cancellationToken); - await wdt.ExpectGameDirectoryCount(2, cancellationToken); + + await wdt.ExpectGameDirectoryCount( + usingBasicWatchdog || new PlatformIdentifier().IsWindows + ? 2 // old + new deployment + : 3, // + new mirrored deployment waiting to take over Live + cancellationToken); } public static async ValueTask DownloadEngineVersion( @@ -285,7 +290,9 @@ await instanceClient.Repository.Update(new RepositoryUpdateRequest await instanceClient.DreamDaemon.Shutdown(cancellationToken); - await wdt.ExpectGameDirectoryCount(1, cancellationToken); + await wdt.ExpectGameDirectoryCount( + 1, // current deployment + cancellationToken); await instanceManagerClient.Update(new InstanceUpdateRequest { diff --git a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs index 9227e491ae6..db91b6f1961 100644 --- a/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs +++ b/tests/Tgstation.Server.Tests/Live/Instance/WatchdogTest.cs @@ -633,15 +633,16 @@ async Task TestDMApiFreeDeploy(CancellationToken cancellationToken) public async ValueTask ExpectGameDirectoryCount(int expected, CancellationToken cancellationToken) { + string[] lastDirectories; int CountNonLiveDirs() { - var directories = Directory.GetDirectories(Path.Combine(instanceClient.Metadata.Path, "Game")); - return directories.Where(directory => Path.GetFileName(directory) != "Live").Count(); + lastDirectories = Directory.GetDirectories(Path.Combine(instanceClient.Metadata.Path, "Game")); + return lastDirectories.Where(directory => Path.GetFileName(directory) != "Live").Count(); } int nonLiveDirs = 0; // cleanup task is async - for(var i = 0; i < 15; ++i) + for(var i = 0; i < 20; ++i) { nonLiveDirs = CountNonLiveDirs(); if (expected == nonLiveDirs) @@ -651,7 +652,7 @@ int CountNonLiveDirs() } nonLiveDirs = CountNonLiveDirs(); - Assert.AreEqual(expected, nonLiveDirs); + Assert.AreEqual(expected, nonLiveDirs, $"Directories present: {String.Join(", ", lastDirectories.Select(Path.GetFileName))}"); } async Task RunBasicTest(CancellationToken cancellationToken) diff --git a/tests/Tgstation.Server.Tests/Live/TestLiveServer.cs b/tests/Tgstation.Server.Tests/Live/TestLiveServer.cs index b218b146802..bfc4538c685 100644 --- a/tests/Tgstation.Server.Tests/Live/TestLiveServer.cs +++ b/tests/Tgstation.Server.Tests/Live/TestLiveServer.cs @@ -1724,7 +1724,7 @@ async Task WaitForInitialJobs(IInstanceClient instanceClient) Assert.AreEqual(dd.StagedCompileJob.Job.Id, compileJob.Id); expectedCompileJobId = compileJob.Id.Value; - dd = await wdt.TellWorldToReboot(server.UsingBasicWatchdog, cancellationToken); + dd = await wdt.TellWorldToReboot(true, cancellationToken); Assert.AreEqual(dd.ActiveCompileJob.Job.Id, expectedCompileJobId); Assert.AreEqual(WatchdogStatus.Online, dd.Status.Value);