From 574066b30b32ed9742042706b59ea0c12d0541d8 Mon Sep 17 00:00:00 2001
From: Marc Huisinga
Date: Wed, 7 Aug 2024 08:19:33 +0200
Subject: [PATCH] fix: language server windows issues (#4821)

This PR resolves two language server bugs that especially affect Windows users:

1. Editing the header could result in the watchdog not correctly restarting the file worker
   (#3786, #3787), which would lead to the file seemingly being processed forever.
   - The cause of this issue was a race condition in the watchdog that was accidentally introduced
     as far back as #1884: In specific circumstances, the watchdog attempts to forward a message to
     the file worker after the process has exited due to a changed header, but before the watchdog
     has noticed that the file worker exited (which would normally make it restart the file worker).
     In this case, the watchdog would mark the file worker as having crashed and would not look at
     its exit code to restart it, but instead treat it like a crashed file worker that is only
     restarted when the file is edited again. Not inspecting the exit code when the crash was
     detected while forwarding a message from the file worker is necessary: we do not restart the
     file worker until another notification from the client arrives, so the main loop of the
     watchdog would read the same crash exit code over and over again if we did not remove the
     worker from the list of file workers that we listen to.
   - This PR resolves the issue by distinguishing between "crashes when forwarding messages to the
     file worker" and "crashes when forwarding messages from the file worker". In the former case,
     we still inspect the exit code of the file worker and potentially restart it if the imports
     changed, whereas in the latter case, we stop inspecting the exit code. This is correct because
     the latter case is exactly the one where we need to stop inspecting the exit code but where a
     crash cannot occur as a result of a changed header, whereas the former case is exactly the one
     where we still need to inspect the exit code after a crash to ensure that we restart the file
     worker in case it exited because the header changed.
   - At some point in the future, it would be nice to revamp the concurrency model of the watchdog
     entirely, now that we have all those fancy concurrency primitives that were not available four
     years ago when the watchdog was first written.
2. On an especially slow Windows machine, we found that starting the language server would
   sometimes not succeed at all because reading from the stdin pipe in the watchdog produced an
   EINVAL error, which was in turn caused by an NT "pipe empty" error.
   - After lots of debugging, @Kha found that Lake accidentally passes its stdin to Git because it
     does not explicitly set the `stdin` field to `null` when spawning the process.
   - Changing this fixes the issue, which suggests that Git may mutate the pipe we pass to it to be
     non-blocking, which then causes a "pipe empty" error in the watchdog when we also attempt to
     read from that same pipe.
   - I'm still very uncertain why we only saw this issue on one particularly slow machine and not
     across the whole ecosystem.

This PR also resolves an issue where messages received while a file worker was being restarted were
not correctly forwarded to the new file worker after the restart.

Closes #3786, closes #3787.
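To illustrate the Lake fix, here is a minimal, self-contained sketch (not code from this patch; the
helper name `spawnWithNullStdin` is made up) of the spawning pattern that keeps our own stdin pipe
out of reach of child processes such as Git:

```lean
-- Sketch only: spawn a child process with all standard streams explicitly set to `null`,
-- so the child cannot inherit (and potentially reconfigure) the pipe that the watchdog
-- itself reads from.
def spawnWithNullStdin (args : IO.Process.SpawnArgs) : IO UInt32 := do
  let child ← IO.Process.spawn { args with
    stdin  := IO.Process.Stdio.null
    stdout := IO.Process.Stdio.null
    stderr := IO.Process.Stdio.null }
  child.wait
```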
---------

Co-authored-by: Sebastian Ullrich
---
 src/Lean/Server/Watchdog.lean | 60 ++++++++++++++++++++++++-----------
 src/lake/Lake/Util/Proc.lean  |  1 +
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean
index f49cd50ffc8b..cb1eef0d58c3 100644
--- a/src/Lean/Server/Watchdog.lean
+++ b/src/Lean/Server/Watchdog.lean
@@ -90,6 +90,10 @@ section Utils
     | crashed (e : IO.Error)
     | ioError (e : IO.Error)
 
+  inductive CrashOrigin
+    | fileWorkerToClientForwarding
+    | clientToFileWorkerForwarding
+
   inductive WorkerState where
     /-- The watchdog can detect a crashed file worker in two places: When trying to send a message
     to the file worker and when reading a request reply.
@@ -98,7 +102,7 @@ section Utils
     that are in-flight are errored. Upon receiving the next packet for that file worker, the file
     worker is restarted and the packet is forwarded to it. If the crash was detected while writing
     a packet, we queue that packet until the next packet for the file worker arrives. -/
-    | crashed (queuedMsgs : Array JsonRpc.Message)
+    | crashed (queuedMsgs : Array JsonRpc.Message) (origin : CrashOrigin)
     | running
 
   abbrev PendingRequestMap := RBMap RequestID JsonRpc.Message compare
@@ -136,6 +140,11 @@ section FileWorker
       for ⟨id, _⟩ in pendingRequests do
         hError.writeLspResponseError { id := id, code := code, message := msg }
 
+    def queuedMsgs (fw : FileWorker) : Array JsonRpc.Message :=
+      match fw.state with
+      | .running => #[]
+      | .crashed queuedMsgs _ => queuedMsgs
+
   end FileWorker
 
 end FileWorker
@@ -404,10 +413,23 @@ section ServerM
       return
     eraseFileWorker uri
 
-  def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) : ServerM Unit := do
+  def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) (origin : CrashOrigin) : ServerM Unit := do
     let some fw ← findFileWorker? uri
       | return
-    updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs }
+    updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs origin }
+
+  def tryDischargeQueuedMessages (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) : ServerM Unit := do
+    let some fw ← findFileWorker? uri
+      | throwServerError "Cannot find file worker for '{uri}'."
+    let mut crashedMsgs := #[]
+    -- Try to discharge all queued msgs, tracking the ones that we can't discharge
+    for msg in queuedMsgs do
+      try
+        fw.stdin.writeLspMessage msg
+      catch _ =>
+        crashedMsgs := crashedMsgs.push msg
+    if ¬ crashedMsgs.isEmpty then
+      handleCrash uri crashedMsgs .clientToFileWorkerForwarding
 
   /-- Tries to write a message, sets the state of the FileWorker to `crashed` if it does not
     succeed and restarts the file worker if the `crashed` flag was already set. Just logs an error if
@@ -423,7 +445,7 @@ section ServerM
     let some fw ← findFileWorker? uri
       | return
     match fw.state with
-    | WorkerState.crashed queuedMsgs =>
+    | WorkerState.crashed queuedMsgs _ =>
       let mut queuedMsgs := queuedMsgs
       if queueFailedMessage then
         queuedMsgs := queuedMsgs.push msg
@@ -432,17 +454,7 @@ section ServerM
       -- restart the crashed FileWorker
       eraseFileWorker uri
       startFileWorker fw.doc
-      let some newFw ← findFileWorker? uri
-        | throwServerError "Cannot find file worker for '{uri}'."
-      let mut crashedMsgs := #[]
-      -- try to discharge all queued msgs, tracking the ones that we can't discharge
-      for msg in queuedMsgs do
-        try
-          newFw.stdin.writeLspMessage msg
-        catch _ =>
-          crashedMsgs := crashedMsgs.push msg
-      if ¬ crashedMsgs.isEmpty then
-        handleCrash uri crashedMsgs
+      tryDischargeQueuedMessages uri queuedMsgs
     | WorkerState.running =>
       let initialQueuedMsgs :=
        if queueFailedMessage then
@@ -452,7 +464,7 @@
       try
         fw.stdin.writeLspMessage msg
       catch _ =>
-        handleCrash uri initialQueuedMsgs
+        handleCrash uri initialQueuedMsgs .clientToFileWorkerForwarding
 
   /--
     Sends a notification to the file worker identified by `uri` that its dependency `staleDependency`
@@ -955,7 +967,16 @@ section MainLoop
     let workers ← st.fileWorkersRef.get
     let mut workerTasks := #[]
     for (_, fw) in workers do
-      if let WorkerState.running := fw.state then
+      -- When the forwarding task crashes, its return value will be stuck at
+      -- `WorkerEvent.crashed _`.
+      -- We want to handle this event only once, not over and over again,
+      -- so once the state becomes `WorkerState.crashed _ .fileWorkerToClientForwarding`
+      -- as a result of `WorkerEvent.crashed _`, we stop handling this event until
+      -- eventually the file worker is restarted by a notification from the client.
+      -- We do not want to filter the forwarding task in case of
+      -- `WorkerState.crashed _ .clientToFileWorkerForwarding`, since the forwarding task
+      -- exit code may still contain valuable information in this case (e.g. that the imports changed).
+      if !(fw.state matches WorkerState.crashed _ .fileWorkerToClientForwarding) then
         workerTasks := workerTasks.push <| fw.commTask.map (ServerEvent.workerEvent fw)
 
     let ev ← IO.waitAny (clientTask :: workerTasks.toList)
@@ -984,13 +1005,16 @@
       | WorkerEvent.ioError e =>
         throwServerError s!"IO error while processing events for {fw.doc.uri}: {e}"
       | WorkerEvent.crashed _ =>
-        handleCrash fw.doc.uri #[]
+        handleCrash fw.doc.uri fw.queuedMsgs .fileWorkerToClientForwarding
         mainLoop clientTask
       | WorkerEvent.terminated =>
         throwServerError <| "Internal server error: got termination event for worker that " ++
          "should have been removed"
       | .importsChanged =>
+        let uri := fw.doc.uri
+        let queuedMsgs := fw.queuedMsgs
         startFileWorker fw.doc
+        tryDischargeQueuedMessages uri queuedMsgs
         mainLoop clientTask
 
 end MainLoop
diff --git a/src/lake/Lake/Util/Proc.lean b/src/lake/Lake/Util/Proc.lean
index ec4dd70bc44f..f40f70d92345 100644
--- a/src/lake/Lake/Util/Proc.lean
+++ b/src/lake/Lake/Util/Proc.lean
@@ -57,6 +57,7 @@ def testProc (args : IO.Process.SpawnArgs) : BaseIO Bool :=
   EIO.catchExceptions (h := fun _ => pure false) do
     let child ← IO.Process.spawn {
       args with
+      stdin := IO.Process.Stdio.null
       stdout := IO.Process.Stdio.null
       stderr := IO.Process.Stdio.null
     }