From 241ee203ebe0c1fda35c10cc3c6e469ff91b60f5 Mon Sep 17 00:00:00 2001 From: Paulo Janotti Date: Tue, 12 Dec 2023 09:29:51 -0800 Subject: [PATCH] Fix detection if the collector is running as service on Windows (#9042) **Description:** Fix run as a service detection on Windows. Instead of trying to detect if it is a service or not, for which both `svc.IsAnInteractiveSession()` and `svc.IsWindowsService()` are somehow broken, try to run as a Windows service, if that fails fallback to run as an interactive process. This follows a recommendation of the Windows [service API documentation](https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-startservicectrldispatchera#return-value). The new code calls `svc.Run` and in case of error checks for `windows.ERROR_FAILED_SERVICE_CONTROLLER_CONNECT`. If this is the error the application can safely assume that it is not running as a service. The duration of a call to `svc.Run` failing with this error was below 3 microseconds in the current GH runner and below 5 microseconds on my box. While this value seems fine for startup I'm keeping the `NO_WINDOWS_SERVICE` option instead of deprecating it (it doesn't seem worth the trouble of deprecating it). **Link to tracking Issue:** Fix #7350 **Testing:** Fix tested on the Splunk fork that deploys the collector as a service and as an interactive process on Windows containers. **Documentation:** Added changelog. --- .../fix-running-as-service-detection.yaml | 25 ++++++++++++ .../builder/templates/main_windows.go.tmpl | 38 +++++-------------- cmd/otelcorecol/main_windows.go | 38 +++++-------------- docs/troubleshooting.md | 2 +- 4 files changed, 46 insertions(+), 57 deletions(-) create mode 100644 .chloggen/fix-running-as-service-detection.yaml diff --git a/.chloggen/fix-running-as-service-detection.yaml b/.chloggen/fix-running-as-service-detection.yaml new file mode 100644 index 00000000000..d5f3909e6c2 --- /dev/null +++ b/.chloggen/fix-running-as-service-detection.yaml @@ -0,0 +1,25 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver) +component: cmd/otelcorecol + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Fix the code detecting if the collector is running as a service on Windows. + +# One or more tracking issues or pull requests related to the change +issues: [7350] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: Removed the `NO_WINDOWS_SERVICE` environment variable given it is not needed anymore. + +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] \ No newline at end of file diff --git a/cmd/builder/internal/builder/templates/main_windows.go.tmpl b/cmd/builder/internal/builder/templates/main_windows.go.tmpl index ba327180578..3e001b250bf 100644 --- a/cmd/builder/internal/builder/templates/main_windows.go.tmpl +++ b/cmd/builder/internal/builder/templates/main_windows.go.tmpl @@ -6,41 +6,23 @@ package main import ( + "errors" "fmt" - "os" + "golang.org/x/sys/windows" "golang.org/x/sys/windows/svc" "go.opentelemetry.io/collector/otelcol" ) func run(params otelcol.CollectorSettings) error { - if useInteractiveMode, err := checkUseInteractiveMode(); err != nil { - return err - } else if useInteractiveMode { - return runInteractive(params) - } else { - return runService(params) - } -} - -func checkUseInteractiveMode() (bool, error) { - // If environment variable NO_WINDOWS_SERVICE is set with any value other - // than 0, use interactive mode instead of running as a service. This should - // be set in case running as a service is not possible or desired even - // though the current session is not detected to be interactive - if value, present := os.LookupEnv("NO_WINDOWS_SERVICE"); present && value != "0" { - return true, nil - } - - isInteractiveSession, err := svc.IsAnInteractiveSession() - if err != nil { - return false, fmt.Errorf("failed to determine if we are running in an interactive session: %w", err) - } - return isInteractiveSession, nil -} - -func runService(params otelcol.CollectorSettings) error { - // do not need to supply service name when startup is invoked through Service Control Manager directly + // No need to supply service name when startup is invoked through + // the Service Control Manager directly. if err := svc.Run("", otelcol.NewSvcHandler(params)); err != nil { + if errors.Is(err, windows.ERROR_FAILED_SERVICE_CONTROLLER_CONNECT) { + // Per https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-startservicectrldispatchera#return-value + // this means that the process is not running as a service, so run interactively. + return runInteractive(params) + } + return fmt.Errorf("failed to start collector server: %w", err) } diff --git a/cmd/otelcorecol/main_windows.go b/cmd/otelcorecol/main_windows.go index ad96fd1e801..285aab81289 100644 --- a/cmd/otelcorecol/main_windows.go +++ b/cmd/otelcorecol/main_windows.go @@ -6,43 +6,25 @@ package main import ( + "errors" "fmt" - "os" + "golang.org/x/sys/windows" "golang.org/x/sys/windows/svc" "go.opentelemetry.io/collector/otelcol" ) func run(params otelcol.CollectorSettings) error { - if useInteractiveMode, err := checkUseInteractiveMode(); err != nil { - return err - } else if useInteractiveMode { - return runInteractive(params) - } else { - return runService(params) - } -} - -func checkUseInteractiveMode() (bool, error) { - // If environment variable NO_WINDOWS_SERVICE is set with any value other - // than 0, use interactive mode instead of running as a service. This should - // be set in case running as a service is not possible or desired even - // though the current session is not detected to be interactive - if value, present := os.LookupEnv("NO_WINDOWS_SERVICE"); present && value != "0" { - return true, nil - } - - isInteractiveSession, err := svc.IsAnInteractiveSession() - if err != nil { - return false, fmt.Errorf("failed to determine if we are running in an interactive session: %w", err) - } - return isInteractiveSession, nil -} - -func runService(params otelcol.CollectorSettings) error { - // do not need to supply service name when startup is invoked through Service Control Manager directly + // No need to supply service name when startup is invoked through + // the Service Control Manager directly. if err := svc.Run("", otelcol.NewSvcHandler(params)); err != nil { + if errors.Is(err, windows.ERROR_FAILED_SERVICE_CONTROLLER_CONNECT) { + // Per https://learn.microsoft.com/en-us/windows/win32/api/winsvc/nf-winsvc-startservicectrldispatchera#return-value + // this means that the process is not running as a service, so run interactively. + return runInteractive(params) + } + return fmt.Errorf("failed to start collector server: %w", err) } diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 24e20bdfe8b..21caec042fc 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -283,7 +283,7 @@ configuration issue. This could be due to a firewall, DNS, or proxy issue. Note that the Collector does have [proxy support](https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter#proxy-support). -### Startup failing in Windows Docker containers +### Startup failing in Windows Docker containers (v0.90.1 and earlier) The process may fail to start in a Windows Docker container with the following error: `The service process could not connect to the service controller`. In