Skip to content

Commit

Permalink
feat_: add Sentry panic reporting (#6054)
Browse files Browse the repository at this point in the history
* feat_: report panics to sentry

* test_: sentry options, params and utils

* feat_: toggle sentry with centralized metrics

* test_: sentry init, report and close

* refactor_: rename public api to generic

* docs_: sentry

* fix_: typo in internal/sentry/README.md

Co-authored-by: osmaczko <33099791+osmaczko@users.noreply.github.com>

* fix_: linter

---------

Co-authored-by: osmaczko <33099791+osmaczko@users.noreply.github.com>
  • Loading branch information
igor-sirotin and osmaczko authored Nov 25, 2024
1 parent 474658b commit 987a9e8
Show file tree
Hide file tree
Showing 16 changed files with 741 additions and 7 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ cmd/status-backend/server/endpoints.go
protocol/messenger_handlers.go
internal/version/VERSION
internal/version/GIT_COMMIT
internal/sentry/SENTRY_CONTEXT_NAME
internal/sentry/SENTRY_CONTEXT_VERSION
internal/sentry/SENTRY_PRODUCTION

# Don't ignore generated files in the vendor/ directory
!vendor/**/*.pb.go
Expand Down
25 changes: 25 additions & 0 deletions api/geth_backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"github.com/status-im/status-go/eth-node/crypto"
"github.com/status-im/status-go/eth-node/types"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/internal/sentry"
"github.com/status-im/status-go/internal/version"
"github.com/status-im/status-go/logutils"
"github.com/status-im/status-go/multiaccounts"
Expand Down Expand Up @@ -101,6 +102,7 @@ type GethStatusBackend struct {
allowAllRPC bool // used only for tests, disables api method restrictions
LocalPairingStateManager *statecontrol.ProcessStateManager
centralizedMetrics *centralizedmetrics.MetricService
sentryDSN string

logger *zap.Logger
}
Expand Down Expand Up @@ -2119,6 +2121,7 @@ func (b *GethStatusBackend) startNode(config *params.NodeConfig) (err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("node crashed on start: %v", err)
sentry.RecoverError(err)
}
}()

Expand Down Expand Up @@ -2845,3 +2848,25 @@ func (b *GethStatusBackend) getWalletDBPath(keyUID string) (string, error) {

return filepath.Join(b.rootDataDir, fmt.Sprintf("%s-wallet.db", keyUID)), nil
}

func (b *GethStatusBackend) SetSentryDSN(dsn string) {
b.sentryDSN = dsn
}

func (b *GethStatusBackend) EnablePanicReporting() error {
return sentry.Init(
sentry.WithDSN(b.sentryDSN),
sentry.WithDefaultContext(),
)
}

func (b *GethStatusBackend) DisablePanicReporting() error {
return sentry.Close()
}

func (b *GethStatusBackend) TogglePanicReporting(enabled bool) error {
if enabled {
return b.EnablePanicReporting()
}
return b.DisablePanicReporting()
}
9 changes: 8 additions & 1 deletion cmd/status-backend/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ import (
"github.com/ethereum/go-ethereum/log"

"github.com/status-im/status-go/cmd/status-backend/server"
"github.com/status-im/status-go/internal/sentry"
"github.com/status-im/status-go/internal/version"
"github.com/status-im/status-go/logutils"
)

var (
address = flag.String("address", "", "host:port to listen")
address = flag.String("address", "127.0.0.1:0", "host:port to listen")
logger = log.New("package", "status-go/cmd/status-backend")
)

Expand All @@ -32,6 +33,12 @@ func init() {
}

func main() {
sentry.MustInit(
sentry.WithDefaultEnvironmentDSN(),
sentry.WithContext("status-backend", version.Version()),
)
defer sentry.Recover()

flag.Parse()

srv := server.NewServer()
Expand Down
15 changes: 12 additions & 3 deletions common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"go.uber.org/zap"

"github.com/status-im/status-go/eth-node/crypto"
"github.com/status-im/status-go/internal/sentry"
"github.com/status-im/status-go/logutils"
"github.com/status-im/status-go/protocol/identity/alias"
"github.com/status-im/status-go/protocol/protobuf"
Expand Down Expand Up @@ -89,8 +90,16 @@ func IsNil(i interface{}) bool {
}

func LogOnPanic() {
if err := recover(); err != nil {
logutils.ZapLogger().Error("panic in goroutine", zap.Any("error", err), zap.Stack("stacktrace"))
panic(err)
err := recover()
if err == nil {
return
}

logutils.ZapLogger().Error("panic in goroutine",
zap.Any("error", err),
zap.Stack("stacktrace"))

sentry.RecoverError(err)

panic(err)
}
110 changes: 110 additions & 0 deletions internal/sentry/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Description

This package encapsulates Sentry integration. So far:
- only for status-go (including when running as part of desktop and mobile)
- only for panics (no other error reporting)

Sentry is only enabled for users that **both**:
1. Opted-in for metrics
2. Use builds from our release CI

## 🛬 Where

We use self-hosted Sentry: https://sentry.infra.status.im/

## 🕐 When

### Which panics are reported:

- When running inside `status-desktop`/`status-mobile`:
- during API calls in `/mobile/status.go`
- inside all goroutines
- When running `status-backend`:
- any panic

### Which panics are NOT reported:

- When running inside `status-desktop`/`status-mobile`:
- during API calls in `/services/**/api.go` \
NOTE: These endpoints are executed through `go-ethereum`'s JSON-RPC server, which internally recovers all panics and doesn't provide any events or option to set an interceptor.
The only way to catch these panics is to replace the JSON-RPC server implementation.
- When running `status-go` unit tests:
- any panic \
NOTE: Go internally runs tests in a goroutine. The only way to catch these panics in tests is to manually `defer sentry.Recover()` in each test. This also requires a linter (similar to `lint-panics`) that checks this call is present. \
This is not a priority right now, because:
1. We have direct access to failed tests logs, which contain the panic stacktrace.
2. Panics are not expected to happen. Test must be passing to be able to merge the code. So it's only possible with a flaky test.
3. This would only apply to nightly/develop jobs, as we don't gather panic reports from PR-level jobs.


## 📦 What

Full list can be found in `sentry.Event`.

Notes regarding identity-disclosing properties:
- `ServerName` - completely removed from all events
- `Stacktrace`:
- No private user paths are exposed, as we only gather reports from CI-built binaries.
- `TraceID` - so far will be unique for each event
>Trace: A collection of spans representing the end-to-end journey of a request through your system that all share the same trace ID.
More details in [sentry docs](https://docs.sentry.io/product/explore/traces/#key-concepts).

# Configuration

There are 2 main tags to identify the error. The configuration is a bit complicated, but provides full information.

## Parameters

### `Environment`

| | |
|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Defining question | Where it is running? |
| Set time | - `production` can only be set at build time to prevent users from hacking the environment<br>- All others can be set at runtime, because on CI we sometimes use same build for multiple environments |
| Expected values | <table><thead><tr><th>Value</th><th>Description</th></tr></thead><tr><td>`production`</td><td>End user machine</td></tr><tr><td>~~`development`~~</td><td>Developer machine</td></tr><tr><td>~~`ci-pr`~~</td><td>PR-level CI runs</td></tr><tr><td>`ci-main`</td><td>CI runs for stable branch</td></tr><tr><td>`ci-nightly`</td><td>CI nightly jobs on stable branch</td></tr></table>`development` and `ci-pr` are dropped, because we only want to consider panics from stable code |

### `Context`


| | |
|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Defining question | What is the executable for the library? |
| Set time | Always at build-time |
| Expected values | <table><thead><tr><th>Value</th><th>Running as...</th></tr></thead><tr><td>`status-desktop`</td><td>Library embedded into [status-desktop](https://github.com/status-im/status-desktop)</td></tr><tr><td>`status-mobile`</td><td>Library embedded into [status-mobile](https://github.com/status-im/status-mobile)</td></tr><tr><td>`status-backend`</td><td>Part of `cmd/status-backend`<br>Can be other `cmd/*` as well.</td></tr><tr><td>`matterbridge`</td><td>Part of [Status/Discord bridge app](https://github.com/status-im/matterbridge)</td></tr><tr><td>`status-go-tests`</td><td>Inside status-go tests</td></tr></table> |


## Environment variables

To cover these requirements, I added these environment variables:

| Environment variable | Provide time | Description |
|---------------------------------------------------|---------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `SENTRY_DSN` | - At build time with direct call to `sentry.Init`<br>- At runtime with `InitializeApplication` endpoint | [Sentry DSN](https://docs.sentry.io/concepts/key-terms/dsn-explainer/) to be used |
| `SENTRY_CONTEXT_NAME`<br>`SENTRY_CONTEXT_VERSION` | Build time | Execution context of status-go |
| `SENTRY_PRODUCTION` | Build time | When `true` or `1`:<br>-Defines if this is a production build<br>-Sets environment to `production`<br>-Has precedence over runtime `SENTRY_ENVIRONMENT` |
| `SENTRY_ENVIRONMENT` | Run time | Sets the environment. Has no effect when `SENTRY_PRODUCTION` is set |

# Client integration

1. Set `SENTRY_CONTEXT_NAME` and `SENTRY_CONTEXT_VERSION` at status-go build time
2. Provide `sentryDSN` to the `InitializeApplication` call.
DSN must be kept private and will be provided by CI. Expect a `STATUS_GO_SENTRY_DSN` environment variable to be provided by CI.

<details>
<summary>Why can't we consume `STATUS_GO_SENTRY_DSN` directly in status-go build?</summary>

In theory, we could. But this would require us to mix approaches of getting the env variable to the code.
Right now we prefer `go:generate + go:embed` approach (e.g. https://github.com/status-im/status-go/pull/6014), but we can't do it in this case, because we must not write the DSN to a local file, which would be a bit vulnerable. And I don't want to go back to `-ldflags="-X github.com/status-im/status-go/internal/sentry.sentryDSN=$(STATUS_GO_SENTRY_DSN:v%=%)` approach.
</details>

# Implementation details

- We recover from panics in here:
https://github.com/status-im/status-go/blob/fcedb013166785e7def8710118086f4b650c33b1/common/utils.go#L102 https://github.com/status-im/status-go/blob/fcedb013166785e7def8710118086f4b650c33b1/mobile/callog/status_request_log.go#L69 https://github.com/status-im/status-go/blob/fcedb013166785e7def8710118086f4b650c33b1/cmd/status-backend/main.go#L40
This covers all goroutines, because we have a linter to check that all goroutines have `defer common.LogOnPanic`.
- Sentry is currently initialized in 2 places:
- `InitializeApplication` - covers desktop/mobile clients
https://github.com/status-im/status-go/blob/fcedb013166785e7def8710118086f4b650c33b1/mobile/status.go#L105-L108
- in `status-backend` - covers functional tests:
https://github.com/status-im/status-go/blob/fcedb013166785e7def8710118086f4b650c33b1/cmd/status-backend/main.go#L36-L39
46 changes: 46 additions & 0 deletions internal/sentry/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package sentry

import (
"os"

"github.com/getsentry/sentry-go"
)

const DefaultEnvVarDSN = "SENTRY_DSN_STATUS_GO"
const DefaultEnvVarEnvironment = "SENTRY_ENVIRONMENT"

type Option func(*sentry.ClientOptions)

func WithDSN(dsn string) Option {
return func(o *sentry.ClientOptions) {
o.Dsn = dsn
}
}

func WithEnvironmentDSN(name string) Option {
return WithDSN(os.Getenv(name))
}

func WithDefaultEnvironmentDSN() Option {
return WithEnvironmentDSN(DefaultEnvVarDSN)
}

func WithContext(name string, version string) Option {
return func(o *sentry.ClientOptions) {
if o.Tags == nil {
o.Tags = make(map[string]string)
}
o.Tags["context.name"] = name
o.Tags["context.version"] = version
}
}

func WithDefaultContext() Option {
return WithContext(DefaultContext(), DefaultContextVersion())
}

func applyOptions(cfg *sentry.ClientOptions, opts ...Option) {
for _, opt := range opts {
opt(cfg)
}
}
70 changes: 70 additions & 0 deletions internal/sentry/options_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package sentry

import (
"os"
"testing"

"github.com/brianvoe/gofakeit/v6"
"github.com/getsentry/sentry-go"
"github.com/stretchr/testify/require"
)

func TestWithDSN(t *testing.T) {
t.Parallel()

dsn := "https://examplePublicKey@o0.ingest.sentry.io/0"
option := WithDSN(dsn)
cfg := &sentry.ClientOptions{}
option(cfg)
require.Equal(t, dsn, cfg.Dsn)
}

func TestWithEnvironmentDSN(t *testing.T) {
t.Parallel()

expectedDSN := gofakeit.LetterN(10)
envVar := gofakeit.LetterN(10)

err := os.Setenv(envVar, expectedDSN)
require.NoError(t, err)
t.Cleanup(func() {
err = os.Unsetenv(envVar)
require.NoError(t, err)
})

option := WithEnvironmentDSN(envVar)
cfg := &sentry.ClientOptions{}
option(cfg)
require.Equal(t, expectedDSN, cfg.Dsn)
}

func TestWithContext(t *testing.T) {
t.Parallel()

name := "test-context"
version := "v1.0.0"
option := WithContext(name, version)
cfg := &sentry.ClientOptions{}
option(cfg)
require.Equal(t, name, cfg.Tags["context.name"])
require.Equal(t, version, cfg.Tags["context.version"])
}

func TestApplyOptions(t *testing.T) {
t.Parallel()

dsn := gofakeit.LetterN(10)
name := gofakeit.LetterN(10)
version := gofakeit.LetterN(5)

options := []Option{
WithDSN(dsn),
WithContext(name, version),
}
cfg := &sentry.ClientOptions{}
applyOptions(cfg, options...)

require.Equal(t, dsn, cfg.Dsn)
require.Equal(t, name, cfg.Tags["context.name"])
require.Equal(t, version, cfg.Tags["context.version"])
}
51 changes: 51 additions & 0 deletions internal/sentry/params.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package sentry

import (
_ "embed"
"os"
)

//go:generate sh -c "echo $SENTRY_CONTEXT_NAME > SENTRY_CONTEXT_NAME"
//go:generate sh -c "echo $SENTRY_CONTEXT_VERSION > SENTRY_CONTEXT_VERSION"
//go:generate sh -c "echo $SENTRY_PRODUCTION > SENTRY_PRODUCTION"

const productionEnvironment = "production"

var (
//go:embed SENTRY_CONTEXT_NAME
defaultContextName string

//go:embed SENTRY_CONTEXT_VERSION
defaultContextVersion string

//go:embed SENTRY_PRODUCTION
production string
)

func DefaultContext() string {
return defaultContextName
}

func DefaultContextVersion() string {
return defaultContextVersion
}

func Production() bool {
return production == "true" || production == "1"
}

func Environment() string {
return environment(Production(), DefaultEnvVarEnvironment)
}

func environment(production bool, envvar string) string {
if production {
return productionEnvironment
}
env := os.Getenv(envvar)
if env == productionEnvironment {
// Production environment can only be set during build
return ""
}
return env
}
Loading

0 comments on commit 987a9e8

Please sign in to comment.