diff --git a/cmd/exporter/init.go b/cmd/exporter/init.go new file mode 100644 index 00000000..c5f60e40 --- /dev/null +++ b/cmd/exporter/init.go @@ -0,0 +1,24 @@ +package main + +import ( + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/nlconntrack" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/nlqdisc" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procfd" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procio" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procipvs" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procnetdev" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procnetstat" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsched" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsnmp" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsock" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsoftnet" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/proctcpsummary" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracebiolatency" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracekernel" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetiftxlatency" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetsoftirq" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracepacketloss" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracesocketlatency" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracetcpreset" + _ "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracevirtcmdlat" +) diff --git a/deploy/config.yaml b/deploy/config.yaml index 75d287d7..44356223 100644 --- a/deploy/config.yaml +++ b/deploy/config.yaml @@ -1,16 +1,30 @@ -debugmode: false -metric_config: - interval: 15 - port: 9102 +debugmode: true +port: 9102 +metrics: probes: - - netdev - - io - - sock - - tcpsummary - - tcp - - tcpext - - udp -event_config: - port: 19102 - loki_enable: false - loki_address: loki-service + - name: conntrack + - name: qdisc + - name: netdev + - name: io + - 
name: sock + - name: tcpsummary + - name: tcp + - name: tcpext + - name: udp + - name: kernellatency + - name: packetloss +event: + probes: + - name: biolatency + - name: kernellatency + - name: packetloss + - name: tcpreset + sinks: + - name: stderr + - name: file + args: + path: /tmp/exporter.json + - name: loki + args: + addr: 127.0.0.1:3100 + diff --git a/deploy/skoopbundle.yaml b/deploy/skoopbundle.yaml index be6c9f4f..f98a14e4 100644 --- a/deploy/skoopbundle.yaml +++ b/deploy/skoopbundle.yaml @@ -117,28 +117,28 @@ apiVersion: v1 data: config.yaml: |- debugmode: true - metric_config: - interval: 15 - port: 9102 + port: 9102 + metrics: probes: - - netdev - - io - - socketlatency - - packetloss - - sock - - tcpsummary - - tcp - - tcpext - - udp - - net_softirq - - virtcmdlatency - event_config: - port: 19102 - loki_enable: true - loki_address: loki-service + - name: netdev + - name: io + - name: socketlatency + - name: packetloss + - name: sock + - name: tcpsummary + - name: tcp + - name: tcpext + - name: udp + - name: packetloss + event: probes: - - tcpreset - - packetloss + - name: packetloss + - name: tcpreset + sinks: + - name: stderr + - name: loki + args: + addr: loki-service kind: ConfigMap metadata: name: kubeskoop-config diff --git a/go.mod b/go.mod index 897d279e..017ac40c 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/alibaba/kubeskoop go 1.19 require ( + github.com/afiskon/promtail-client v0.0.0-20190305142237-506f3f921e9c github.com/alibabacloud-go/darabonba-openapi v0.2.1 github.com/alibabacloud-go/darabonba-openapi/v2 v2.0.2 github.com/alibabacloud-go/ecs-20140526/v2 v2.1.3 @@ -17,7 +18,6 @@ require ( github.com/fsnotify/fsnotify v1.6.0 github.com/golang/snappy v0.0.4 github.com/google/gops v0.3.26 - github.com/google/uuid v1.3.0 github.com/gorilla/mux v1.8.0 github.com/hashicorp/golang-lru/v2 v2.0.6 github.com/mdlayher/netlink v1.7.1 @@ -29,7 +29,6 @@ require ( github.com/projectcalico/api v0.0.0-20220722155641-439a754a988b 
github.com/prometheus/client_golang v1.15.1 github.com/prometheus/procfs v0.9.0 - github.com/pterm/pterm v0.12.54 github.com/samber/lo v1.37.0 github.com/sirupsen/logrus v1.9.0 github.com/spf13/cobra v1.6.1 @@ -55,8 +54,6 @@ require ( ) require ( - atomicgo.dev/cursor v0.1.1 // indirect - atomicgo.dev/keyboard v0.2.9 // indirect cdr.dev/slog v1.4.2-0.20221206192828-e4803b10ae17 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.5.2 // indirect @@ -78,7 +75,6 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/clbanning/mxj/v2 v2.5.6 // indirect github.com/containerd/cgroups v1.0.4 // indirect - github.com/containerd/console v1.0.3 // indirect github.com/containerd/continuity v0.3.0 // indirect github.com/containerd/fifo v1.0.0 // indirect github.com/containerd/go-cni v1.1.6 // indirect @@ -117,7 +113,7 @@ require ( github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect - github.com/gookit/color v1.5.2 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect @@ -126,13 +122,11 @@ require ( github.com/josharian/native v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.11.13 // indirect - github.com/lithammer/fuzzysearch v1.1.5 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.12 // indirect github.com/mattn/go-isatty v0.0.14 // indirect - github.com/mattn/go-runewidth v0.0.14 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/mazznoer/csscolorparser v0.1.3 // indirect github.com/mdlayher/socket v0.4.0 
// indirect @@ -165,7 +159,6 @@ require ( github.com/subosito/gotenv v1.4.2 // indirect github.com/tchap/go-patricia v2.2.6+incompatible // indirect github.com/tjfoc/gmsm v1.3.2 // indirect - github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778 // indirect github.com/yuin/goldmark v1.5.3 // indirect go.etcd.io/bbolt v1.3.6 // indirect go.mozilla.org/pkcs7 v0.0.0-20200128120323-432b2356ecb1 // indirect diff --git a/go.sum b/go.sum index 1f962d4d..c9efec63 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,3 @@ -atomicgo.dev/assert v0.0.2 h1:FiKeMiZSgRrZsPo9qn/7vmr7mCsh5SZyXY4YGYiYwrg= -atomicgo.dev/cursor v0.1.1 h1:0t9sxQomCTRh5ug+hAMCs59x/UmC9QL6Ci5uosINKD4= -atomicgo.dev/cursor v0.1.1/go.mod h1:Lr4ZJB3U7DfPPOkbH7/6TOtJ4vFGHlgj1nc+n900IpU= -atomicgo.dev/keyboard v0.2.9 h1:tOsIid3nlPLZ3lwgG8KZMp/SFmr7P0ssEN5JUsm78K8= -atomicgo.dev/keyboard v0.2.9/go.mod h1:BC4w9g00XkxH/f1HXhW2sXmJFOCWbKn9xrOunSFtExQ= bazil.org/fuse v0.0.0-20160811212531-371fbbdaa898/go.mod h1:Xbm+BRKSBEpa4q4hTSxohYNQpsxXPbPry4JJWOB3LB8= bazil.org/fuse v0.0.0-20200407214033-5883e5a4b512/go.mod h1:FbcW6z/2VytnFDhZfumh8Ss8zxHE6qpMP5sHTRe0EaM= cdr.dev/slog v1.4.2-0.20221206192828-e4803b10ae17 h1:Jf+VOk2lif79HeTlnLaZ70zYTsuVSUEu/47U9VaG2Rw= @@ -76,14 +71,6 @@ github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZ github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/MarvinJWendt/testza v0.1.0/go.mod h1:7AxNvlfeHP7Z/hDQ5JtE3OKYT3XFUeLCDE2DQninSqs= -github.com/MarvinJWendt/testza v0.2.1/go.mod h1:God7bhG8n6uQxwdScay+gjm9/LnO4D3kkcZX4hv9Rp8= -github.com/MarvinJWendt/testza v0.2.8/go.mod h1:nwIcjmr0Zz+Rcwfh3/4UhBp7ePKVhuBExvZqnKYWlII= -github.com/MarvinJWendt/testza v0.2.10/go.mod h1:pd+VWsoGUiFtq+hRKSU1Bktnn+DMCSrDrXDpX2bG66k= 
-github.com/MarvinJWendt/testza v0.2.12/go.mod h1:JOIegYyV7rX+7VZ9r77L/eH6CfJHHzXjB69adAhzZkI= -github.com/MarvinJWendt/testza v0.3.0/go.mod h1:eFcL4I0idjtIx8P9C6KkAuLgATNKpX4/2oUqKc6bF2c= -github.com/MarvinJWendt/testza v0.4.2/go.mod h1:mSdhXiKH8sg/gQehJ63bINcCKp7RtYewEjXsvsVUPbE= -github.com/MarvinJWendt/testza v0.5.1 h1:a9Fqx6vQrHQ4CyiaLhktfTTelwGotmFWy8MNhyaohw8= github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA= github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw= @@ -120,6 +107,8 @@ github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbt github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= +github.com/afiskon/promtail-client v0.0.0-20190305142237-506f3f921e9c h1:AMDVOKGaiqse4qiRXSzRgpC9DCNTHCx6zpzdtXXrKM4= +github.com/afiskon/promtail-client v0.0.0-20190305142237-506f3f921e9c/go.mod h1:p/7Wos+jcfrnwLqqzJMZ0s323kfVtJPW+HUvAANklVQ= github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b h1:slYM766cy2nI3BwyRiyQj/Ud48djTMtMebDqepE95rw= github.com/alecthomas/assert/v2 v2.2.1 h1:XivOgYcduV98QCahG8T5XTezV5bylXe+lBxLG2K2ink= github.com/alecthomas/chroma v0.10.0 h1:7XDcGkCQopCNKjZHfYrNLraA+M7e0fMiJ/Mfikbfjek= @@ -183,7 +172,6 @@ github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmV github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= 
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= -github.com/atomicgo/cursor v0.0.1/go.mod h1:cBON2QmmrysudxNBFthvMtN32r3jxVRIvzkUiF/RuIk= github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0= github.com/bastjan/netstat v1.0.0 h1:enyzPg7lNaOpdKdDHkyPdP+okVKdBgR9/YFnxku7IlE= github.com/bastjan/netstat v1.0.0/go.mod h1:gqJ1/1N3vzrMLk3bMSY2i9xjXe8dzfCVZGaIF19pvdo= @@ -271,7 +259,6 @@ github.com/containerd/console v0.0.0-20181022165439-0650fd9eeb50/go.mod h1:Tj/on github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e/go.mod h1:8Pf4gM6VEbTNRIT26AyyU7hxdQU3MvAvxVI0sc00XBE= github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= github.com/containerd/console v1.0.2/go.mod h1:ytZPjGgY2oeTkAONYafi2kSj0aYggsf8acV1PGKCbzQ= -github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= github.com/containerd/containerd v1.2.10/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= github.com/containerd/containerd v1.3.0-beta.2.0.20190828155532-0293cbd26c69/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= @@ -645,10 +632,6 @@ github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3i github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2cUuW7uA/OeU= github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= -github.com/gookit/color v1.4.2/go.mod h1:fqRyamkC1W8uxl+lxCQxOT09l/vYfZ+QeiX3rKQHCoQ= -github.com/gookit/color v1.5.0/go.mod h1:43aQb+Zerm/BWh2GnrgOQm7ffz7tvQXEKV6BFMl7wAo= -github.com/gookit/color v1.5.2 h1:uLnfXcaFjlrDnQDT+NCBcfhrXqYTx/rcCa6xn01Y8yI= -github.com/gookit/color 
v1.5.2/go.mod h1:w8h4bGiHeeBpvQVePTutdbERIUf3oJE5lZ8HM0UgXyg= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/handlers v0.0.0-20150720190736-60c7bfde3e33/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= @@ -744,10 +727,6 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/compress v1.11.13 h1:eSvu8Tmq6j2psUJqJrLcWH6K3w5Dwc+qipbaA6eVEN4= github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.0.10/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= -github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= -github.com/klauspost/cpuid/v2 v2.2.0 h1:4ZexSFt8agMNzNisrsilL6RClWDC5YJnLHNIfTy4iuc= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -764,8 +743,6 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/linuxkit/virtsock v0.0.0-20201010232012-f8cee7dfc7a3/go.mod h1:3r6x7q95whyfWQpmGZTu3gk3v2YkMi05HEzl7Tf7YEo= -github.com/lithammer/fuzzysearch v1.1.5 h1:Ag7aKU08wp0R9QCfF4GoGST9HbmAIeLP7xwMrOBEp1c= -github.com/lithammer/fuzzysearch 
v1.1.5/go.mod h1:1R1LRNk7yKid1BaQkmuLQaHruxcC4HmAH30Dh61Ih1Q= github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= @@ -791,9 +768,6 @@ github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Ky github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= -github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= -github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= github.com/mattn/go-shellwords v1.0.6/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= github.com/mattn/go-shellwords v1.0.12/go.mod h1:EZzvwXDESEeg03EKmM+RmDnNOPKG4lLtQsUlTZDWQ8Y= @@ -1002,16 +976,6 @@ github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1 github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= -github.com/pterm/pterm v0.12.27/go.mod h1:PhQ89w4i95rhgE+xedAoqous6K9X+r6aSOI2eFF7DZI= -github.com/pterm/pterm v0.12.29/go.mod h1:WI3qxgvoQFFGKGjGnJR849gU0TsEOvKn5Q8LlY1U7lg= -github.com/pterm/pterm v0.12.30/go.mod h1:MOqLIyMOgmTDz9yorcYbcw+HsgoZo3BQfg2wtl3HEFE= -github.com/pterm/pterm v0.12.31/go.mod 
h1:32ZAWZVXD7ZfG0s8qqHXePte42kdz8ECtRyEejaWgXU= -github.com/pterm/pterm v0.12.33/go.mod h1:x+h2uL+n7CP/rel9+bImHD5lF3nM9vJj80k9ybiiTTE= -github.com/pterm/pterm v0.12.36/go.mod h1:NjiL09hFhT/vWjQHSj1athJpx6H8cjpHXNAK5bUw8T8= -github.com/pterm/pterm v0.12.40/go.mod h1:ffwPLwlbXxP+rxT0GsgDTzS3y3rmpAO1NMjUkGTYf8s= -github.com/pterm/pterm v0.12.54 h1:7DX218ZhG2v3NsvmvsHeTJHC92zUyK9Bgeqbu0x4mG0= -github.com/pterm/pterm v0.12.54/go.mod h1:x6HvVq6rUC/Ik2u3MxMgS6kIx2Mlj1qLq5xquul2TWs= -github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.3 h1:utMvzDsuh3suAEnhH0RdHmoPbU648o6CvXxTx4SBMOw= github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= @@ -1034,8 +998,6 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= -github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= -github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shirou/gopsutil/v3 v3.22.10/go.mod h1:QNza6r4YQoydyCfo6rH0blGfKahgibh4dQmV5xdFkQk= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.0.4-0.20170822132746-89742aefa4b2/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= @@ -1146,8 +1108,6 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1: github.com/xeipuuv/gojsonschema v0.0.0-20180618132009-1d523034197f/go.mod 
h1:5yf86TLmAcydyeJq5YvxkGPE2fm/u4myDekKRoLuqhs= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xlab/treeprint v1.1.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= -github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778 h1:QldyIu/L63oPpyvQmHgvgickp1Yw510KJOqX7H24mg8= -github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778/go.mod h1:2MuV+tbUrU1zIOPMxZ5EncGwgmMJsa+9ucAQZXxsObs= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -1478,12 +1438,10 @@ golang.org/x/sys v0.0.0-20210831042530-f4d43177bf5e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210903071746-97244b99971b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211013075003-97ac67df715c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220319134239-a9b59b0215f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/pkg/exporter/cmd/diag.go b/pkg/exporter/cmd/diag.go deleted file mode 100644 index b8cd18f9..00000000 --- a/pkg/exporter/cmd/diag.go +++ /dev/null @@ -1,23 +0,0 @@ -package cmd - -import ( - "github.com/spf13/cobra" -) - -// diagCmd represents the diag command -var ( - diagCmd = &cobra.Command{ - Use: "diag", - Short: "Run command in the command line to probe metrics and events.", - Run: func(cmd *cobra.Command, args []string) { - cmd.Help() // nolint - }, - } - - podname string -) - -func init() { - rootCmd.AddCommand(diagCmd) - diagCmd.PersistentFlags().StringVarP(&podname, "pod", "i", "", "specified pod") -} diff --git a/pkg/exporter/cmd/diag_event.go b/pkg/exporter/cmd/diag_event.go deleted file mode 100644 index 953cbff1..00000000 --- a/pkg/exporter/cmd/diag_event.go +++ /dev/null @@ -1,77 +0,0 @@ -/* -Copyright © 2022 NAME HERE -*/ -package cmd - -import ( - "os" - "os/signal" - "time" - - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - - "github.com/pterm/pterm" - "github.com/spf13/cobra" - "golang.org/x/exp/slog" -) - -var ( - diagEventCmd = &cobra.Command{ - Use: "event", - Short: "diagnose specific event probe", - Run: func(cmd *cobra.Command, args []string) { - if len(probeName) == 0 { - // nolint - cmd.Help() - return - } - - err := nettop.SyncNetTopology() - if err != nil { - slog.Ctx(cmd.Context()).Warn("sync nettop failed", "err", err) - return - } - // nolint - go nettop.StartCache(cmd.Context()) - defer nettop.StopCache() - - for _, p := range probeName { - pb := probe.GetEventProbe(p) - if pb == nil { - slog.Ctx(cmd.Context()).Info("ignore unsupported probe", "probe", p) - continue - } - - ch := make(chan proto.RawEvent) - if err := 
pb.Register(ch); err != nil { - slog.Ctx(cmd.Context()).Info("register failed", "err", err, "probe", p) - continue - } - - go pb.Start(cmd.Context(), "") - go func() { - for evt := range ch { - ets, err := nettop.GetEntityByNetns(int(evt.Netns)) - if err != nil && ets == nil { - slog.Ctx(cmd.Context()).Info("ignore event", "err", err, "netns", evt.Netns) - continue - } - pterm.Info.Printf("%s %s %s %s\n", time.Now().Format(time.Stamp), evt.EventType, ets.GetPodName(), evt.EventBody) - } - }() - } - - c := make(chan os.Signal, 1) - signal.Notify(c, os.Interrupt) - - <-c - }, - } -) - -func init() { - diagCmd.AddCommand(diagEventCmd) - diagEventCmd.PersistentFlags().StringSliceVarP(&probeName, "probe", "p", []string{}, "probe name to diag") -} diff --git a/pkg/exporter/cmd/diag_metric.go b/pkg/exporter/cmd/diag_metric.go deleted file mode 100644 index 4ae7e028..00000000 --- a/pkg/exporter/cmd/diag_metric.go +++ /dev/null @@ -1,79 +0,0 @@ -package cmd - -import ( - "context" - "fmt" - "strings" - - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/alibaba/kubeskoop/pkg/exporter/probe" - - "github.com/pterm/pterm" - "github.com/spf13/cobra" - "golang.org/x/exp/slog" -) - -// metricCmd represents the tcp command -var ( - metricCmd = &cobra.Command{ - Use: "metric", - Short: "get metric data in cli", - Run: func(cmd *cobra.Command, args []string) { - if len(probeName) == 0 { - cmd.Help() // nolint - return - } - ctx := slog.NewContext(context.Background(), slog.Default()) - err := nettop.SyncNetTopology() - if err != nil { - slog.Ctx(ctx).Info("sync nettop", "err", err) - return - } - texts := pterm.TableData{ - {"METRIC", "VALUE", "NETNS", "POD", "NAMESPACE", "PROBE"}, - } - for _, p := range probeName { - data, err := probe.CollectOnce(ctx, p) - if err != nil && data == nil { - slog.Ctx(ctx).Info("collect metric", "err", err) - continue - } - for m, d := range data { - slog.Ctx(ctx).Debug("raw metric msg", "metric", m, "data", d) - // if a probe provide 
multi subject, only fetch relevant metric data - if !strings.HasPrefix(m, p) { - continue - } - for nsinum, v := range d { - et, err := nettop.GetEntityByNetns(int(nsinum)) - if err != nil { - slog.Ctx(ctx).Info("get entity failed", "netns", nsinum, "err", err) - continue - } - texts = append(texts, []string{ - m, - fmt.Sprintf("%d", v), - fmt.Sprintf("%d", nsinum), - et.GetPodName(), - et.GetPodNamespace(), - p, - }) - } - - } - } - pterm.DefaultTable.WithHasHeader().WithData(texts).Render() // nolint - - }, - } - - probeName []string - metricName []string -) - -func init() { - diagCmd.AddCommand(metricCmd) - - metricCmd.PersistentFlags().StringSliceVarP(&probeName, "probe", "p", []string{}, "probe name to diag") - metricCmd.PersistentFlags().StringSliceVarP(&metricName, "metric", "m", []string{}, "metric name to diag") -} diff --git a/pkg/exporter/cmd/eventserver.go b/pkg/exporter/cmd/eventserver.go index 7c79123d..1e5fb689 100644 --- a/pkg/exporter/cmd/eventserver.go +++ b/pkg/exporter/cmd/eventserver.go @@ -2,291 +2,78 @@ package cmd import ( "context" - "fmt" - "github.com/samber/lo" - - "sync" - "time" - - lokiwrapper "github.com/alibaba/kubeskoop/pkg/exporter/loki" - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" "github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - - "github.com/google/uuid" - "golang.org/x/exp/slog" - "google.golang.org/grpc/peer" + "github.com/alibaba/kubeskoop/pkg/exporter/sink" + log "github.com/sirupsen/logrus" ) -type EServer struct { - proto.UnimplementedInspectorServer - probes map[string]proto.EventProbe - subscribers map[string]chan<- proto.RawEvent - mtx sync.Mutex - ctx context.Context - control chan struct{} - config EventConfig - eventChan chan proto.RawEvent - lokiDatach chan proto.RawEvent - lokiIngester *lokiwrapper.Ingester +type EventServer struct { + *DynamicProbeServer[probe.EventProbe] } -func NewEServer(ctx context.Context, config EventConfig) *EServer { - es := 
&EServer{ - probes: make(map[string]proto.EventProbe), - subscribers: make(map[string]chan<- proto.RawEvent), - config: config, - mtx: sync.Mutex{}, - ctx: ctx, - control: make(chan struct{}), - eventChan: make(chan proto.RawEvent), +func NewEventServer(sinks []sink.Sink) (*EventServer, error) { + probeManager := &EventProbeManager{ + sinks: sinks, + sinkChan: make(chan *probe.Event), + done: make(chan struct{}), } - for _, p := range config.Probes { - ep := probe.GetEventProbe(p) - if ep == nil { - slog.Ctx(ctx).Info("get event probe nil", "probe", p) - continue - } - es.probes[p] = ep - err := ep.Register(es.eventChan) - if err != nil { - slog.Ctx(ctx).Warn("probe register failed", "probe", p) - continue - } - go ep.Start(ctx, proto.ProbeTypeEvent) - slog.Ctx(ctx).Debug("eserver start", "subject", p) - } - - // start cache loop - slog.Ctx(ctx).Debug("new eserver start dispatch loop") - go es.dispatcher(ctx, es.control) - - err := es.enableLoki() - if err != nil { - slog.Ctx(ctx).Warn("enable loki failed", "err", err) - } - return es + return &EventServer{ + DynamicProbeServer: NewDynamicProbeServer[probe.EventProbe](probeManager), + }, nil } -func (e *EServer) enableLoki() error { - if e.lokiIngester != nil { - return nil - } - - // handle grafana loki ingester preparation - if e.config.LokiEnable && e.config.LokiAddress != "" { - slog.Ctx(e.ctx).Debug("enabling loki ingester", "address", e.config.LokiAddress) - datach := make(chan proto.RawEvent) - ingester, err := lokiwrapper.NewLokiIngester(e.ctx, e.config.LokiAddress, nettop.GetNodeName()) - if err != nil { - slog.Ctx(e.ctx).Info("new loki ingester", "err", err, "client", ingester.Name()) - } else { - e.subscribe(ingester.Name(), datach) - go ingester.Watch(e.ctx, datach) - } - e.lokiDatach = datach - e.lokiIngester = ingester - } - - return nil +func (s *EventServer) Start(ctx context.Context, probeConfig []ProbeConfig) error { + go s.probeManager.(*EventProbeManager).start() + return 
s.DynamicProbeServer.Start(ctx, probeConfig) } -func (e *EServer) disableLoki() error { - if e.lokiIngester == nil { - return nil - } - - slog.Ctx(e.ctx).Debug("disabling loki ingester") - e.unsubscribe(e.lokiIngester.Name()) - - err := e.lokiIngester.Close() - if err != nil { +func (s *EventServer) Stop(ctx context.Context) error { + if err := s.DynamicProbeServer.Stop(ctx); err != nil { return err } - - close(e.lokiDatach) - e.lokiIngester = nil - e.lokiDatach = nil + s.probeManager.(*EventProbeManager).stop() return nil } -func (e *EServer) Reload(config EventConfig) error { - enabled := lo.Keys(e.probes) - toClose, toStart := lo.Difference(enabled, config.Probes) - slog.Ctx(e.ctx).Info("reload event probes", "close", toClose, "enable", toStart) - - for _, n := range toClose { - p, ok := e.probes[n] - if !ok { - slog.Ctx(e.ctx).Warn("probe not found in enabled probes, skip.", "probe", n) - continue - } - - err := p.Close(proto.ProbeTypeEvent) - if err != nil { - slog.Ctx(e.ctx).Warn("close probe error", "probe", n, "err", err) - continue - } - - // clear event channel - err = p.Register(nil) - if err != nil { - slog.Ctx(e.ctx).Warn("unregister probe error", "probe", n, "err", err) - continue - } - - delete(e.probes, n) - } - - for _, n := range toStart { - p := probe.GetEventProbe(n) - if p == nil { - slog.Ctx(e.ctx).Info("get event probe nil", "probe", p) - continue - } - e.probes[n] = p - go p.Start(e.ctx, proto.ProbeTypeEvent) - slog.Ctx(e.ctx).Debug("eserver start", "subject", p) - - err := p.Register(e.eventChan) - if err != nil { - slog.Ctx(e.ctx).Info("register receiver", "probe", p, "err", err) - continue - } - } - - e.config = config - - if config.LokiEnable { - if err := e.enableLoki(); err != nil { - slog.Ctx(e.ctx).Warn("enable loki error", "err", err) - } - } else { - if err := e.disableLoki(); err != nil { - slog.Ctx(e.ctx).Warn("disable loki error", "err", err) - } - } - - return nil -} - -func (e *EServer) WatchEvent(_ *proto.WatchRequest, srv 
proto.Inspector_WatchEventServer) error { - client := getPeerClient(srv.Context()) - datach := make(chan proto.RawEvent) - slog.Ctx(e.ctx).Info("watch event income", "client", client) - e.subscribe(client, datach) - defer e.unsubscribe(client) - - for evt := range datach { - resp := &proto.WatchReply{ - Name: evt.EventType, - Event: &proto.Event{ - Name: evt.EventType, - Value: evt.EventBody, - Meta: getEventMetaByNetns(e.ctx, evt.Netns), - }, - } - err := srv.Send(resp) - if err != nil { - slog.Ctx(e.ctx).Warn("watch event", "err", err, "client", client) - return err - } - } - - return nil -} - -func (e *EServer) QueryMetric(_ context.Context, _ *proto.QueryMetricRequest) (*proto.QueryMetricResponse, error) { - res := &proto.QueryMetricResponse{} - return res, nil -} - -func (e *EServer) subscribe(client string, ch chan<- proto.RawEvent) { - e.mtx.Lock() - defer e.mtx.Unlock() - - e.subscribers[client] = ch +type EventProbeManager struct { + sinkChan chan *probe.Event + sinks []sink.Sink + done chan struct{} } -func (e *EServer) unsubscribe(client string) { - e.mtx.Lock() - defer e.mtx.Unlock() - - delete(e.subscribers, client) +func (m *EventProbeManager) stop() { + log.Infof("probe manager stopped") + close(m.done) } -func (e *EServer) dispatcher(ctx context.Context, stopc chan struct{}) { +func (m *EventProbeManager) start() { for { select { - case <-stopc: - slog.Ctx(ctx).Debug("event dispatcher exited because of stop signal") - return - case evt := <-e.eventChan: - err := e.broadcast(evt) - if err != nil { - slog.Ctx(ctx).Info("dispatcher broadcast", "err", err, "event", evt) - continue + case evt := <-m.sinkChan: + for _, sink := range m.sinks { + //TODO be concurrency + if err := sink.Write(evt); err != nil { + log.Errorf("error sink evt %s", err) + } } + case <-m.done: + break } - } } -func (e *EServer) broadcast(evt proto.RawEvent) error { - pbs := e.subscribers - - ctx, cancelf := context.WithTimeout(e.ctx, 5*time.Second) - defer cancelf() - workdone := 
make(chan struct{}) - go func(done chan struct{}) { - for client, c := range pbs { - c <- evt - slog.Ctx(e.ctx).Debug("broadcast event", "client", client, "event", evt.EventType) - } - - done <- struct{}{} - }(workdone) - - if e.config.InfoToLog { - slog.Ctx(e.ctx).Warn("broadcast event", "type", evt.EventType, "body", evt.EventBody, "netns", evt.Netns) - } - - select { - case <-ctx.Done(): - slog.Ctx(e.ctx).Info("broadcast event stuck", "event", evt.EventType) - return context.DeadlineExceeded - case <-workdone: - slog.Ctx(e.ctx).Info("broadcast event", "event", evt.EventType, "info", evt.EventBody) - } - - return nil +func (m *EventProbeManager) CreateProbe(config ProbeConfig) (probe.EventProbe, error) { + return probe.CreateEventProbe(config.Name, m.sinkChan, config.Args) } -func getPeerClient(ctx context.Context) string { - var clientid string - pr, ok := peer.FromContext(ctx) - if ok { - clientid = pr.Addr.String() - } else { - clientid = uuid.New().String() - } - - return clientid +func (m *EventProbeManager) StartProbe(ctx context.Context, probe probe.EventProbe) error { + return probe.Start(ctx) } -func getEventMetaByNetns(ctx context.Context, netns uint32) *proto.Meta { - et, err := nettop.GetEntityByNetns(int(netns)) - if err != nil { - slog.Ctx(ctx).Info("nettop get entity", "err", err, "netns", netns) - return nil - } - - return &proto.Meta{ - Pod: et.GetPodName(), - Namespace: et.GetPodNamespace(), - Netns: fmt.Sprintf("ns%d", netns), - Node: nettop.GetNodeName(), - } +func (m *EventProbeManager) StopProbe(ctx context.Context, probe probe.EventProbe) error { + return probe.Stop(ctx) } + +var _ ProbeManager[probe.MetricsProbe] = &MetricsProbeManager{} diff --git a/pkg/exporter/cmd/list.go b/pkg/exporter/cmd/list.go index eb7fd112..587b7b1a 100644 --- a/pkg/exporter/cmd/list.go +++ b/pkg/exporter/cmd/list.go @@ -13,12 +13,8 @@ var ( cmd.Help() // nolint }, } - - output string ) func init() { rootCmd.AddCommand(listCmd) - - 
listCmd.PersistentFlags().StringVarP(&output, "output", "o", "text", "output format, support text/json/file") } diff --git a/pkg/exporter/cmd/list_entity.go b/pkg/exporter/cmd/list_entity.go deleted file mode 100644 index 267e0aee..00000000 --- a/pkg/exporter/cmd/list_entity.go +++ /dev/null @@ -1,104 +0,0 @@ -/* -Copyright © 2022 NAME HERE -*/ -package cmd - -import ( - "errors" - "fmt" - - "strings" - - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/pterm/pterm" - "github.com/spf13/cobra" -) - -// entityCmd represents the entity command -var ( - entityCmd = &cobra.Command{ - Use: "entity", - Short: "List all network entities, including all non-hostnetwork pods and the host itself.", - Run: func(cmd *cobra.Command, args []string) { - if LabelSelector != "" { - slct, err := parseLabelSelector(LabelSelector) - if err != nil { - fmt.Printf("parse label %s failed:%s\n", LabelSelector, err.Error()) - return - } - listEntities(slct) - } else { - listEntities() - } - - }, - } - - LabelSelector string -) - -func init() { - listCmd.AddCommand(entityCmd) - entityCmd.PersistentFlags().StringVarP(&LabelSelector, "label", "l", "", "label filter") -} - -func listEntities(slct ...selector) { - err := nettop.SyncNetTopology() - if err != nil { - fmt.Printf("sync nettop failed:%s\n", err.Error()) - return - } - - texts := pterm.TableData{ - {"POD", "APP", "IP", "NAMESPACE", "NETNS", "PID", "NSINUM"}, - } - ets := nettop.GetAllEntity() - for _, et := range ets { - if len(slct) > 0 { - labelvalue, ok := et.GetLabel(slct[0].key) - if ok && (slct[0].value == "" || slct[0].value == labelvalue) { - texts = append(texts, []string{ - et.GetPodName(), - et.GetAppLabel(), - et.GetIP(), - et.GetNetnsMountPoint(), - et.GetPodNamespace(), - fmt.Sprintf("%d", et.GetPid()), - fmt.Sprintf("%d", et.GetNetns()), - }) - } - } else { - texts = append(texts, []string{ - et.GetPodName(), - et.GetAppLabel(), - et.GetIP(), - et.GetNetnsMountPoint(), - et.GetPodNamespace(), - 
fmt.Sprintf("%d", et.GetPid()), - fmt.Sprintf("%d", et.GetNetns()), - }) - } - } - // nolint - pterm.DefaultTable.WithHasHeader().WithData(texts).Render() - -} - -func parseLabelSelector(selector string) (s selector, err error) { - err = errors.New("invalid label selector") - ss := strings.Split(selector, "=") - if len(ss) > 2 { - return - } - s.key = ss[0] - if len(ss) > 1 { - s.value = ss[1] - } - - return s, nil -} - -type selector struct { - key string - value string -} diff --git a/pkg/exporter/cmd/list_event.go b/pkg/exporter/cmd/list_event.go deleted file mode 100644 index 29b8f8e8..00000000 --- a/pkg/exporter/cmd/list_event.go +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright © 2022 NAME HERE -*/ -package cmd - -import ( - "github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/pterm/pterm" - "github.com/spf13/cobra" -) - -// eventCmd represents the event command -var eventCmd = &cobra.Command{ - Use: "event", - Short: "list all available metrics", - Run: func(cmd *cobra.Command, args []string) { - events := probe.ListEvents() - - sliceMapTextOutput("events", events) - }, -} - -func init() { - listCmd.AddCommand(eventCmd) -} - -func sliceMapTextOutput(title string, data map[string][]string) { - tree := pterm.TreeNode{ - Text: title, - Children: []pterm.TreeNode{}, - } - - for p, unit := range data { - parent := pterm.TreeNode{ - Text: p, - Children: []pterm.TreeNode{}, - } - for i := range unit { - parent.Children = append(parent.Children, pterm.TreeNode{Text: unit[i]}) - } - tree.Children = append(tree.Children, parent) - } - pterm.DefaultTree.WithRoot(tree).Render() // nolint -} diff --git a/pkg/exporter/cmd/list_metric.go b/pkg/exporter/cmd/list_metric.go deleted file mode 100644 index 4d6cf8b2..00000000 --- a/pkg/exporter/cmd/list_metric.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright © 2022 NAME HERE -*/ -package cmd - -import ( - "strings" - - "github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/pterm/pterm" - "github.com/spf13/cobra" 
-) - -// metricCmd represents the metric command -var ( - listmetricCmd = &cobra.Command{ - Use: "metric", - Short: "list available metrics of probe", - Run: func(cmd *cobra.Command, args []string) { - showprobes := []string{} - allprobes := probe.ListMetricProbes() - for idx := range listprobe { - for _, probe := range allprobes { - if strings.Contains(probe, listprobe[idx]) { - showprobes = append(showprobes, probe) - break - } - } - } - - if len(showprobes) == 0 { - showprobes = allprobes - } - - tree := pterm.TreeNode{ - Text: "metrics", - Children: []pterm.TreeNode{}, - } - metrics := probe.ListMetrics() - for _, p := range showprobes { - if mnames, ok := metrics[p]; ok { - parent := pterm.TreeNode{ - Text: p, - Children: []pterm.TreeNode{}, - } - for i := range mnames { - if !strings.HasPrefix(mnames[i], p) { - continue - } - children := pterm.TreeNode{ - Text: mnames[i], - } - parent.Children = append(parent.Children, children) - } - tree.Children = append(tree.Children, parent) - } - } - - pterm.DefaultTree.WithRoot(tree).Render() // nolint - - }, - } - - listprobe []string -) - -func init() { - listCmd.AddCommand(listmetricCmd) - - listmetricCmd.PersistentFlags().StringSliceVarP(&listprobe, "probe", "p", []string{}, "probe to list, default show all available") -} diff --git a/pkg/exporter/cmd/list_probe.go b/pkg/exporter/cmd/list_probe.go index be5ad66d..b83bbee3 100644 --- a/pkg/exporter/cmd/list_probe.go +++ b/pkg/exporter/cmd/list_probe.go @@ -4,12 +4,10 @@ Copyright © 2022 NAME HERE package cmd import ( - "encoding/json" "fmt" "github.com/alibaba/kubeskoop/pkg/exporter/probe" "github.com/spf13/cobra" - "golang.org/x/exp/slog" ) // probeCmd represents the probe command @@ -18,27 +16,17 @@ var ( Use: "probe", Short: "list supported probe with metric exporting", Run: func(cmd *cobra.Command, args []string) { - res := map[string][]string{ - "metric": {}, - "event": {}, - } - res["metric"] = probe.ListMetricProbes() + res := make(map[string][]string) + 
res["metrics"] = probe.ListMetricsProbes() + res["event"] = probe.ListEventProbes() - els := probe.ListEvents() - for ep := range els { - res["event"] = append(res["event"], ep) - } + for key, l := range res { + fmt.Println(key) + indent := " " + for _, s := range l { + fmt.Printf("%s%s\n", indent, s) - switch output { - case "json": - text, err := json.MarshalIndent(res, "", " ") - if err != nil { - slog.Warn("json marshal failed", "err", err) - return } - fmt.Println(string(text)) - default: - sliceMapTextOutput("probes", res) } }, } diff --git a/pkg/exporter/cmd/metricserver.go b/pkg/exporter/cmd/metricserver.go index 04a6190b..afa55570 100644 --- a/pkg/exporter/cmd/metricserver.go +++ b/pkg/exporter/cmd/metricserver.go @@ -2,355 +2,66 @@ package cmd import ( "context" - "fmt" - "sync" + "net/http" - "github.com/samber/lo" - - "strings" - "time" - - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" "github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "github.com/patrickmn/go-cache" + "github.com/alibaba/kubeskoop/pkg/exporter/util" "github.com/prometheus/client_golang/prometheus" - "golang.org/x/exp/slog" -) - -const ( - MetricLabelMeta = "meta" - MetricLabelLabel = "label" + "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" ) -var ( - CollectLatency = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "inspector_runtime_collectlatency", - Help: "net-exporter metrics collect latency", - }, - []string{"node", "probe"}, - ) +func NewMetricsServer() (*MetricsServer, error) { - cacheUpdateInterval = 10 * time.Second -) - -func NewMServer(ctx context.Context, config MetricConfig) *MServer { - ms := &MServer{ - ctx: ctx, - mtx: sync.Mutex{}, - descs: make(map[string]*prometheus.Desc), - config: config, - probes: make(map[string]proto.MetricProbe), - metricCache: cache.New(3*cacheUpdateInterval, 10*cacheUpdateInterval), - loopctrl: make(chan struct{}), - } + r := 
prometheus.NewRegistry() + handler := promhttp.HandlerFor(prometheus.Gatherers{ + r, + }, promhttp.HandlerOpts{}) - for _, p := range config.Probes { - mp := probe.GetProbe(p) - if mp == nil { - slog.Ctx(ctx).Info("get metric probe nil", "probe", p) - continue - } - ms.probes[p] = mp - go mp.Start(ctx, proto.ProbeTypeMetrics) - slog.Ctx(ctx).Debug("new mserver add subject", "subject", p) + probeManager := &MetricsProbeManager{ + prometheusRegistry: r, } - ms.additionalLabels = validateExposeLabels(ms.config.ExposeLabels) - slog.Default().Debug("metric config", "config", ms.additionalLabels) - - for sub, mp := range ms.probes { - mnames := mp.GetMetricNames() - for _, mname := range mnames { - if !strings.HasPrefix(mname, sub) { - continue - } - slog.Ctx(ctx).Debug("new mserver add desc", "probe", mp.Name(), "subject", sub, "metric", mname) - if ms.config.Verbose { - ms.descs[mname] = getDescOfMetricVerbose(sub, mname, ms.additionalLabels) - } else { - ms.descs[mname] = getDescOfMetric(sub, mname) - } - - } - } - // start cache loop - slog.Ctx(ctx).Debug("new mserver start cache loop") - go ms.collectLoop(ctx, cacheUpdateInterval, ms.loopctrl) - - return ms + return &MetricsServer{ + DynamicProbeServer: NewDynamicProbeServer[probe.MetricsProbe](probeManager), + httpHandler: handler, + }, nil } -type MServer struct { - ctx context.Context - mtx sync.Mutex - descs map[string]*prometheus.Desc - config MetricConfig - metricCache *cache.Cache - probes map[string]proto.MetricProbe - loopctrl chan struct{} - additionalLabels []ExposeLabel +type MetricsProbeManager struct { + prometheusRegistry *prometheus.Registry } -// Close if cache process loop exited, close the metric server will be stuck, check is first -func (s *MServer) Close() { - if s.loopctrl != nil { - select { - case <-s.loopctrl: - s.loopctrl <- struct{}{} - default: - } - } +func (m *MetricsProbeManager) CreateProbe(config ProbeConfig) (probe.MetricsProbe, error) { + log.Infof("create metrics probe %s with 
args %s", config.Name, util.ToJSONString(config.Args)) + return probe.CreateMetricsProbe(config.Name, config.Args) } -func (s *MServer) Reload(config MetricConfig) error { - s.mtx.Lock() - defer s.mtx.Unlock() - - enabled := lo.Keys(s.probes) - toClose, toStart := lo.Difference(enabled, config.Probes) - slog.Ctx(s.ctx).Info("reload metric probes", "close", toClose, "enable", toStart) - - for _, n := range toClose { - p, ok := s.probes[n] - if !ok { - slog.Ctx(s.ctx).Warn("probe not found in enabled probes, skip.", "probe", n) - continue - } - - err := p.Close(proto.ProbeTypeMetrics) - if err != nil { - slog.Ctx(s.ctx).Warn("close probe error", "probe", n, "err", err) - continue - } - delete(s.probes, n) - } - - for _, n := range toStart { - p := probe.GetProbe(n) - if p == nil { - slog.Ctx(s.ctx).Info("get metric probe nil", "probe", n) - continue - } - s.probes[n] = p - go p.Start(s.ctx, proto.ProbeTypeMetrics) - slog.Ctx(s.ctx).Debug("new mserver add subject", "subject", n) - } - - for sub, mp := range s.probes { - mnames := mp.GetMetricNames() - for _, mname := range mnames { - if !strings.HasPrefix(mname, sub) { - continue - } - if s.config.Verbose { - s.descs[mname] = getDescOfMetricVerbose(sub, mname, s.additionalLabels) - } else { - s.descs[mname] = getDescOfMetric(sub, mname) - } - } +func (m *MetricsProbeManager) StartProbe(ctx context.Context, probe probe.MetricsProbe) error { + log.Infof("start metrics probe %s", probe.Name()) + if err := probe.Start(ctx); err != nil { + return err } - - s.config = config + m.prometheusRegistry.MustRegister(probe) return nil } -func (s *MServer) Collect(ch chan<- prometheus.Metric) { - s.mtx.Lock() - defer s.mtx.Unlock() - slog.Ctx(s.ctx).Debug("metric server collect request in", "metric count", len(s.descs)) - for mname, desc := range s.descs { - data, err := s.collectOnceCache(s.ctx, mname) - if err != nil || data == nil { - slog.Ctx(s.ctx).Info("collect metric cache", "err", err, "metric", mname) - continue - } - 
slog.Ctx(s.ctx).Debug("metric server collect", "metric", mname, "value", data) - for nsinum, value := range data { - et, err := nettop.GetEntityByNetns(int(nsinum)) - if err != nil || et == nil { - slog.Ctx(s.ctx).Info("collect metric get entity error or nil", "err", err) - continue - } - slog.Ctx(s.ctx).Debug("collect metric", "pod", et.GetPodName(), "netns", nsinum, "metric", mname, "value", value) - labelValues := []string{nettop.GetNodeName(), et.GetPodNamespace(), et.GetPodName()} - // for legacy pod labels - labelValues = append(labelValues, labelValues...) - if s.config.Verbose { - if len(s.additionalLabels) > 0 { - for _, label := range s.additionalLabels { - switch label.LabelType { - case "label": - if value, ok := et.GetLabel(label.Source); ok { - labelValues = append(labelValues, value) - } else { - labelValues = append(labelValues, "") - } - case "meta": - // support ip/netns now - value, err := et.GetMeta(label.Source) - if err != nil { - slog.Default().Info("get meta failed", "meta", label.Source) - labelValues = append(labelValues, "") - } else { - labelValues = append(labelValues, value) - } - default: - // unsupported exposed label will be empty string - slog.Default().Info("empty label set", "label", label.Source) - labelValues = append(labelValues, "") - } - } - slog.Default().Info("label values", "label", labelValues) - } - } - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(value), labelValues...) 
- } - } -} - -// Describe get all description from probe module -func (s *MServer) Describe(ch chan<- *prometheus.Desc) { - slog.Ctx(s.ctx).Debug("metric server describe request in") - for m, desc := range s.descs { - slog.Ctx(s.ctx).Debug("mserver describe", "metric", m) - ch <- desc - } -} - -func (s *MServer) collectOnceCache(ctx context.Context, metric string) (map[uint32]uint64, error) { - v, ok := s.metricCache.Get(strings.ToLower(metric)) - if !ok || v == nil { - slog.Ctx(ctx).Info("collect from cache", "value", v) - return nil, fmt.Errorf("no cache found for %s", metric) - } - - vp := v.(map[uint32]uint64) - if vp == nil { - slog.Ctx(ctx).Info("collect from cache", "value", v) - return nil, fmt.Errorf("empty cache found for %s", metric) +func (m *MetricsProbeManager) StopProbe(ctx context.Context, probe probe.MetricsProbe) error { + log.Infof("stop metrics probe %s", probe.Name()) + if err := probe.Stop(ctx); err != nil { + return err } - slog.Ctx(ctx).Debug("collect once cache", "metric", metric, "value", vp) - return vp, nil -} - -func (s *MServer) collectLoop(ctx context.Context, interval time.Duration, stopc chan struct{}) { - slog.Ctx(ctx).Debug("cache loop start", "interval", interval) - - t := time.NewTicker(interval) - defer t.Stop() - - for { - select { - case <-t.C: - if err := s.collectWorkerSerial(ctx); err != nil { - slog.Ctx(ctx).Info("cache loop", "err", err) - continue - } - case <-stopc: - slog.Ctx(ctx).Info("cache loop stop", "interval", interval) - close(stopc) - return - } - } -} - -// collectWorkerSerial collect metric data in serial -func (s *MServer) collectWorkerSerial(ctx context.Context) error { - s.mtx.Lock() - defer s.mtx.Unlock() - if len(s.probes) == 0 { - return nil - } - slog.Ctx(s.ctx).Debug("collect worker serial start") - workdone := make(chan struct{}) - cstart := time.Now() - ctx, cancelf := context.WithTimeout(ctx, cacheUpdateInterval) - defer cancelf() - - go func(ctx context.Context, done chan struct{}) { - for pn, pb 
:= range s.probes { - start := time.Now() - // check probe status here - if !pb.Ready() { - slog.Ctx(ctx).Info("collect worker not ready", "probe", pn) - continue - } - data, err := pb.Collect(ctx) - if err != nil { - slog.Ctx(ctx).Info("collect worker", "err", err, "probe", pn) - continue - } - for mname, mdata := range data { - slog.Ctx(ctx).Debug("collect worker store", "metric", mname, "value", mdata) - s.metricCache.Set(mname, mdata, cache.NoExpiration) - } - slog.Ctx(ctx).Debug("collect worker finish", "probe", pn) - - CollectLatency.With(prometheus.Labels{"node": nettop.GetNodeName(), "probe": pn}).Set(float64(time.Since(start).Seconds())) - } - - done <- struct{}{} - }(ctx, workdone) - - select { - case <-ctx.Done(): - slog.Ctx(ctx).Info("collect worker", "time exceeded", time.Since(cstart).Seconds()) - return context.DeadlineExceeded - case <-workdone: - slog.Ctx(ctx).Info("collect worker", "finished in", time.Since(cstart).Seconds()) - } - + m.prometheusRegistry.Unregister(probe) return nil } -// inspector pod metrics common labels -// {"node", "namespace", "pod"} will override by prometheus default configuration -// refer to https://github.com/alibaba/kubeskoop/issues/77 -var defaultMetricLabels = []string{"target_node", "target_namespace", "target_pod", "node", "namespace", "pod"} - -func getDescOfMetric(mp, mname string) *prometheus.Desc { - return prometheus.NewDesc( - prometheus.BuildFQName("inspector", "pod", mname), - fmt.Sprintf("%s %s count in netns/pod", mp, mname), - defaultMetricLabels, - nil, - ) -} +var _ ProbeManager[probe.MetricsProbe] = &MetricsProbeManager{} -func getDescOfMetricVerbose(mp, mname string, additionalLabels []ExposeLabel) *prometheus.Desc { - labels := defaultMetricLabels - if len(additionalLabels) > 0 { - for _, label := range additionalLabels { - slog.Info("build metric description", "additional label", label) - labels = append(labels, label.Replace) - } - } - return prometheus.NewDesc( - 
prometheus.BuildFQName("inspector", "pod", mname), - fmt.Sprintf("%s %s count in netns/pod", mp, mname), - labels, - nil, - ) +type MetricsServer struct { + *DynamicProbeServer[probe.MetricsProbe] + httpHandler http.Handler } -func validateExposeLabels(labels []ExposeLabel) []ExposeLabel { - res := []ExposeLabel{} - for _, label := range labels { - if label.LabelType != MetricLabelLabel && label.LabelType != MetricLabelMeta { - continue - } - - if label.Source == "" { - continue - } - - if label.Replace == "" { - label.Replace = label.Source - } - - res = append(res, label) - } - - return res +func (s *MetricsServer) ServeHTTP(w http.ResponseWriter, r *http.Request) { + s.httpHandler.ServeHTTP(w, r) } diff --git a/pkg/exporter/cmd/root.go b/pkg/exporter/cmd/root.go index e9bbb0c8..d5d4a1a9 100644 --- a/pkg/exporter/cmd/root.go +++ b/pkg/exporter/cmd/root.go @@ -1,13 +1,13 @@ package cmd import ( - "io" "os" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" "github.com/spf13/cobra" - "golang.org/x/exp/slog" ) // rootCmd represents the base command when called without any subcommands @@ -18,14 +18,9 @@ var ( PersistentPreRun: func(cmd *cobra.Command, args []string) { nettop.Init(sidecar) if debug { - opts := slog.HandlerOptions{ - AddSource: true, - Level: slog.DebugLevel, - } - - slog.SetDefault(slog.New(opts.NewTextHandler(os.Stdout))) + log.SetLevel(log.DebugLevel) } else { - slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard))) + log.SetLevel(log.InfoLevel) } }, } diff --git a/pkg/exporter/cmd/server.go b/pkg/exporter/cmd/server.go index ff03e029..5adffca3 100644 --- a/pkg/exporter/cmd/server.go +++ b/pkg/exporter/cmd/server.go @@ -7,30 +7,30 @@ import ( "context" "encoding/json" "fmt" - "net" "net/http" "os" "os/signal" + "reflect" + "sync" "syscall" - "github.com/fsnotify/fsnotify" + "github.com/alibaba/kubeskoop/pkg/exporter/sink" - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - 
"github.com/alibaba/kubeskoop/pkg/exporter/probe" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/fsnotify/fsnotify" _ "net/http" //for golangci-lint _ "net/http/pprof" //for golangci-lint once more + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + gops "github.com/google/gops/agent" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/viper" - log "golang.org/x/exp/slog" - "google.golang.org/grpc" ) // serverCmd represents the server command @@ -41,28 +41,19 @@ var ( Run: func(cmd *cobra.Command, args []string) { insp := &inspServer{ v: *viper.New(), - ctx: log.NewContext(context.Background(), log.Default()), + ctx: context.Background(), } + log.Infof("start with config file %s", configPath) insp.v.SetConfigFile(configPath) err := insp.MergeConfig() if err != nil { - log.Ctx(insp.ctx).Info("merge config", "err", err) + log.Errorf("merge config err: %v", err) return } if insp.config.DebugMode { - opts := log.HandlerOptions{ - AddSource: true, - Level: log.DebugLevel, - } - insp.ctx = log.NewContext(context.Background(), log.New(opts.NewJSONHandler(os.Stdout))) - } else { - opts := log.HandlerOptions{ - AddSource: false, - Level: log.InfoLevel, - } - insp.ctx = log.NewContext(context.Background(), log.New(opts.NewJSONHandler(os.Stdout))) + log.SetLevel(log.DebugLevel) } // nolint @@ -71,18 +62,18 @@ var ( // config hot reload process insp.v.OnConfigChange(func(e fsnotify.Event) { - log.Ctx(insp.ctx).Info("Start reload config") + log.Info("Start reload config") if err := insp.reload(); err != nil { - log.Ctx(insp.ctx).Warn("Reload config error", "err", err) + log.Warnf("Reload config error: %v", err) } - log.Ctx(insp.ctx).Info("Config reload succeed.") + log.Info("Config reload 
succeed.") }) insp.v.WatchConfig() // block here err = insp.start() if err != nil { - log.Ctx(insp.ctx).Info("start server", "err", err) + log.Infof("start server err: %v", err) return } }, @@ -91,6 +82,129 @@ var ( configPath = "/etc/config/config.yaml" ) +type ProbeManager[T probe.Probe] interface { + CreateProbe(config ProbeConfig) (T, error) + StartProbe(ctx context.Context, probe T) error + StopProbe(ctx context.Context, probe T) error +} + +type DynamicProbeServer[T probe.Probe] struct { + lock sync.Mutex + probeManager ProbeManager[T] + lastConfig []ProbeConfig + probes map[string]T +} + +func NewDynamicProbeServer[T probe.Probe](probeManager ProbeManager[T]) *DynamicProbeServer[T] { + return &DynamicProbeServer[T]{ + probeManager: probeManager, + probes: make(map[string]T), + } +} + +func (s *DynamicProbeServer[T]) probeChanges(config []ProbeConfig) (toAdd []ProbeConfig, toClose []string) { + toMap := func(configs []ProbeConfig) map[string]ProbeConfig { + ret := make(map[string]ProbeConfig) + for _, probeConfig := range configs { + ret[probeConfig.Name] = probeConfig + } + return ret + } + lastConfigMap := toMap(s.lastConfig) + configMap := toMap(config) + + for name := range lastConfigMap { + if _, ok := configMap[name]; !ok { + toClose = append(toClose, name) + } + } + + for name, probeConf := range configMap { + lastConf, ok := lastConfigMap[name] + if !ok { + toAdd = append(toAdd, probeConf) + } else { + if !reflect.DeepEqual(lastConf, probeConf) { + toAdd = append(toAdd, probeConf) + toClose = append(toClose, name) + } + } + } + + return toAdd, toClose +} + +func (s *DynamicProbeServer[T]) Start(ctx context.Context, config []ProbeConfig) error { + return s.Reload(ctx, config) +} + +func (s *DynamicProbeServer[T]) Stop(ctx context.Context) error { + s.lock.Lock() + defer s.lock.Unlock() + + for _, probe := range s.probes { + if err := s.probeManager.StopProbe(ctx, probe); err != nil { + return err + } + } + return nil +} + +func 
marshalProbeConfig(config []ProbeConfig) string { + s, _ := json.Marshal(config) + return string(s) +} + +func (s *DynamicProbeServer[T]) Reload(ctx context.Context, config []ProbeConfig) error { + s.lock.Lock() + defer s.lock.Unlock() + + log.Infof("reload config, old config: %s, new config: %s", marshalProbeConfig(s.lastConfig), marshalProbeConfig(config)) + toAdd, toClose := s.probeChanges(config) + var toAddProbes []T + for _, probeConfig := range toAdd { + probe, err := s.probeManager.CreateProbe(probeConfig) + if err != nil { + return fmt.Errorf("error create probe %s: %w", probeConfig.Name, err) + } + toAddProbes = append(toAddProbes, probe) + } + + for _, name := range toClose { + probe, ok := s.probes[name] + if !ok { + continue + } + if err := s.probeManager.StopProbe(ctx, probe); err != nil { + return fmt.Errorf("failed stop probe %s, %w", name, err) + } + } + + s.lastConfig = config + + for _, probe := range toAddProbes { + s.probes[probe.Name()] = probe + if err := s.probeManager.StartProbe(ctx, probe); err != nil { + log.Errorf("failed start probe %s, err: %v", probe.Name(), err) + } + } + + return nil +} + +type probeState struct { + Name string `json:"name"` + State string `json:"state"` +} + +func (s *DynamicProbeServer[T]) listProbes() []probeState { + var ret []probeState + for name, probe := range s.probes { + ret = append(ret, probeState{Name: name, State: probe.State().String()}) + } + return ret +} + func init() { rootCmd.AddCommand(serverCmd) @@ -98,57 +212,53 @@ func init() { } type inspServerConfig struct { - DebugMode bool `mapstructure:"debugmode"` - Mconfig MetricConfig `mapstructure:"metric_config"` - Econfig EventConfig `mapstructure:"event_config"` + DebugMode bool `yaml:"debugmode" mapstructure:"debugmode"` + Port uint16 `yaml:"port" mapstructure:"port"` + MetricsConfig MetricsConfig `yaml:"metrics" mapstructure:"metrics"` + EventConfig EventConfig `yaml:"event" mapstructure:"event"` } -type ExposeLabel struct { - Source string 
`mapstructure:"source"` - LabelType string `mapstructure:"type"` - Replace string `mapstructure:"replace"` +type MetricsConfig struct { + Probes []ProbeConfig `yaml:"probes" mapstructure:"probes"` } -type MetricConfig struct { - Interval int `mapstructure:"interval"` - Port int `mapstructure:"port"` - Probes []string `mapstructure:"probes"` - Verbose bool `mapstructure:"verbose"` - ExposeLabels []ExposeLabel `mapstructure:"expose_labels"` +type EventConfig struct { + EventSinks []EventSinkConfig `yaml:"sinks" mapstructure:"sinks"` + Probes []ProbeConfig `yaml:"probes" mapstructure:"probes"` } -type EventConfig struct { - Port int `mapstructure:"port"` - InfoToLog bool `mapstructure:"infotolog"` - LokiAddress string `mapstructure:"loki_address"` - LokiEnable bool `mapstructure:"loki_enable"` - Probes []string `mapstructure:"probes"` +type EventSinkConfig struct { + Name string `yaml:"name" mapstructure:"name"` + Args interface{} `yaml:"args" mapstructure:"args"` +} + +type ProbeConfig struct { + Name string `yaml:"name" mapstructure:"name"` + Args interface{} `yaml:"args" mapstructure:"args"` } type inspServer struct { - v viper.Viper - config inspServerConfig - ctx context.Context - mserver *MServer - eserver *EServer + v viper.Viper + config inspServerConfig + ctx context.Context + metricsServer *MetricsServer + eventServer *EventServer } func (i *inspServer) MergeConfig() error { err := i.v.ReadInConfig() if err != nil { if _, ok := err.(viper.ConfigFileNotFoundError); ok { - log.Ctx(i.ctx).Info("validate config", "path", configPath, "err", err) - return errors.Wrapf(err, "no such config") + log.Infof("validate config err: %v", err) + return fmt.Errorf("config file %s not found", i.v.ConfigFileUsed()) } - log.Ctx(i.ctx).Info("validate config", "err", err) - return err + return fmt.Errorf("config file err: %w", err) } cfg := &inspServerConfig{} err = i.v.Unmarshal(cfg) if err != nil { - log.Ctx(i.ctx).Info("validate unmarshal config", "err", err) - return err + 
return fmt.Errorf("config file err: %w", err) } i.config = *cfg @@ -163,12 +273,14 @@ func (i *inspServer) reload() error { return err } - err = i.mserver.Reload(cfg.Mconfig) + ctx := context.TODO() + + err = i.metricsServer.Reload(ctx, cfg.MetricsConfig.Probes) if err != nil { return fmt.Errorf("reload metric server error: %s", err) } - err = i.eserver.Reload(cfg.Econfig) + err = i.eventServer.Reload(ctx, cfg.EventConfig.Probes) if err != nil { return fmt.Errorf("reload event server error: %s", err) } @@ -179,22 +291,57 @@ func (i *inspServer) reload() error { func (i *inspServer) start() error { if err := gops.Listen(gops.Options{}); err != nil { - log.Ctx(i.ctx).Info("start gops", "err", err) + log.Infof("start gops err: %v", err) } go func() { - i.mserver = NewMServer(i.ctx, i.config.Mconfig) - defer i.mserver.Close() - - r := prometheus.NewRegistry() - r.MustRegister(i.mserver) - handler := promhttp.HandlerFor(prometheus.Gatherers{ - r, - }, promhttp.HandlerOpts{}) - http.Handle("/metrics", handler) + var err error + ctx := context.TODO() + + log.Infof("start metrics server") + i.metricsServer, err = NewMetricsServer() + if err != nil { + log.Errorf("failed create metrics server: %v", err) + return + } + + defer func() { + _ = i.metricsServer.Stop(ctx) + }() + + if err := i.metricsServer.Start(ctx, i.config.MetricsConfig.Probes); err != nil { + log.Errorf("failed start metrics server: %v", err) + return + } + + //sink + sinks, err := createSink(i.config.EventConfig.EventSinks) + if err != nil { + log.Errorf("failed create sinks, err: %v", err) + return + } + + log.Infof("start event server") + //TODO create sinks from config + i.eventServer, err = NewEventServer(sinks) + if err != nil { + log.Errorf("failed create event server: %v", err) + return + } + + defer func() { + _ = i.eventServer.Stop(context.TODO()) + }() + + if err := i.eventServer.Start(ctx, i.config.EventConfig.Probes); err != nil { + log.Errorf("failed start event server: %v", err) + return + } + 
+ http.Handle("/metrics", i.metricsServer) http.Handle("/", http.HandlerFunc(defaultPage)) http.Handle("/config", http.HandlerFunc(i.configPage)) - http.Handle("/status", http.HandlerFunc(status)) + http.Handle("/status", http.HandlerFunc(i.statusPage)) if i.config.DebugMode { reg := prometheus.NewRegistry() @@ -204,28 +351,11 @@ func (i *inspServer) start() error { ) http.Handle("/internal", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})) } - listenaddr := fmt.Sprintf(":%d", i.config.Mconfig.Port) - log.Ctx(i.ctx).Info("inspector start metric server", "listenaddr", listenaddr) - srv := &http.Server{Addr: listenaddr} + listenAddr := fmt.Sprintf(":%d", i.config.Port) + log.Infof("inspector start metric server, listenAddr: %s", listenAddr) + srv := &http.Server{Addr: listenAddr} if err := srv.ListenAndServe(); err != nil { - log.Ctx(i.ctx).Info("inspector start metric server", "err", err, "listenaddr", listenaddr) - } - }() - - go func() { - s := grpc.NewServer() - i.eserver = NewEServer(i.ctx, i.config.Econfig) - proto.RegisterInspectorServer(s, i.eserver) - listener, err := net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", i.config.Econfig.Port)) - if err != nil { - log.Ctx(i.ctx).Warn("inspector start event server", "port", i.config.Econfig.Port, "err", err) - return - } - log.Ctx(i.ctx).Info("inspector eserver serve", "port", i.config.Econfig.Port) - // grpc server block there, handle it with goroutine - if err := s.Serve(listener); err != nil { - log.Ctx(i.ctx).Warn("inspector eserver serve", "port", i.config.Econfig.Port, "err", err) - return + log.Errorf("inspector start metric server err: %v", err) } }() @@ -233,8 +363,19 @@ func (i *inspServer) start() error { return nil } -func WaitSignals(ctx context.Context, sgs ...os.Signal) { - log.Ctx(ctx).Info("keep running and start waiting for signals") +func createSink(sinkConfigs []EventSinkConfig) ([]sink.Sink, error) { + var ret []sink.Sink + for _, config := range sinkConfigs { + s, err := 
sink.CreateSink(config.Name, config.Args) + if err != nil { + return nil, fmt.Errorf("failed create sink %s, err: %w", config.Name, err) + } + ret = append(ret, s) + } + return ret, nil +} + +func WaitSignals(_ context.Context, sgs ...os.Signal) { s := make(chan os.Signal, 1) signal.Notify(s, sgs...) <-s @@ -258,14 +399,25 @@ func (i *inspServer) configPage(w http.ResponseWriter, _ *http.Request) { w.Write(rawText) // nolint } -func status(w http.ResponseWriter, _ *http.Request) { +func (i *inspServer) statusPage(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) - res := make(map[string]bool) - for _, pn := range probe.ListMetricProbes() { - p := probe.GetProbe(pn) - res[p.Name()] = p.Ready() + + res := map[string]interface{}{ + "inuse_probes": map[string][]probeState{ + "metrics": i.metricsServer.listProbes(), + "event": i.eventServer.listProbes(), + }, + + "available_probes": map[string][]string{ + "event": probe.ListEventProbes(), + "metrics": probe.ListMetricsProbes(), + }, + } + + rawText, err := json.Marshal(res) + if err != nil { + log.Errorf("failed marshal probe status: %v", err) } - rawText, _ := json.Marshal(res) w.Write(rawText) // nolint } diff --git a/pkg/exporter/cmd/watch.go b/pkg/exporter/cmd/watch.go deleted file mode 100644 index 4a7554ff..00000000 --- a/pkg/exporter/cmd/watch.go +++ /dev/null @@ -1,65 +0,0 @@ -/* -Copyright © 2022 NAME HERE -*/ -package cmd - -import ( - "context" - "fmt" - - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "github.com/spf13/cobra" - "golang.org/x/exp/slog" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" -) - -// watchCmd represents the watch command -var ( - watchCmd = &cobra.Command{ - Use: "watch", - Short: "A brief description of your command", - Run: func(cmd *cobra.Command, args []string) { - endpoint := fmt.Sprintf("%s:%d", endpointAddr, endpointPort) - slog.Ctx(context.Background()).Info("inspector 
watch", "endpoint", endpoint) - watchInspEvents(context.Background(), endpoint) - }, - } - - endpointPort uint32 - endpointAddr string -) - -func init() { - rootCmd.AddCommand(watchCmd) - watchCmd.PersistentFlags().Uint32VarP(&endpointPort, "port", "p", 19102, "remote inspector server port") - watchCmd.PersistentFlags().StringVarP(&endpointAddr, "server", "s", "127.0.0.1", "remote inspector server") -} - -func watchInspEvents(ctx context.Context, ep string) { - conn, err := grpc.Dial(ep, grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - slog.Ctx(ctx).Warn("grpc dial", "err", err) - return - } - - cli := proto.NewInspectorClient(conn) - stream, err := cli.WatchEvent(ctx, &proto.WatchRequest{}) - if err != nil { - slog.Ctx(ctx).Warn("stream watch", "err", err, "endpoint", ep) - return - } - - for { - resp, err := stream.Recv() - if err != nil { - slog.Ctx(ctx).Warn("stream recv", "err", err, "endpoint", ep) - return - } - - meta := resp.GetEvent().GetMeta() - - metaStr := fmt.Sprintf("%s/%s node=%s netns=%s ", meta.GetNamespace(), meta.GetPod(), meta.GetNode(), meta.GetNetns()) - slog.Ctx(ctx).Info(resp.GetEvent().GetName(), "meta", metaStr, "event", resp.GetEvent().GetValue()) - } -} diff --git a/pkg/exporter/loki/client.go b/pkg/exporter/loki/client.go index 1bb6ece2..9b22cc91 100644 --- a/pkg/exporter/loki/client.go +++ b/pkg/exporter/loki/client.go @@ -10,10 +10,10 @@ import ( "sync" "time" + inspproto "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/loki/logproto" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - inspproto "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "github.com/golang/snappy" "golang.org/x/exp/slog" "google.golang.org/protobuf/proto" diff --git a/pkg/exporter/nettop/cache.go b/pkg/exporter/nettop/cache.go index c7d3cfa0..456b06c1 100644 --- a/pkg/exporter/nettop/cache.go +++ b/pkg/exporter/nettop/cache.go @@ -6,9 +6,10 @@ import ( "os" "time" + log 
"github.com/sirupsen/logrus" + "github.com/patrickmn/go-cache" "github.com/vishvananda/netns" - "golang.org/x/exp/slog" ) const ( @@ -158,7 +159,7 @@ func (e *Entity) GetPids() []int { } func StartCache(ctx context.Context) error { - slog.Ctx(ctx).Info("nettop cache loop start", "interval", cacheUpdateInterval) + log.Infof("nettop cache loop start, interval: %d", cacheUpdateInterval) return cacheDaemonLoop(ctx, control) } @@ -166,14 +167,14 @@ func StopCache() { control <- struct{}{} } -func cacheDaemonLoop(ctx context.Context, control chan struct{}) error { +func cacheDaemonLoop(_ context.Context, control chan struct{}) error { t := time.NewTicker(cacheUpdateInterval) defer t.Stop() for { select { case <-control: - slog.Ctx(ctx).Info("cache daemon loop exit of control signal") + log.Info("cache daemon loop exit of control signal") return nil case <-t.C: go cacheProcess() @@ -192,17 +193,17 @@ func cacheProcess() { go func(done chan struct{}) { err := cacheNetTopology() if err != nil { - logger.Warn("cache process", "err", err) + log.Errorf("failed cache process, err: %v", err) } done <- struct{}{} }(cacheDone) select { case <-ctx.Done(): - logger.Info("cache process time exceeded", "latency", time.Since(start).Seconds()) + log.Infof("cache process time exceeded, latency: %fs", time.Since(start).Seconds()) return case <-cacheDone: - logger.Info("cache process finished", "latency", time.Since(start).Seconds()) + log.Infof("cache process finished, latency: %fs", time.Since(start).Seconds()) } } @@ -215,17 +216,17 @@ func cacheNetTopology() error { // get all process pids, err := getAllPids() if err != nil { - logger.Warn("cache pids failed %s", err.Error()) + log.Warnf("cache pids failed %s", err) return err } - logger.Debug("finished get all pids") + log.Debug("finished get all pids") // get all netns by process netnsMap := map[int]netnsMeta{} for _, pid := range pids { nsinum, err := getNsInumByPid(pid) if err != nil { - logger.Warn("get ns inum of %d failed %s", 
pid, err.Error()) + log.Warnf("get ns inum of %d failed %s", pid, err) continue } @@ -248,17 +249,17 @@ func cacheNetTopology() error { } - logger.Debug("finished get all netns") + log.Debug("finished get all netns") // get netns mount point aka cni presentation namedns, err := findNsfsMountpoint() if err != nil { - logger.Warn("get nsfs mount point failed %s", err.Error()) + log.Warnf("get nsfs mount point failed %s", err) } else { for _, mp := range namedns { nsinum, err := getNsInumByNsfsMountPoint(mp) if err != nil { - logger.Warn("get ns inum from %s point failed %s", mp, err.Error()) + log.Warnf("get ns inum from %s point failed %s", mp, err) continue } if v, ok := netnsMap[nsinum]; !ok { @@ -274,24 +275,24 @@ func cacheNetTopology() error { } } - logger.Debug("finished get all nsfs mount point") + log.Debug("finished get all nsfs mount point") var podMap map[string]podMeta if !sidecarEnabled { // get pod meta info podMap, err = getPodMetas(rcrisvc) if err != nil { - logger.Warn("get pod meta failed %s", err.Error()) + log.Warnf("get pod meta failed %s", err) return err } // if use docker, get docker sandbox - if top.Crimeta.RuntimeName == "docker" { + if top.Crimeta != nil && top.Crimeta.RuntimeName == "docker" { for sandbox, pm := range podMap { if pm.nspath == "" && pm.pid == 0 { pid, err := getPidForContainerBySock(sandbox) if err != nil { - logger.Warn("get docker container", "sandbox", sandbox, "err", err.Error()) + log.Warnf("get docker container error, sandbox: %s, err: %v", sandbox, err) continue } pm.pid = pid @@ -307,12 +308,12 @@ func cacheNetTopology() error { netnsMeta: nsmeta, pids: nsmeta.pids, } - logger.Debug("try related pod", nsinum, nsmeta.mountPath) + log.Debugf("try associate pod with netns %d (%s)", nsinum, nsmeta.mountPath) for sandbox, pm := range podMap { // 1. 
use cri infospec/nspath to match if pm.nspath != "" && pm.nspath == nsmeta.mountPath { ent.podMeta = pm - logger.Debug("related pod mount point", "pod", pm.name, "netns", nsmeta.inum) + log.Debugf("associate pod %s with mount point %d", pm.name, nsmeta.inum) podCache.Set(sandbox, ent, 3*cacheUpdateInterval) for _, pid := range nsmeta.pids { pidCache.Set(fmt.Sprintf("%d", pid), ent, 3*cacheUpdateInterval) @@ -325,7 +326,7 @@ func cacheNetTopology() error { if err == nil { if nsinum == pidns { ent.podMeta = pm - logger.Debug("related pod", "pod", pm.name, "netns", nsmeta.inum) + log.Debugf("associate pod %s with netns %d", pm.name, nsmeta.inum) podCache.Set(sandbox, ent, 3*cacheUpdateInterval) for _, pid := range nsmeta.pids { pidCache.Set(fmt.Sprintf("%d", pid), ent, 3*cacheUpdateInterval) @@ -336,7 +337,7 @@ func cacheNetTopology() error { for _, pid := range nsmeta.pids { if pm.pid == pid { ent.podMeta = pm - logger.Debug("related pod pid", "pod", pm.name, "netns", nsmeta.inum) + log.Debugf("associate pod pid, pod: %s, netns %d", pm.name, nsmeta.inum) podCache.Set(sandbox, ent, 3*cacheUpdateInterval) for _, pid := range nsmeta.pids { pidCache.Set(fmt.Sprintf("%d", pid), ent, 3*cacheUpdateInterval) @@ -348,6 +349,6 @@ func cacheNetTopology() error { nsCache.Set(fmt.Sprintf("%d", nsinum), ent, 3*cacheUpdateInterval) } - logger.Debug("finished cache process") + log.Debug("finished cache process") return nil } diff --git a/pkg/exporter/nettop/cri.go b/pkg/exporter/nettop/cri.go index 6bb79d17..cd2265fc 100644 --- a/pkg/exporter/nettop/cri.go +++ b/pkg/exporter/nettop/cri.go @@ -9,6 +9,8 @@ import ( "strings" "time" + log "github.com/sirupsen/logrus" + "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -33,8 +35,7 @@ var ( func (c *CriMeta) Update() error { criclient, crisock, err := getCriClient(runtimeEndpoints) if err != nil { - logger.Debug("get client of runtime svc failed", "err", err.Error()) - return nil + return err } 
c.RuntimeSock = crisock @@ -42,7 +43,6 @@ func (c *CriMeta) Update() error { version, err := rcrisvc.Version(kubeAPIVersion) if err != nil { - logger.Debug("get version of runtime svc failed", "err", err.Error()) return err } @@ -64,11 +64,9 @@ func getCriClient(eps []string) (internalapi.RuntimeService, string, error) { if _, err := os.Stat(sock); os.IsNotExist(err) { return nil, "", fmt.Errorf("cannot find cri sock %s", sock) } - logger.Debug("try to connect crisock", "candidate", sock) client, err := NewRemoteRuntimeService(sock, 10*time.Second) if err != nil { - logger.Warn("try to connect crisock", "candidate", sock, "err", err) - return nil, "", fmt.Errorf("connect cri sock %s error: %s", sock, err.Error()) + return nil, "", fmt.Errorf("connect cri sock %s error: %w", sock, err) } return client, sock, nil } @@ -77,10 +75,8 @@ func getCriClient(eps []string) (internalapi.RuntimeService, string, error) { if _, err := os.Stat(candidate); os.IsNotExist(err) { continue } - logger.Debug("try to connect crisock", "candidate", candidate) client, err := NewRemoteRuntimeService(candidate, 10*time.Second) if err != nil { - logger.Warn("try to connect crisock", "candidate", candidate, "err", err) continue } return client, candidate, nil @@ -99,12 +95,9 @@ func (r *remoteRuntimeService) versionV1alpha2(ctx context.Context, apiVersion s Version: apiVersion, }) if err != nil { - logger.Warn("Version from runtime service failed", "err", err) return nil, err } - logger.Debug("[RemoteRuntimeService] Version Response", "apiVersion", typedVersion) - if typedVersion.Version == "" || typedVersion.RuntimeName == "" || typedVersion.RuntimeApiVersion == "" || typedVersion.RuntimeVersion == "" { return nil, fmt.Errorf("not all fields are set in VersionResponse (%q)", *typedVersion) } @@ -114,7 +107,6 @@ func (r *remoteRuntimeService) versionV1alpha2(ctx context.Context, apiVersion s // Version returns the runtime name, runtime version and runtime API version. 
func (r *remoteRuntimeService) Version(apiVersion string) (*runtimeapi.VersionResponse, error) { - logger.Debug("[RemoteRuntimeService] Version", "apiVersion", apiVersion, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -131,12 +123,9 @@ func (r *remoteRuntimeService) versionV1(ctx context.Context, apiVersion string) Version: apiVersion, }) if err != nil { - logger.Warn("Version from runtime service failed", "err", err) return nil, err } - logger.Debug("[RemoteRuntimeService] Version Response", "apiVersion", typedVersion) - if typedVersion.Version == "" || typedVersion.RuntimeName == "" || typedVersion.RuntimeApiVersion == "" || typedVersion.RuntimeVersion == "" { return nil, fmt.Errorf("not all fields are set in VersionResponse (%q)", *typedVersion) } @@ -148,12 +137,10 @@ func getConnection(ctx context.Context, endPoint string) (*grpc.ClientConn, erro var conn *grpc.ClientConn addr, dialer, err := GetAddressAndDialer(endPoint) if err != nil { - logger.Debug("get connect", "addr", addr, "err", err) return nil, err } conn, err = grpc.DialContext(ctx, addr, grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithBlock(), grpc.WithContextDialer(dialer), grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize))) if err != nil { - logger.Debug("get connect ", "addr", addr, "err", err) return nil, fmt.Errorf("connect endpoint '%s', make sure you are running as root and the endpoint has been started", endPoint) } @@ -273,7 +260,6 @@ func (r *remoteRuntimeService) UpdateRuntimeConfig(_ *runtimeapi.RuntimeConfig) // Status returns the status of the runtime. 
func (r *remoteRuntimeService) Status(verbose bool) (*runtimeapi.StatusResponse, error) { - logger.Debug("[RemoteRuntimeService] Status", "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -289,16 +275,12 @@ func (r *remoteRuntimeService) statusV1alpha2(ctx context.Context, verbose bool) Verbose: verbose, }) if err != nil { - logger.Warn("Status from runtime service failed", "err", err) return nil, err } - logger.Debug("[RemoteRuntimeService] Status Response", "status", resp.Status) - if resp.Status == nil || len(resp.Status.Conditions) < 2 { errorMessage := "RuntimeReady or NetworkReady condition are not set" err := errors.New(errorMessage) - logger.Warn("Status failed", "err", err) return nil, err } @@ -310,16 +292,12 @@ func (r *remoteRuntimeService) statusV1(ctx context.Context, verbose bool) (*run Verbose: verbose, }) if err != nil { - logger.Warn("Status from runtime service failed", "err", err) return nil, err } - logger.Debug("[RemoteRuntimeService] Status Response", "status", resp.Status) - if resp.Status == nil || len(resp.Status.Conditions) < 2 { errorMessage := "RuntimeReady or NetworkReady condition are not set" err := errors.New(errorMessage) - logger.Warn("Status failed", "err", err) return nil, err } @@ -328,7 +306,6 @@ func (r *remoteRuntimeService) statusV1(ctx context.Context, verbose bool) (*run // PodSandboxStatus returns the status of the PodSandbox. 
func (r *remoteRuntimeService) PodSandboxStatus(podSandBoxID string, verbose bool) (*runtimeapi.PodSandboxStatusResponse, error) { - logger.Debug("[RemoteRuntimeService] PodSandboxStatus", "podSandboxID", podSandBoxID, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -348,8 +325,6 @@ func (r *remoteRuntimeService) podSandboxStatusV1alpha2(ctx context.Context, pod return nil, err } - logger.Debug("[RemoteRuntimeService] PodSandboxStatus Response", "podSandboxID", podSandBoxID, "status", resp.Status) - res := fromV1alpha2PodSandboxStatusResponse(resp) if res.Status != nil { if err := verifySandboxStatus(res.Status); err != nil { @@ -369,8 +344,6 @@ func (r *remoteRuntimeService) podSandboxStatusV1(ctx context.Context, podSandBo return nil, err } - logger.Debug("[RemoteRuntimeService] PodSandboxStatus Response", "podSandboxID", podSandBoxID, "status", resp.Status) - status := resp.Status if resp.Status != nil { if err := verifySandboxStatus(status); err != nil { @@ -383,7 +356,6 @@ func (r *remoteRuntimeService) podSandboxStatusV1(ctx context.Context, podSandBo // PodSandboxStats returns the stats of the pod. 
func (r *remoteRuntimeService) PodSandboxStats(podSandboxID string) (*runtimeapi.PodSandboxStats, error) { - logger.Debug("[RemoteRuntimeService] PodSandboxStats", "podSandboxID", podSandboxID, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -401,7 +373,6 @@ func (r *remoteRuntimeService) podSandboxStatsV1alpha2(ctx context.Context, podS if err != nil { return nil, err } - logger.Debug("[RemoteRuntimeService] PodSandbox Response", "podSandboxID", podSandboxID, "stats", resp.GetStats()) return fromV1alpha2PodSandboxStats(resp.GetStats()), nil } @@ -413,14 +384,12 @@ func (r *remoteRuntimeService) podSandboxStatsV1(ctx context.Context, podSandbox if err != nil { return nil, err } - logger.Debug("[RemoteRuntimeService] PodSandbox Response", "podSandboxID", podSandboxID, "stats", resp.GetStats()) return resp.GetStats(), nil } // ListPodSandboxStats returns the list of pod sandbox stats given the filter func (r *remoteRuntimeService) ListPodSandboxStats(filter *runtimeapi.PodSandboxStatsFilter) ([]*runtimeapi.PodSandboxStats, error) { - logger.Debug("[RemoteRuntimeService] ListPodSandboxStats", "filter", filter) // Set timeout, because runtimes are able to cache disk stats results ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -437,10 +406,8 @@ func (r *remoteRuntimeService) listPodSandboxStatsV1alpha2(ctx context.Context, Filter: v1alpha2PodSandboxStatsFilter(filter), }) if err != nil { - logger.Warn("ListPodSandboxStats with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListPodSandboxStats Response", "filter", filter, "stats", resp.GetStats()) return fromV1alpha2ListPodSandboxStatsResponse(resp).GetStats(), nil } @@ -450,17 +417,14 @@ func (r *remoteRuntimeService) listPodSandboxStatsV1(ctx context.Context, filter Filter: filter, }) if err != nil { - logger.Warn("ListPodSandboxStats with filter from runtime service failed", "err", 
err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListPodSandboxStats Response", "filter", filter, "stats", resp.GetStats()) return resp.GetStats(), nil } // ListPodSandbox returns a list of PodSandboxes. func (r *remoteRuntimeService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]*runtimeapi.PodSandbox, error) { - logger.Debug("[RemoteRuntimeService] ListPodSandbox", "filter", filter, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -473,7 +437,6 @@ func (r *remoteRuntimeService) ListPodSandbox(filter *runtimeapi.PodSandboxFilte // ListContainers lists containers by filters. func (r *remoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter) ([]*runtimeapi.Container, error) { - logger.Debug("[RemoteRuntimeService] ListContainers", "filter", filter, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -489,12 +452,9 @@ func (r *remoteRuntimeService) listPodSandboxV1alpha2(ctx context.Context, filte Filter: v1alpha2PodSandboxFilter(filter), }) if err != nil { - logger.Warn("ListPodSandbox with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListPodSandbox Response", "filter", filter, "items", resp.Items) - return fromV1alpha2ListPodSandboxResponse(resp).Items, nil } @@ -503,12 +463,9 @@ func (r *remoteRuntimeService) listPodSandboxV1(ctx context.Context, filter *run Filter: filter, }) if err != nil { - logger.Warn("ListPodSandbox with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListPodSandbox Response", "filter", filter, "items", resp.Items) - return resp.Items, nil } @@ -517,10 +474,8 @@ func (r *remoteRuntimeService) listContainersV1alpha2(ctx context.Context, filte Filter: v1alpha2ContainerFilter(filter), }) if err != nil { - logger.Warn("ListContainers with filter from runtime service 
failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListContainers Response", "filter", filter, "containers", resp.Containers) return fromV1alpha2ListContainersResponse(resp).Containers, nil } @@ -530,17 +485,14 @@ func (r *remoteRuntimeService) listContainersV1(ctx context.Context, filter *run Filter: filter, }) if err != nil { - logger.Warn("ListContainers with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListContainers Response", "filter", filter, "containers", resp.Containers) return resp.Containers, nil } // ListContainerStats returns the list of ContainerStats given the filter. func (r *remoteRuntimeService) ListContainerStats(filter *runtimeapi.ContainerStatsFilter) ([]*runtimeapi.ContainerStats, error) { - logger.Debug("[RemoteRuntimeService] ListContainerStats", "filter", filter) // Do not set timeout, because writable layer stats collection takes time. // TODO(random-liu): Should we assume runtime should cache the result, and set timeout here? 
ctx, cancel := getContextWithCancel() @@ -558,10 +510,8 @@ func (r *remoteRuntimeService) listContainerStatsV1(ctx context.Context, filter Filter: filter, }) if err != nil { - logger.Warn("ListContainerStats with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListContainerStats Response", "filter", filter, "stats", resp.GetStats()) return resp.GetStats(), nil } @@ -571,17 +521,14 @@ func (r *remoteRuntimeService) listContainerStatsV1alpha2(ctx context.Context, f Filter: v1alpha2ContainerStatsFilter(filter), }) if err != nil { - logger.Warn("ListContainerStats with filter from runtime service failed", "err", err, "filter", filter) return nil, err } - logger.Debug("[RemoteRuntimeService] ListContainerStats Response", "filter", filter, "stats", resp.GetStats()) return fromV1alpha2ListContainerStatsResponse(resp).GetStats(), nil } // ContainerStatus returns the container status. func (r *remoteRuntimeService) ContainerStatus(containerID string, verbose bool) (*runtimeapi.ContainerStatusResponse, error) { - logger.Debug("[RemoteRuntimeService] ContainerStatus", "containerID", containerID, "timeout", r.timeout) ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() @@ -601,12 +548,9 @@ func (r *remoteRuntimeService) containerStatusV1(ctx context.Context, containerI return nil, err } - logger.Debug("[RemoteRuntimeService] ContainerStatus Response", "containerID", containerID, "status", resp.Status) - status := resp.Status if resp.Status != nil { if err := verifyContainerStatus(status); err != nil { - logger.Warn("verify ContainerStatus failed", "err", err, "containerID", containerID) return nil, err } } @@ -622,12 +566,10 @@ func (r *remoteRuntimeService) containerStatusV1alpha2(ctx context.Context, cont if err != nil { return nil, err } - logger.Debug("[RemoteRuntimeService] ContainerStatus Response", "containerID", containerID, "status", resp.Status) res := 
fromV1alpha2ContainerStatusResponse(resp) if resp.Status != nil { if err := verifyContainerStatus(res.Status); err != nil { - logger.Warn("verify ContainerStatus failed", "err", err, "containerID", containerID) return nil, err } } @@ -647,13 +589,11 @@ func (r *remoteRuntimeService) determineAPIVersion(conn *grpc.ClientConn) error ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() - logger.Debug("Finding the CRI API runtime version") r.runtimeClient = runtimeapi.NewRuntimeServiceClient(conn) if _, err := r.runtimeClient.Version(ctx, &runtimeapi.VersionRequest{}); err == nil { - logger.Warn("Using CRI v1 runtime API") + log.Warn("Using CRI v1 runtime API") } else if status.Code(err) == codes.Unimplemented { - logger.Warn("Falling back to CRI v1alpha2 runtime API (deprecated)") r.runtimeClientV1alpha2 = runtimeapiV1alpha2.NewRuntimeServiceClient(conn) } else { return fmt.Errorf("unable to determine runtime API version: %w", err) @@ -687,9 +627,6 @@ func parseEndpointWithFallbackProtocol(endpoint string, fallbackProtocol string) if protocol, addr, err = parseEndpoint(endpoint); err != nil && protocol == "" { fallbackEndpoint := fallbackProtocol + "://" + endpoint protocol, addr, err = parseEndpoint(fallbackEndpoint) - if err == nil { - logger.Info("Using this endpoint is deprecated, please consider using full URL format", "endpoint", endpoint, "URL", fallbackEndpoint) - } } return } diff --git a/pkg/exporter/nettop/docker.go b/pkg/exporter/nettop/docker.go index 1bc87329..31562c5b 100644 --- a/pkg/exporter/nettop/docker.go +++ b/pkg/exporter/nettop/docker.go @@ -7,6 +7,8 @@ import ( io "io" "net" "net/http" + + log "github.com/sirupsen/logrus" ) var ( @@ -39,22 +41,22 @@ func getPidForContainerBySock(id string) (int, error) { url := fmt.Sprintf("http://localhost/containers/%s/json", id) response, err := dockerhttpc.Get(url) if err != nil { - logger.Warn("get response with %s", err.Error()) + log.Errorf("failed get docker response, err: %v", err) return 0, 
err } b, err := io.ReadAll(response.Body) if err != nil { - logger.Warn("get response with %s", err.Error()) + log.Errorf("failed get docker response, err: %v", err) return 0, err } sd := &slimDocker{} err = json.Unmarshal(b, &sd) if err != nil { - logger.Warn("get response", "err", err.Error()) + log.Errorf("failed get docker response, err: %v", err) return 0, err } - logger.Info("finish get pid", "sandbox", id, "pid", sd.State.Pid) + log.Infof("finish get pid, sandbox: %s, pid: %d", id, sd.State.Pid) return sd.State.Pid, nil } diff --git a/pkg/exporter/nettop/nodemeta.go b/pkg/exporter/nettop/nodemeta.go index b84627cd..3dcca9d8 100644 --- a/pkg/exporter/nettop/nodemeta.go +++ b/pkg/exporter/nettop/nodemeta.go @@ -2,20 +2,18 @@ package nettop import ( fmt "fmt" - "io" "os" "strings" "unsafe" - "golang.org/x/exp/slog" + log "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) //go:generate protoc --go_out=. ./libnettop.proto var ( - logger *slog.Logger - top = metacache{} runtimeEndpoints = []string{"/var/run/dockershim.sock", "/run/containerd/containerd.sock", "/run/k3s/containerd/containerd.sock"} @@ -27,11 +25,10 @@ type metacache struct { } func Init(sidecar bool) { - logger = slog.New(slog.NewJSONHandler(io.Discard)) top.NodeName = getNodeName() kr, err := getKernelRelease() if err != nil { - logger.Warn("failed to get node kernel info %s", err.Error()) + log.Errorf("failed to get node kernel info %v", err) } else { top.Kernel = kr } @@ -40,7 +37,7 @@ func Init(sidecar bool) { c := &CriMeta{} err = c.Update() if err != nil { - logger.Warn("update cri meta failed %s", err.Error()) + log.Errorf("update cri meta failed %v", err) } top.Crimeta = c diff --git a/pkg/exporter/nettop/pod.go b/pkg/exporter/nettop/pod.go index 45044936..39553b6a 100644 --- a/pkg/exporter/nettop/pod.go +++ b/pkg/exporter/nettop/pod.go @@ -2,7 +2,8 @@ package nettop import ( "encoding/json" - "errors" + + log "github.com/sirupsen/logrus" internalapi "k8s.io/cri-api/pkg/apis" 
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -28,7 +29,7 @@ type InfoSpec struct { func getPodMetas(client internalapi.RuntimeService) (map[string]podMeta, error) { if client == nil { - return nil, errors.New("not in cloudnative environment") + return nil, nil } // only list live pods filter := runtimeapi.PodSandboxFilter{ @@ -44,7 +45,7 @@ func getPodMetas(client internalapi.RuntimeService) (map[string]podMeta, error) for _, sandbox := range listresponse { status, err := client.PodSandboxStatus(sandbox.GetId(), true) if err != nil { - logger.Debug("get pod sandbox %s status failed with %s", sandbox.GetId(), err) + log.Debugf("get pod sandbox %s status failed with %s", sandbox.GetId(), err) continue } pm := podMeta{ @@ -65,7 +66,7 @@ func getPodMetas(client internalapi.RuntimeService) (map[string]podMeta, error) infospec := InfoSpec{} err := json.Unmarshal([]byte(info), &infospec) if err != nil { - logger.Warn("parse info spec %s failed with %s", pm.name, err) + log.Warnf("parse info spec %s failed with %v", pm.name, err) continue } pm.pid = infospec.Pid diff --git a/pkg/exporter/probe/event.go b/pkg/exporter/probe/event.go index 9ad026bf..2265f203 100644 --- a/pkg/exporter/probe/event.go +++ b/pkg/exporter/probe/event.go @@ -1,50 +1,57 @@ package probe import ( - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracebiolatency" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracekernel" - tracenetif "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetiftxlatency" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetsoftirq" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracepacketloss" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracesocketlatency" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracetcpreset" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracevirtcmdlat" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "fmt" + + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + "golang.org/x/exp/slog" ) var ( - 
availeprobes map[string]proto.EventProbe + availableEventProbe = make(map[string]EventProbeCreator) ) -func init() { +type EventProbeCreator func(sink chan<- *Event, args map[string]interface{}) (EventProbe, error) + +func MustRegisterEventProbe(name string, creator EventProbeCreator) { + if _, ok := availableEventProbe[name]; ok { + panic(fmt.Errorf("duplicated event probe %s", name)) + } - availeprobes = map[string]proto.EventProbe{} + availableEventProbe[name] = creator +} - availeprobes["netiftxlatency"] = tracenetif.GetProbe() - availeprobes["biolatency"] = tracebiolatency.GetProbe() - availeprobes["net_softirq"] = tracenetsoftirq.GetProbe() - availeprobes["tcpreset"] = tracetcpreset.GetProbe() - availeprobes["kernellatency"] = tracekernel.GetProbe() - availeprobes["packetloss"] = tracepacketloss.GetProbe() - availeprobes["socketlatency"] = tracesocketlatency.GetProbe() - availeprobes["virtcmdlatency"] = tracevirtcmdlat.GetProbe() +func NewEventProbe(name string, simpleProbe SimpleProbe) EventProbe { + return NewProbe(name, simpleProbe) } -func GetEventProbe(subject string) proto.EventProbe { - if p, ok := availeprobes[subject]; ok { - return p +func CreateEventProbe(name string, sink chan<- *Event, _ interface{}) (EventProbe, error) { + creator, ok := availableEventProbe[name] + if !ok { + return nil, fmt.Errorf("undefined probe %s", name) } - return nil + //TODO reflect creator's arguments + return creator(sink, nil) } -func ListEvents() map[string][]string { - em := make(map[string][]string) - - for p, ep := range availeprobes { - enames := ep.GetEventNames() - em[p] = append(em[p], enames...) 
+func ListEventProbes() []string { + var ret []string + for key := range availableEventProbe { + ret = append(ret, key) } + return ret +} - return em +func EventMetaByNetNS(netns int) []Label { + et, err := nettop.GetEntityByNetns(netns) + if err != nil { + slog.Info("nettop get entity", "err", err, "netns", netns) + return nil + } + return []Label{ + {Name: "pod", Value: et.GetPodName()}, + {Name: "namespace", Value: et.GetPodNamespace()}, + {Name: "node", Value: nettop.GetNodeName()}, + } } diff --git a/pkg/exporter/probe/flow/flow.go b/pkg/exporter/probe/flow/flow.go index e8d7b226..fc21aaa9 100644 --- a/pkg/exporter/probe/flow/flow.go +++ b/pkg/exporter/probe/flow/flow.go @@ -2,129 +2,153 @@ package flow import ( "context" + "encoding/binary" "fmt" "strings" + "github.com/prometheus/client_golang/prometheus" + + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" "github.com/cilium/ebpf" "github.com/cilium/ebpf/rlimit" "github.com/pkg/errors" + log "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" - log "golang.org/x/exp/slog" "golang.org/x/sys/unix" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS bpf ../../../../bpf/flow.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 -type Probe struct { - enable bool -} - -type Direction int +type direction int const ( - ModuleName = "flow" - Ingress Direction = 0 - Egress Direction = 1 + ingress direction = 0 + egress direction = 1 + + metricsBytes = "bytes" + metricsPackets = "packets" ) var ( - probe proto.MetricProbe = &Probe{} - dev = "eth0" //TODO 通过参数指定设备名称 - bpfObjs = bpfObjects{} + dev = "eth0" + probeName = "flow" ) -func GetProbe() proto.MetricProbe { - return probe - +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) } -func (f *Probe) Start(_ context.Context, _ proto.ProbeType) { - log.Info("flow probe starting...") - eth0, err 
:= netlink.LinkByName(dev) - if err != nil { - log.Error("fail get link eth0", err) - return +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + opts := probe.BatchMetricsOpts{ + Namespace: probe.MetricsNamespace, + Subsystem: probeName, + VariableLabels: []string{"protocol", "src", "dst", "sport", "dport"}, + SingleMetricsOpts: []probe.SingleMetricsOpts{ + {Name: metricsBytes, ValueType: prometheus.CounterValue}, + {Name: metricsPackets, ValueType: prometheus.CounterValue}, + }, } + batchMetrics := probe.NewBatchMetrics(opts, p.collectOnce) + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil +} + +type metricsProbe struct { + bpfObjs bpfObjects +} - if err := load(); err != nil { +func (p *metricsProbe) Start(_ context.Context) error { + //TODO watch every netns create/destroy + if err := p.loadAndAttachBPF(); err != nil { var verifierError *ebpf.VerifierError log.Error("failed load ebpf program", err) if errors.As(err, &verifierError) { log.Warn("detail", strings.Join(verifierError.Log, "\n")) } - return - } - - if err := setupTCFilter(eth0); err != nil { - log.Error("failed replace eth0 qdisc with clsact", err) - return - } - - log.Info("finish setup flow ebpf") - - //below is just for testing - //toip := func(addr uint32) string { - // var bytes [4]byte - // bytes[0] = byte(addr & 0xff) - // bytes[1] = byte(addr >> 8 & 0xff) - // bytes[2] = byte(addr >> 16 & 0xff) - // bytes[3] = byte(addr >> 24 & 0xff) - // return fmt.Sprintf("%d.%d.%d.%d", bytes[0], bytes[1], bytes[2], bytes[3]) - //} - //htons := func(port uint16) uint16 { - // data := make([]byte, 2) - // binary.BigEndian.PutUint16(data, port) - // return binary.LittleEndian.Uint16(data) - //} - //go func() { - // for { - // var values []bpfFlowMetrics - // var key bpfFlowTuple4 - // iterator := bpfObjs.bpfMaps.InspFlow4Metrics.Iterate() - // for { - // if !iterator.Next(&key, &values) { - // break - // } - // - // if err := iterator.Err(); 
err != nil { - // log.Error("failed read map", err) - // break - // } - // - // var val bpfFlowMetrics - // for i := 0; i < len(values); i++ { - // val.Bytes += values[i].Bytes - // val.Packets += values[i].Packets - // } - // - // fmt.Printf("proto: %d %s:%d->%s:%d pkts: %d, bytes: %d\n", key.Proto, toip(key.Src), htons(key.Sport), toip(key.Dst), htons(key.Dport), val.Packets, val.Bytes) - // } - // } - //}() - - f.enable = true + + return err + } + + return nil +} + +func (p *metricsProbe) Stop(_ context.Context) error { + return p.cleanup() +} + +func (p *metricsProbe) cleanup() error { + //TODO only clean qdisc after replace qdisc successfully + link, err := netlink.LinkByName(dev) + if err == nil { + _ = cleanQdisc(link) + } + return p.bpfObjs.Close() +} + +func toIPString(addr uint32) string { + var bytes [4]byte + bytes[0] = byte(addr & 0xff) + bytes[1] = byte(addr >> 8 & 0xff) + bytes[2] = byte(addr >> 16 & 0xff) + bytes[3] = byte(addr >> 24 & 0xff) + return fmt.Sprintf("%d.%d.%d.%d", bytes[0], bytes[1], bytes[2], bytes[3]) +} + +func (p *metricsProbe) collectOnce(emit probe.Emit) error { + htons := func(port uint16) uint16 { + data := make([]byte, 2) + binary.BigEndian.PutUint16(data, port) + return binary.LittleEndian.Uint16(data) + } + var values []bpfFlowMetrics + var key bpfFlowTuple4 + iterator := p.bpfObjs.bpfMaps.InspFlow4Metrics.Iterate() + + for iterator.Next(&key, &values) { + if err := iterator.Err(); err != nil { + return fmt.Errorf("failed read bpfmap, err: %w", err) + } + + var val bpfFlowMetrics + for i := 0; i < len(values); i++ { + val.Bytes += values[i].Bytes + val.Packets += values[i].Packets + } + emit("bytes", []string{}, float64(val.Bytes)) + emit("packets", []string{}, float64(val.Packets)) + + fmt.Printf("proto: %d %s:%d->%s:%d pkts: %d, bytes: %d\n", + key.Proto, + toIPString(key.Src), + htons(key.Sport), + toIPString(key.Dst), + htons(key.Dport), + val.Packets, + val.Bytes) + } + return nil } -func setupTCFilter(link netlink.Link) 
error { +func (p *metricsProbe) setupTCFilter(link netlink.Link) error { if err := replaceQdisc(link); err != nil { return errors.Wrapf(err, "failed replace qdics clsact for dev %s", link.Attrs().Name) } - replaceFilter := func(direction Direction) error { + replaceFilter := func(direction direction) error { directionName := "" var filterParent uint32 var prog *ebpf.Program switch direction { - case Ingress: + case ingress: directionName = "ingress" filterParent = netlink.HANDLE_MIN_INGRESS - prog = bpfObjs.bpfPrograms.TcIngress - case Egress: + prog = p.bpfObjs.bpfPrograms.TcIngress + case egress: directionName = "egress" filterParent = netlink.HANDLE_MIN_EGRESS - prog = bpfObjs.bpfPrograms.TcEgress + prog = p.bpfObjs.bpfPrograms.TcEgress default: return fmt.Errorf("invalid direction value: %d", direction) } @@ -148,16 +172,16 @@ func setupTCFilter(link netlink.Link) error { return nil } - if err := replaceFilter(Ingress); err != nil { + if err := replaceFilter(ingress); err != nil { return errors.Wrapf(err, "cannot set ingress filter for dev %s", link.Attrs().Name) } - if err := replaceFilter(Egress); err != nil { + if err := replaceFilter(egress); err != nil { return errors.Wrapf(err, "cannot set egress filter for dev %s", link.Attrs().Name) } return nil } -func load() error { +func (p *metricsProbe) loadBPF() error { if err := rlimit.RemoveMemlock(); err != nil { return fmt.Errorf("remove limit failed: %s", err.Error()) } @@ -176,46 +200,45 @@ func load() error { } // Load pre-compiled programs and maps into the kernel. 
- if err := loadBpfObjects(&bpfObjs, &opts); err != nil { + if err := loadBpfObjects(&p.bpfObjs, &opts); err != nil { return fmt.Errorf("failed loading objects: %w", err) } return nil } -func replaceQdisc(link netlink.Link) error { - attrs := netlink.QdiscAttrs{ - LinkIndex: link.Attrs().Index, - Handle: netlink.MakeHandle(0xffff, 0), - Parent: netlink.HANDLE_CLSACT, +func (p *metricsProbe) loadAndAttachBPF() error { + eth0, err := netlink.LinkByName(dev) + if err != nil { + return fmt.Errorf("fail get link %s, err: %w", dev, err) } - qdisc := &netlink.GenericQdisc{ - QdiscAttrs: attrs, - QdiscType: "clsact", + if err := p.loadBPF(); err != nil { + return err } - return netlink.QdiscReplace(qdisc) -} - -func (f *Probe) Close(_ proto.ProbeType) error { - if f.enable { - return bpfObjs.Close() + if err := p.setupTCFilter(eth0); err != nil { + return fmt.Errorf("failed replace %s qdisc with clsact, err: %v", dev, err) } return nil } -func (f *Probe) Ready() bool { - return f.enable +func cleanQdisc(link netlink.Link) error { + return netlink.QdiscDel(clsact(link)) } -func (f *Probe) Name() string { - return ModuleName -} +func clsact(link netlink.Link) netlink.Qdisc { + attrs := netlink.QdiscAttrs{ + LinkIndex: link.Attrs().Index, + Handle: netlink.MakeHandle(0xffff, 0), + Parent: netlink.HANDLE_CLSACT, + } -func (f *Probe) GetMetricNames() []string { - return []string{"net_flow"} + return &netlink.GenericQdisc{ + QdiscAttrs: attrs, + QdiscType: "clsact", + } } -func (f *Probe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { - return map[string]map[uint32]uint64{}, nil +func replaceQdisc(link netlink.Link) error { + return netlink.QdiscReplace(clsact(link)) } diff --git a/pkg/exporter/probe/legacy.go b/pkg/exporter/probe/legacy.go new file mode 100644 index 00000000..f14fa626 --- /dev/null +++ b/pkg/exporter/probe/legacy.go @@ -0,0 +1,118 @@ +package probe + +import ( + "fmt" + + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + 
"github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" +) + +var legacyMetricsLabels = []string{"target_node", "target_namespace", "target_pod", "node", "namespace", "pod"} +var newMetricsLabels = []string{"k8s_node", "k8s_namespace", "k8s_pod"} + +type legacyBatchMetrics struct { + module string + collector LegacyCollector + descs map[string]*prometheus.Desc + underscore bool +} + +func legacyMetricsName(module string, name string, underscore bool) string { + if underscore { + return fmt.Sprintf("%s_pod_%s_%s", LegacyMetricsNamespace, module, name) + } + return fmt.Sprintf("%s_pod_%s%s", LegacyMetricsNamespace, module, name) +} +func newMetricsName(module, name string) string { + return prometheus.BuildFQName(MetricsNamespace, module, name) +} + +type LegacyCollector func() (map[string]map[uint32]uint64, error) + +func NewLegacyBatchMetrics(module string, metrics []string, collector LegacyCollector) prometheus.Collector { + return newLegacyBatchMetrics(module, false, metrics, collector) +} + +func newLegacyBatchMetrics(module string, underscore bool, metrics []string, collector LegacyCollector) prometheus.Collector { + descs := make(map[string]*prometheus.Desc) + for _, m := range metrics { + legacyName := legacyMetricsName(module, m, underscore) + newName := newMetricsName(module, m) + descs[legacyName] = prometheus.NewDesc(legacyName, "", legacyMetricsLabels, nil) + descs[newName] = prometheus.NewDesc(newName, "", newMetricsLabels, nil) + } + return &legacyBatchMetrics{ + module: module, + collector: collector, + descs: descs, + underscore: underscore, + } +} + +func NewLegacyBatchMetricsWithUnderscore(module string, metrics []string, collector LegacyCollector) prometheus.Collector { + return newLegacyBatchMetrics(module, true, metrics, collector) +} + +func (l *legacyBatchMetrics) Describe(descs chan<- *prometheus.Desc) { + for _, desc := range l.descs { + descs <- desc + } +} + +func (l *legacyBatchMetrics) Collect(metrics chan<- 
prometheus.Metric) { + log.Debugf("collect data from %s", l.module) + data, err := l.collector() + if err != nil { + log.Errorf("%s failed collect data, err: %v", l.module, err) + return + } + + emit := func(name string, labelValues []string, value float64) { + desc, ok := l.descs[name] + if !ok { + return + } + + metrics <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(value), labelValues...) + } + + for key, namespaceData := range data { + for nsinum, value := range namespaceData { + et, err := nettop.GetEntityByNetns(int(nsinum)) + if err != nil || et == nil { + continue + } + labelValues := []string{nettop.GetNodeName(), et.GetPodNamespace(), et.GetPodName()} + // for legacy pod labels + emit(newMetricsName(l.module, key), labelValues, float64(value)) + + labelValues = append(labelValues, labelValues...) + emit(legacyMetricsName(l.module, key, l.underscore), labelValues, float64(value)) + } + } +} + +func LagacyEventLabels(netns uint32) []Label { + et, err := nettop.GetEntityByNetns(int(netns)) + if err != nil { + log.Infof("nettop get entity failed, netns: %d, err: %v", netns, err) + return nil + } + return []Label{ + {Name: "pod", Value: et.GetPodName()}, + {Name: "namespace", Value: et.GetPodNamespace()}, + {Name: "node", Value: nettop.GetNodeName()}, + } +} + +func CopyLegacyMetricsMap(m map[string]map[uint32]uint64) map[string]map[uint32]uint64 { + ret := make(map[string]map[uint32]uint64) + for key, nsMap := range m { + ret[key] = make(map[uint32]uint64) + for ns, data := range nsMap { + ret[key][ns] = data + } + } + return ret +} diff --git a/pkg/exporter/probe/metric.go b/pkg/exporter/probe/metric.go index 1739c26b..89857fa2 100644 --- a/pkg/exporter/probe/metric.go +++ b/pkg/exporter/probe/metric.go @@ -1,96 +1,175 @@ package probe import ( - "context" + "errors" "fmt" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/flow" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/nlconntrack" - 
"github.com/alibaba/kubeskoop/pkg/exporter/probe/nlqdisc" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procfd" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procio" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procipvs" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procnetdev" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procnetstat" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsnmp" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsock" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/procsoftnet" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/proctcpsummary" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracekernel" - tracenetif "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetiftxlatency" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracenetsoftirq" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracepacketloss" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracesocketlatency" - "github.com/alibaba/kubeskoop/pkg/exporter/probe/tracevirtcmdlat" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + log "github.com/sirupsen/logrus" + + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/exp/maps" ) +const LegacyMetricsNamespace = "inspector" +const MetricsNamespace = "kubeskoop" + var ( - availmprobes map[string]proto.MetricProbe + availableMetricsProbes = make(map[string]MetricsProbeCreator) + ErrUndeclaredMetrics = errors.New("undeclared metrics") ) -func init() { - availmprobes = map[string]proto.MetricProbe{} - - availmprobes["tcp"] = procsnmp.GetProbe() - availmprobes["udp"] = procsnmp.GetProbe() - availmprobes["ip"] = procsnmp.GetProbe() - availmprobes["netdev"] = procnetdev.GetProbe() - availmprobes["softnet"] = procsoftnet.GetProbe() - availmprobes["sock"] = procsock.GetProbe() - availmprobes["io"] = procio.GetProbe() - availmprobes["tcpext"] = procnetstat.GetProbe() - availmprobes["socketlatency"] = tracesocketlatency.GetProbe() - availmprobes["packetloss"] = 
tracepacketloss.GetProbe() - availmprobes["net_softirq"] = tracenetsoftirq.GetProbe() - availmprobes["netiftxlatency"] = tracenetif.GetProbe() - availmprobes["kernellatency"] = tracekernel.GetProbe() - availmprobes["tcpsummary"] = proctcpsummary.GetProbe() - availmprobes["virtcmdlatency"] = tracevirtcmdlat.GetProbe() - availmprobes["conntrack"] = nlconntrack.GetProbe() - availmprobes["ipvs"] = procipvs.GetProbe() - availmprobes["qdisc"] = nlqdisc.GetProbe() - availmprobes["fd"] = procfd.GetProbe() - availmprobes["flow"] = flow.GetProbe() +type MetricsProbeCreator func(args map[string]interface{}) (MetricsProbe, error) + +func MustRegisterMetricsProbe(name string, creator MetricsProbeCreator) { + if _, ok := availableMetricsProbes[name]; ok { + panic(fmt.Errorf("duplicated event probe %s", name)) + } + + availableMetricsProbes[name] = creator } -func ListMetricProbes() (probelist []string) { - for k := range availmprobes { - probelist = append(probelist, k) +func CreateMetricsProbe(name string, _ interface{}) (MetricsProbe, error) { + creator, ok := availableMetricsProbes[name] + if !ok { + return nil, fmt.Errorf("undefined probe %s", name) } - return + + //TODO reflect creator's arguments + return creator(nil) +} + +func ListMetricsProbes() []string { + var ret []string + for key := range availableMetricsProbes { + ret = append(ret, key) + } + return ret +} + +type Emit func(name string, labels []string, val float64) + +type Collector func(emit Emit) error + +type SingleMetricsOpts struct { + Name string + Help string + ConstLabels map[string]string + VariableLabels []string + ValueType prometheus.ValueType } -func ListMetrics() map[string][]string { - mm := make(map[string][]string) - for p, pb := range availmprobes { - if pb != nil { - // for multi metric of one probe,filter with prefix - mnames := pb.GetMetricNames() - mm[p] = append(mm[p], mnames...) 
+type BatchMetricsOpts struct { + Namespace string + Subsystem string + ConstLabels map[string]string + VariableLabels []string + SingleMetricsOpts []SingleMetricsOpts +} + +type metricsInfo struct { + desc *prometheus.Desc + valueType prometheus.ValueType +} + +type BatchMetrics struct { + name string + infoMap map[string]*metricsInfo + ProbeCollector Collector +} + +func NewBatchMetrics(opts BatchMetricsOpts, probeCollector Collector) *BatchMetrics { + m := make(map[string]*metricsInfo) + for _, metrics := range opts.SingleMetricsOpts { + constLabels, variableLables := mergeLabels(opts, metrics) + desc := prometheus.NewDesc( + prometheus.BuildFQName(opts.Namespace, opts.Subsystem, metrics.Name), + metrics.Help, + variableLables, + constLabels, + ) + + m[metrics.Name] = &metricsInfo{ + desc: desc, + valueType: metrics.ValueType, } } - return mm + + return &BatchMetrics{ + name: fmt.Sprintf("%s_%s", opts.Namespace, opts.Subsystem), + infoMap: m, + ProbeCollector: probeCollector, + } +} + +func (b *BatchMetrics) Describe(descs chan<- *prometheus.Desc) { + for _, info := range b.infoMap { + descs <- info.desc + } } -func GetProbe(subject string) proto.MetricProbe { - if p, ok := availmprobes[subject]; ok { - return p +func (b *BatchMetrics) Collect(metrics chan<- prometheus.Metric) { + emit := func(name string, labels []string, val float64) { + info, ok := b.infoMap[name] + if !ok { + log.Errorf("%s undeclared metrics %s", b.name, name) + return + } + metrics <- prometheus.MustNewConstMetric(info.desc, info.valueType, val, labels...) 
} - return nil + err := b.ProbeCollector(emit) + if err != nil { + log.Errorf("%s error collect, err: %v", b.name, err) + return + } } -// CollectOnce collect from probe directly for test -func CollectOnce(ctx context.Context, subject string) (map[string]map[uint32]uint64, error) { - return collectOnce(ctx, subject) +func mergeLabels(opts BatchMetricsOpts, metrics SingleMetricsOpts) (map[string]string, []string) { + constLabels := mergeMap(opts.ConstLabels, metrics.ConstLabels) + variableLabels := mergeArray(opts.VariableLabels, metrics.VariableLabels) + + return constLabels, variableLabels } -func collectOnce(ctx context.Context, subject string) (map[string]map[uint32]uint64, error) { - probe, ok := availmprobes[subject] - if !ok { - return nil, fmt.Errorf("no probe found of %s", subject) +func mergeArray(labels []string, labels2 []string) []string { + m := make(map[string]bool) + for _, s := range labels { + m[s] = true + } + + for _, s := range labels2 { + if _, ok := m[s]; ok { + //to avoid duplicated label + panic(fmt.Sprintf("metric label %s already declared in BatchMetricsOpts", s)) + } } - return probe.Collect(ctx) + var ret []string + for k := range m { + ret = append(ret, k) + } + + return ret +} + +// if a key exists in both maps, value in labels2 will be keep +func mergeMap(labels map[string]string, labels2 map[string]string) map[string]string { + ret := make(map[string]string) + maps.Copy(ret, labels) + maps.Copy(ret, labels2) + return ret +} + +type combinedMetricsProbe struct { + Probe + prometheus.Collector } + +func NewMetricsProbe(name string, simpleProbe SimpleProbe, collector prometheus.Collector) MetricsProbe { + return &combinedMetricsProbe{ + Probe: NewProbe(name, simpleProbe), + Collector: collector, + } +} + +var _ prometheus.Collector = &BatchMetrics{} diff --git a/pkg/exporter/probe/nlconntrack/conntrackevents.go b/pkg/exporter/probe/nlconntrack/conntrackevents.go index 6be421e0..183d5ea2 100644 --- 
a/pkg/exporter/probe/nlconntrack/conntrackevents.go +++ b/pkg/exporter/probe/nlconntrack/conntrackevents.go @@ -3,16 +3,14 @@ package nlconntrack import ( "context" "fmt" - "net" "strconv" - "sync" "time" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "github.com/mdlayher/netlink" "github.com/ti-mo/conntrack" "github.com/ti-mo/netfilter" @@ -29,184 +27,115 @@ const ( ) var ( - ModuleName = "insp_conntrack" - - events = []string{"ConntrackNew", "ConntrackUpdate", "ConntrackDestroy", "ConntrackExpNew", "ConntrackExpDestroy", "ConntrackUnknow"} - probe = &Probe{mtx: sync.Mutex{}, conns: make(map[int]chan struct{})} + probeName = "conntrack" ) -type Probe struct { - enable bool - sub chan<- proto.RawEvent - mtx sync.Mutex - conns map[int]chan struct{} - initConn *conntrack.Conn -} - -func GetProbe() *Probe { - return probe -} - -func (p *Probe) Name() string { - return ModuleName -} - -func (p *Probe) Ready() bool { - if _, err := p.getConn(); err != nil { - return false +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &conntrackEventProbe{ + sink: sink, } - return true + return probe.NewEventProbe(probeName, p), nil } -func (p *Probe) Close(_ proto.ProbeType) error { - p.mtx.Lock() - defer p.mtx.Unlock() - - p.enable = false - - return nil +type conntrackEventProbe struct { + sink chan<- *probe.Event + conns map[int]chan struct{} + done chan struct{} } -func (p *Probe) GetEventNames() []string { - return events -} - -func (p *Probe) Start(ctx context.Context, _ proto.ProbeType) { - p.mtx.Lock() - p.enable = true - p.mtx.Unlock() - - ticker := time.NewTicker(10 * time.Second) - defer p.release() - for range ticker.C { - if !p.enable { - return - } - slog.Ctx(ctx).Info("start update netns list", "module", ModuleName) - ets := nettop.GetAllEntity() 
- for _, et := range ets { - if et == nil { - slog.Ctx(ctx).Info("skip empty entity", "module", ModuleName) - continue - } - nsfd, err := et.GetNetNsFd() - if err != nil { - slog.Ctx(ctx).Info("skip netns fd", "err", err, "module", ModuleName) - continue - } - if nsfd == 0 { - slog.Ctx(ctx).Info("skip empty netns fd", "module", ModuleName) - continue - } - if _, ok := p.conns[et.GetNetns()]; !ok { - ctrch := make(chan struct{}) - go func() { - err := p.startCtListen(ctx, ctrch, nsfd, et.GetNetns()) - if err != nil { - slog.Ctx(ctx).Warn("start worker", "err", err, "netns", et.GetNetns(), "nsfd", nsfd, "module", ModuleName) - return - } - }() - p.conns[et.GetNetns()] = ctrch - slog.Ctx(ctx).Info("start worker finished", "netns", et.GetNetns(), "nsfd", nsfd, "module", ModuleName) +func (p *conntrackEventProbe) Start(ctx context.Context) error { + go func() { + ticker := time.NewTicker(10 * time.Second) + select { + case <-ticker.C: + slog.Ctx(ctx).Info("start update netns list", "module", probeName) + ets := nettop.GetAllEntity() + for _, et := range ets { + if et == nil { + slog.Ctx(ctx).Info("skip empty entity", "module", probeName) + continue + } + nsfd, err := et.GetNetNsFd() + if err != nil { + slog.Ctx(ctx).Info("skip netns fd", "err", err, "module", probeName) + continue + } + if nsfd == 0 { + slog.Ctx(ctx).Info("skip empty netns fd", "module", probeName) + continue + } + if _, ok := p.conns[et.GetNetns()]; !ok { + ctrch := make(chan struct{}) + go func() { + err := p.startCtListen(ctx, ctrch, nsfd, et.GetNetns()) + if err != nil { + slog.Ctx(ctx).Warn("start worker", "err", err, "netns", et.GetNetns(), "nsfd", nsfd, "module", probeName) + return + } + }() + p.conns[et.GetNetns()] = ctrch + slog.Ctx(ctx).Info("start worker finished", "netns", et.GetNetns(), "nsfd", nsfd, "module", probeName) + } } + case <-p.done: + return } - } + }() + return nil } -func (p *Probe) release() { - p.mtx.Lock() - defer p.mtx.Unlock() - for _, ctrch := range p.conns { - 
close(ctrch) +func (p *conntrackEventProbe) Stop(_ context.Context) error { + close(p.done) + for _, conn := range p.conns { + close(conn) } - p.conns = make(map[int]chan struct{}) + return nil } -func (p *Probe) startCtListen(ctx context.Context, ctrch <-chan struct{}, nsfd int, nsinum int) error { +func (p *conntrackEventProbe) startCtListen(ctx context.Context, ctrch <-chan struct{}, nsfd int, nsinum int) error { c, err := conntrack.Dial(&netlink.Config{ NetNS: nsfd, }) if err != nil { - slog.Ctx(ctx).Info("start conntrack dial", "err", err, "module", ModuleName) + slog.Ctx(ctx).Info("start conntrack dial", "err", err, "module", probeName) return err } - slog.Ctx(ctx).Info("start conntrack listen", "netns", nsfd, "module", ModuleName) + slog.Ctx(ctx).Info("start conntrack listen", "netns", nsfd, "module", probeName) evCh := make(chan conntrack.Event, 1024) errCh, err := c.Listen(evCh, 4, append(netfilter.GroupsCT, netfilter.GroupsCTExp...)) if err != nil { - slog.Ctx(ctx).Info("start conntrack listen", "err", err, "module", ModuleName) + slog.Ctx(ctx).Info("start conntrack listen", "err", err, "module", probeName) return err } for { select { case <-ctrch: - slog.Ctx(ctx).Info("conntrack event listen stop", "module", ModuleName) + slog.Ctx(ctx).Info("conntrack event listen stop", "module", probeName) return nil case err = <-errCh: - slog.Ctx(ctx).Info("conntrack event listen stop", "err", err, "module", ModuleName) + slog.Ctx(ctx).Info("conntrack event listen stop", "err", err, "module", probeName) return err case event := <-evCh: - if p.sub != nil { - p.sub <- vanishEvent(event, nsinum) - slog.Ctx(ctx).Info("conntrack event listen", "event", event.String(), "module", ModuleName) - } + p.sink <- vanishEvent(event, nsinum) + slog.Ctx(ctx).Info("conntrack event listen", "event", event.String(), "module", probeName) } } } -// func getEventCh(ctx context.Context, nsinum int) (evCh chan conntrack.Event, errCh chan error, err error) { -// c, err := 
conntrack.Dial(&netlink.Config{ -// NetNS: nsinum, -// }) - -// if err != nil { -// slog.Ctx(ctx).Info("start conntrack dial", "err", err, "module", ModuleName) -// return -// } - -// slog.Ctx(ctx).Info("start conntrack listen", "netns", nsinum, "module", ModuleName) -// evCh = make(chan conntrack.Event, 1024) -// errCh, err = c.Listen(evCh, 4, append(netfilter.GroupsCT, netfilter.GroupsCTExp...)) -// if err != nil { -// slog.Ctx(ctx).Info("start conntrack listen", "err", err, "module", ModuleName) -// return -// } - -// return -// } - -// Register register sub chan to get perf events -func (p *Probe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver - - return nil +var eventTypeMapping = map[uint8]probe.EventType{ + uint8(conntrack.EventNew): ConntrackNew, + uint8(conntrack.EventUpdate): ConntrackUpdate, + uint8(conntrack.EventDestroy): ConntrackDestroy, + uint8(conntrack.EventExpNew): ConntrackExpNew, + uint8(conntrack.EventExpDestroy): ConntrackExpDestroy, + uint8(conntrack.EventUnknown): ConntrackUnknow, } -func vanishEvent(evt conntrack.Event, nsinum int) proto.RawEvent { - raw := proto.RawEvent{ - Netns: uint32(nsinum), - } - switch evt.Type { - case conntrack.EventNew: - raw.EventType = ConntrackNew - case conntrack.EventUpdate: - raw.EventType = ConntrackUpdate - case conntrack.EventDestroy: - raw.EventType = ConntrackDestroy - case conntrack.EventExpNew: - raw.EventType = ConntrackExpNew - case conntrack.EventExpDestroy: - raw.EventType = ConntrackExpDestroy - default: - raw.EventType = ConntrackUnknow - } +func vanishEvent(evt conntrack.Event, nsinum int) *probe.Event { rawStr := fmt.Sprintf("Proto = %s Replied = %t ", bpfutil.GetProtoStr(evt.Flow.TupleOrig.Proto.Protocol), evt.Flow.Status.SeenReply()) if evt.Flow.TupleOrig.Proto.Protocol == 6 && evt.Flow.ProtoInfo.TCP != nil { @@ -214,6 +143,15 @@ func vanishEvent(evt conntrack.Event, nsinum int) proto.RawEvent { } rawStr += fmt.Sprintf("Src = %s, 
Dst = %s", net.JoinHostPort(evt.Flow.TupleOrig.IP.SourceAddress.String(), strconv.Itoa(int(evt.Flow.TupleOrig.Proto.SourcePort))), net.JoinHostPort(evt.Flow.TupleOrig.IP.DestinationAddress.String(), strconv.Itoa(int(evt.Flow.TupleOrig.Proto.DestinationPort)))) - raw.EventBody = rawStr - return raw + + return &probe.Event{ + Timestamp: time.Now().UnixNano(), + Type: eventTypeMapping[uint8(evt.Type)], + Labels: probe.EventMetaByNetNS(nsinum), + Message: rawStr, + } +} + +func init() { + probe.MustRegisterEventProbe(probeName, eventProbeCreator) } diff --git a/pkg/exporter/probe/nlconntrack/conntrackmetrics.go b/pkg/exporter/probe/nlconntrack/conntrackmetrics.go index a6a60d5a..bc61993a 100644 --- a/pkg/exporter/probe/nlconntrack/conntrackmetrics.go +++ b/pkg/exporter/probe/nlconntrack/conntrackmetrics.go @@ -2,11 +2,9 @@ package nlconntrack import ( "context" - "fmt" - "strings" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "github.com/ti-mo/conntrack" ) @@ -14,70 +12,44 @@ var ( MetricPrefix = "conntrack" // stats of conntrack entry operation - Found = "Found" - Invalid = "Invalid" - Ignore = "Ignore" - Insert = "Insert" - InsertFailed = "InsertFailed" - Drop = "Drop" - EarlyDrop = "EarlyDrop" - Error = "Error" - SearchRestart = "SearchRestart" - - Entries = "Entries" - MaxEntries = "MaxEntries" + Found = "found" + Invalid = "invalid" + Ignore = "ignore" + Insert = "insert" + InsertFailed = "insertfailed" + Drop = "drop" + EarlyDrop = "earlydrop" + Error = "error" + SearchRestart = "searchrestart" + + Entries = "entries" + MaxEntries = "maxentries" // stats of conntrack status summary - conntrackMetrics = []string{Found, Invalid, Ignore, Insert, InsertFailed, Drop, EarlyDrop, Error, SearchRestart, Entries, MaxEntries} ) -func (s *Probe) GetMetricNames() []string { - res := []string{} - for _, m := range conntrackMetrics { - res = append(res, metricUniqueID("conntrack", m)) - } - return res -} - -func (s 
*Probe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { - resMap := map[string]map[uint32]uint64{} - stats, err := s.collectStats() - if err != nil { - return resMap, err - } +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &conntrackMetricsProbe{} - for _, metric := range conntrackMetrics { - resMap[metricUniqueID("conntrack", metric)] = map[uint32]uint64{uint32(nettop.InitNetns): stats[metric]} - } + batchMetrics := probe.NewLegacyBatchMetrics(probeName, conntrackMetrics, p.CollectOnce) - return resMap, nil + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *Probe) getConn() (*conntrack.Conn, error) { - if s.initConn == nil { - err := s.initStatConn() - if err != nil { - return nil, err - } - } - return s.initConn, nil +type conntrackMetricsProbe struct { + conn *conntrack.Conn } -func (s *Probe) collectStats() (map[string]uint64, error) { +func (c *conntrackMetricsProbe) collectStats() (map[string]uint64, error) { resMap := map[string]uint64{} - conn, err := s.getConn() - if err != nil { - return resMap, err - } - - stat, err := conn.Stats() + stat, err := c.conn.Stats() if err != nil { return resMap, err } - globalstat, err := conn.StatsGlobal() + globalstat, err := c.conn.StatsGlobal() if err != nil { return resMap, err } @@ -100,16 +72,30 @@ func (s *Probe) collectStats() (map[string]uint64, error) { return resMap, nil } -// initStatConn create a netlink connection in init netns -func (s *Probe) initStatConn() error { - c, err := conntrack.Dial(nil) +func (c *conntrackMetricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + resMap := map[string]map[uint32]uint64{} + stats, err := c.collectStats() if err != nil { - return err + return resMap, err + } + + for _, metric := range conntrackMetrics { + resMap[metric] = map[uint32]uint64{uint32(nettop.InitNetns): stats[metric]} } - s.initConn = c - return nil + + return resMap, nil +} + +func (c *conntrackMetricsProbe) 
Start(_ context.Context) error { + var err error + c.conn, err = conntrack.Dial(nil) + return err +} + +func (c *conntrackMetricsProbe) Stop(_ context.Context) error { + return c.conn.Close() } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) } diff --git a/pkg/exporter/probe/nlqdisc/nlqdiscstats.go b/pkg/exporter/probe/nlqdisc/nlqdiscstats.go index f8337ccc..445a39de 100644 --- a/pkg/exporter/probe/nlqdisc/nlqdiscstats.go +++ b/pkg/exporter/probe/nlqdisc/nlqdiscstats.go @@ -7,16 +7,15 @@ import ( "math" "net" "strings" - "sync" "syscall" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" "github.com/mdlayher/netlink" "github.com/mdlayher/netlink/nlenc" - "golang.org/x/exp/slog" + log "github.com/sirupsen/logrus" ) const ( @@ -44,92 +43,63 @@ const ( ) var ( - ModuleName = "insp_qdisc" + probeName = "qdisc" - Bytes = "Bytes" - Packets = "Packets" - Drops = "Drops" - Qlen = "Qlen" - Backlog = "Backlog" - Overlimits = "Overlimits" + Bytes = "bytes" + Packets = "packets" + Drops = "drops" + Qlen = "qlen" + Backlog = "backlog" + Overlimits = "overlimits" - QdiscMetrics = []string{Bytes, Packets, Drops, Qlen, Backlog, Overlimits} - - probe = &Probe{mtx: sync.Mutex{}} + qdiscMetrics = []string{Bytes, Packets, Drops, Qlen, Backlog, Overlimits} ) -type Probe struct { - enable bool - mtx sync.Mutex +func init() { + probe.MustRegisterMetricsProbe(probeName, qdiscProbeCreator) } -func GetProbe() *Probe { - return probe -} +func qdiscProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &Probe{} -func (p *Probe) Name() string { - return ModuleName -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, qdiscMetrics, p.CollectOnce) -func (p *Probe) Start(_ context.Context, _ proto.ProbeType) { + return 
probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (p *Probe) Ready() bool { - fd, err := nettop.GetHostnetworkNetnsFd() - if err != nil { - slog.Default().Warn("status not ready", "err", err, "module", ModuleName) - return false - } - c, err := getConn(fd) - if err != nil { - slog.Default().Warn("status not ready", "err", err, "module", ModuleName) - return false - } - defer c.Close() +type Probe struct{} - return true -} - -func (p *Probe) GetMetricNames() []string { - res := []string{} - for _, m := range QdiscMetrics { - res = append(res, metricUniqueID("qdisc", m)) - } - return res +func (p *Probe) Start(_ context.Context) error { + return nil } -func (p *Probe) Close(_ proto.ProbeType) error { - p.mtx.Lock() - defer p.mtx.Unlock() - - p.enable = false - +func (p *Probe) Stop(_ context.Context) error { return nil } -func (p *Probe) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (p *Probe) CollectOnce() (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) - for _, metric := range QdiscMetrics { - resMap[metricUniqueID("qdisc", metric)] = make(map[uint32]uint64) + for _, metric := range qdiscMetrics { + resMap[metric] = make(map[uint32]uint64) } ets := nettop.GetAllEntity() for _, et := range ets { - stats, err := getQdiscStats(ctx, et) + stats, err := getQdiscStats(et) if err != nil { - slog.Ctx(ctx).Info("get qdisc stats", "err", err, "module", ModuleName) + log.Errorf("%s failed get qdisc stats: %v", probeName, err) continue } for _, stat := range stats { // only care about eth0/eth1... 
if strings.HasPrefix(stat.IfaceName, "eth") { - resMap[metricUniqueID("qdisc", Bytes)][uint32(et.GetNetns())] += stat.Bytes - resMap[metricUniqueID("qdisc", Packets)][uint32(et.GetNetns())] += uint64(stat.Packets) - resMap[metricUniqueID("qdisc", Drops)][uint32(et.GetNetns())] += uint64(stat.Drops) - resMap[metricUniqueID("qdisc", Qlen)][uint32(et.GetNetns())] += uint64(stat.Qlen) - resMap[metricUniqueID("qdisc", Backlog)][uint32(et.GetNetns())] += uint64(stat.Backlog) - resMap[metricUniqueID("qdisc", Overlimits)][uint32(et.GetNetns())] += uint64(stat.Overlimits) + resMap[Bytes][uint32(et.GetNetns())] += stat.Bytes + resMap[Packets][uint32(et.GetNetns())] += uint64(stat.Packets) + resMap[Drops][uint32(et.GetNetns())] += uint64(stat.Drops) + resMap[Qlen][uint32(et.GetNetns())] += uint64(stat.Qlen) + resMap[Backlog][uint32(et.GetNetns())] += uint64(stat.Backlog) + resMap[Overlimits][uint32(et.GetNetns())] += uint64(stat.Overlimits) } } } @@ -137,7 +107,7 @@ func (p *Probe) Collect(ctx context.Context) (map[string]map[uint32]uint64, erro return resMap, nil } -func getQdiscStats(ctx context.Context, entity *nettop.Entity) ([]QdiscInfo, error) { +func getQdiscStats(entity *nettop.Entity) ([]QdiscInfo, error) { fd, err := entity.GetNetNsFd() if err != nil { return nil, err @@ -166,7 +136,7 @@ func getQdiscStats(ctx context.Context, entity *nettop.Entity) ([]QdiscInfo, err for _, msg := range msgs { m, err := parseMessage(msg) if err != nil { - slog.Ctx(ctx).Info("parse qdisc msg", "err", err, "msg", msg, "module", ModuleName) + log.Errorf("failed parse qdisc msg, nlmsg: %v, err: %v", msg, err) continue } res = append(res, m) @@ -406,7 +376,3 @@ func parseMessage(msg netlink.Message) (QdiscInfo, error) { return m, err } - -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) -} diff --git a/pkg/exporter/probe/procfd/procfd.go b/pkg/exporter/probe/procfd/procfd.go index 003444e1..c407bcbd 100644 --- 
a/pkg/exporter/probe/procfd/procfd.go +++ b/pkg/exporter/probe/procfd/procfd.go @@ -6,70 +6,61 @@ import ( "os" "strings" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + log "github.com/sirupsen/logrus" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - - "golang.org/x/exp/slog" ) const ( - ModuleName = "Procfd" // nolint + probeName = "fd" ) var ( - probe = &ProcFD{} - - FDMetrics = []string{"OpenFd", "OpenSocket"} + OpenFD = "openfd" + OpenSocket = "opensocket" + FDMetrics = []string{OpenFD, OpenSocket} ) -type ProcFD struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, fdProbeCreator) } -func GetProbe() *ProcFD { - return probe -} +func fdProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcFD{} -func (s *ProcFD) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, FDMetrics, p.CollectOnce) -func (s *ProcFD) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcFD) Ready() bool { - return true +type ProcFD struct { } -func (s *ProcFD) Name() string { - return ModuleName +func (s *ProcFD) Start(_ context.Context) error { + return nil } -func (s *ProcFD) GetMetricNames() []string { - res := []string{} - for _, m := range FDMetrics { - res = append(res, metricUniqueID("fd", m)) - } - return res +func (s *ProcFD) Stop(_ context.Context) error { + return nil } -func (s *ProcFD) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcFD) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + log.Warningf("procfd: no entity found") + return map[string]map[uint32]uint64{}, nil } return getAllProcessFd(ets) } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, 
strings.ToLower(m)) -} - func getAllProcessFd(nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) - for _, metricname := range FDMetrics { - resMap[metricUniqueID("fd", metricname)] = map[uint32]uint64{} + + for _, m := range FDMetrics { + resMap[m] = make(map[uint32]uint64) } + for _, nslogic := range nslist { nsprocfd := map[string]struct{}{} nsprocfsock := map[string]struct{}{} @@ -86,8 +77,8 @@ func getAllProcessFd(nslist []*nettop.Entity) (map[string]map[uint32]uint64, err } } } - resMap[metricUniqueID("fd", "OpenFd")][uint32(nslogic.GetNetns())] = uint64(len(nsprocfd)) - resMap[metricUniqueID("fd", "OpenSocket")][uint32(nslogic.GetNetns())] = uint64(len(nsprocfsock)) + resMap[OpenFD][uint32(nslogic.GetNetns())] = uint64(len(nsprocfd)) + resMap[OpenSocket][uint32(nslogic.GetNetns())] = uint64(len(nsprocfsock)) } return resMap, nil } diff --git a/pkg/exporter/probe/procio/procio.go b/pkg/exporter/probe/procio/procio.go index 6122e4ad..a57f9bb9 100644 --- a/pkg/exporter/probe/procio/procio.go +++ b/pkg/exporter/probe/procio/procio.go @@ -5,73 +5,62 @@ import ( "fmt" "io" "os" - "strings" - - "github.com/alibaba/kubeskoop/pkg/exporter/proto" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + log "github.com/sirupsen/logrus" "github.com/prometheus/procfs" - "golang.org/x/exp/slog" ) const ( - IOReadSyscall = "IOReadSyscall" - IOWriteSyscall = "IOWriteSyscall" - IOReadBytes = "IOReadBytes" - IOWriteBytes = "IOWriteBytes" + IOReadSyscall = "ioreadsyscall" + IOWriteSyscall = "iowritesyscall" + IOReadBytes = "ioreadbytes" + IOWriteBytes = "iowritebytes" - ModuleName = "procio" // nolint + probeName = "io" // nolint ) var ( - probe = &ProcIO{} - IOMetrics = []string{IOReadSyscall, IOWriteSyscall, IOReadBytes, IOWriteBytes} ) -type ProcIO struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, ioProbeCreator) } -func GetProbe() *ProcIO { - return 
probe -} +func ioProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcIO{} -func (s *ProcIO) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, IOMetrics, p.CollectOnce) -func (s *ProcIO) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcIO) Ready() bool { - return true +type ProcIO struct { } -func (s *ProcIO) Name() string { - return ModuleName +func (s *ProcIO) Start(_ context.Context) error { + return nil } -func (s *ProcIO) GetMetricNames() []string { - res := []string{} - for _, m := range IOMetrics { - res = append(res, metricUniqueID("io", m)) - } - return res +func (s *ProcIO) Stop(_ context.Context) error { + return nil } -func (s *ProcIO) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcIO) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + log.Infof("procio: no entity found") } - return collect(ctx, ets) + return collect(ets) } -func collect(_ context.Context, _ []*nettop.Entity) (map[string]map[uint32]uint64, error) { +func collect(_ []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) for _, stat := range IOMetrics { - resMap[metricUniqueID("io", stat)] = map[uint32]uint64{} + resMap[stat] = map[uint32]uint64{} } procios, err := getAllProcessIO(nettop.GetAllEntity()) @@ -81,20 +70,16 @@ func collect(_ context.Context, _ []*nettop.Entity) (map[string]map[uint32]uint6 for nsinum := range procios { for _, procio := range procios[nsinum] { - resMap[metricUniqueID("io", IOReadSyscall)][nsinum] += procio.SyscR - resMap[metricUniqueID("io", IOWriteSyscall)][nsinum] += procio.SyscW - resMap[metricUniqueID("io", IOReadBytes)][nsinum] += procio.ReadBytes - resMap[metricUniqueID("io", 
IOWriteBytes)][nsinum] += procio.WriteBytes + resMap[IOReadSyscall][nsinum] += procio.SyscR + resMap[IOWriteSyscall][nsinum] += procio.SyscW + resMap[IOReadBytes][nsinum] += procio.ReadBytes + resMap[IOWriteBytes][nsinum] += procio.WriteBytes } } return resMap, nil } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) -} - func getAllProcessIO(nslist []*nettop.Entity) (map[uint32][]procfs.ProcIO, error) { allprocio := make(map[uint32][]procfs.ProcIO) for idx := range nslist { diff --git a/pkg/exporter/probe/procipvs/ipvsservicestats.go b/pkg/exporter/probe/procipvs/ipvsservicestats.go index 58267f27..a961c5c1 100644 --- a/pkg/exporter/probe/procipvs/ipvsservicestats.go +++ b/pkg/exporter/probe/procipvs/ipvsservicestats.go @@ -4,80 +4,54 @@ import ( "bytes" "context" "errors" - "fmt" "io" "os" "strconv" "strings" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" ) const maxBufferSize = 1024 * 1024 var ( - ModuleName = "insp_ipvs" + probeName = "ipvs" statf = "/proc/net/ip_vs_stats" - probe = &ProcIPVS{} - - IPVSServiceCount = "IPVSServiceCount" - IPVSServiceTCPConnCount = "IPVSServiceTCPConnCount" - IPVSServiceTCPInBytesCount = "IPVSServiceTCPInBytesCount" - IPVSServiceTCPInPacketsCount = "IPVSServiceTCPInPacketsCount" - IPVSServiceTCPOutBytesCount = "IPVSServiceTCPOutBytesCount" - IPVSServiceTCPOutPacketsCount = "IPVSServiceTCPOutPacketsCount" - IPVSServiceUDPConnCount = "IPVSServiceUDPConnCount" - IPVSServiceUDPInBytesCount = "IPVSServiceUDPInBytesCount" - IPVSServiceUDPInPacketsCount = "IPVSServiceUDPInPacketsCount" - IPVSServiceUDPOutBytesCount = "IPVSServiceUDPOutBytesCount" - IPVSServiceUDPOutPacketsCount = "IPVSServiceUDPOutPacketsCount" - - Connections = "Connections" - IncomingPackets = "IncomingPackets" - OutgoingPackets = "OutgoingPackets" - IncomingBytes = "IncomingBytes" - OutgoingBytes = "OutgoingBytes" + Connections = "connections" + 
IncomingPackets = "incomingpackets" + OutgoingPackets = "outgoingpackets" + IncomingBytes = "incomingbytes" + OutgoingBytes = "outgoingbytes" IPVSMetrics = []string{Connections, IncomingPackets, OutgoingBytes, IncomingBytes, OutgoingPackets} ) -type ProcIPVS struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, ipvsProbeCreator) } -func GetProbe() *ProcIPVS { - return probe -} +func ipvsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcIPVS{} -func (p *ProcIPVS) Name() string { - return ModuleName -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, IPVSMetrics, p.CollectOnce) -func (p *ProcIPVS) Close(_ proto.ProbeType) error { - return nil + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (p *ProcIPVS) Start(_ context.Context, _ proto.ProbeType) { +type ProcIPVS struct { } -func (p *ProcIPVS) Ready() bool { - // determine by if default ipvs stats file was ready - if _, err := os.Stat(statf); os.IsNotExist(err) { - return false - } - return true +func (p *ProcIPVS) Start(_ context.Context) error { + return nil } -func (p *ProcIPVS) GetMetricNames() []string { - res := []string{} - for _, m := range IPVSMetrics { - res = append(res, metricUniqueID("ipvs", m)) - } - return res +func (p *ProcIPVS) Stop(_ context.Context) error { + return nil } -func (p *ProcIPVS) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { +func (p *ProcIPVS) CollectOnce() (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) f, err := os.Open(statf) if err != nil { @@ -96,18 +70,14 @@ func (p *ProcIPVS) Collect(_ context.Context) (map[string]map[uint32]uint64, err return resMap, err } // only handle stats in default netns - resMap[metricUniqueID("ipvs", Connections)] = map[uint32]uint64{0: stats.Connections} - resMap[metricUniqueID("ipvs", IncomingPackets)] = map[uint32]uint64{0: stats.IncomingBytes} - resMap[metricUniqueID("ipvs", IncomingBytes)] = map[uint32]uint64{0: 
stats.IncomingBytes} - resMap[metricUniqueID("ipvs", OutgoingPackets)] = map[uint32]uint64{0: stats.OutgoingPackets} - resMap[metricUniqueID("ipvs", OutgoingBytes)] = map[uint32]uint64{0: stats.OutgoingBytes} + resMap[Connections] = map[uint32]uint64{0: stats.Connections} + resMap[IncomingPackets] = map[uint32]uint64{0: stats.IncomingPackets} + resMap[IncomingBytes] = map[uint32]uint64{0: stats.IncomingBytes} + resMap[OutgoingPackets] = map[uint32]uint64{0: stats.OutgoingPackets} + resMap[OutgoingBytes] = map[uint32]uint64{0: stats.OutgoingBytes} return resMap, nil } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) -} - // IPVSStats holds IPVS statistics, as exposed by the kernel in `/proc/net/ip_vs_stats`. type IPVSStats struct { // Total count of connections. diff --git a/pkg/exporter/probe/procnetdev/procnetdev.go b/pkg/exporter/probe/procnetdev/procnetdev.go index 89df0cd2..1c2816d5 100644 --- a/pkg/exporter/probe/procnetdev/procnetdev.go +++ b/pkg/exporter/probe/procnetdev/procnetdev.go @@ -2,93 +2,67 @@ package procnetdev import ( "context" - "fmt" - "os" - "strings" - "sync" - - "github.com/alibaba/kubeskoop/pkg/exporter/proto" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "github.com/prometheus/procfs" - "golang.org/x/exp/slog" + log "github.com/sirupsen/logrus" ) const ( - ModuleName = "procnetdev" // nolint - - RxBytes = "RxBytes" - RxErrors = "RxErrors" - TxBytes = "TxBytes" - TxErrors = "TxErrors" - RxPackets = "RxPackets" - RxDropped = "RxDropped" - TxPackets = "TxPackets" - TxDropped = "TxDropped" + probeName = "netdev" // nolint + + RxBytes = "rxbytes" + RxErrors = "rxerrors" + TxBytes = "txbytes" + TxErrors = "txerrors" + RxPackets = "rxpackets" + RxDropped = "rxdropped" + TxPackets = "txpackets" + TxDropped = "txdropped" ) var ( - once = sync.Once{} - probe *ProcNetdev - NetdevMetrics = []string{RxBytes, RxErrors, TxBytes, 
TxErrors, RxPackets, RxDropped, TxPackets, TxDropped} ) -type ProcNetdev struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, netdevProbeCreator) } -func GetProbe() *ProcNetdev { - once.Do(func() { - probe = &ProcNetdev{} - }) - return probe -} +func netdevProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcNetdev{} -func (s *ProcNetdev) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, NetdevMetrics, p.CollectOnce) -func (s *ProcNetdev) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcNetdev) Ready() bool { - // determine by if default snmp file was ready - if _, err := os.Stat("/proc/net/dev"); os.IsNotExist(err) { - return false - } - return true +type ProcNetdev struct { } -func (s *ProcNetdev) Name() string { - return ModuleName +func (s *ProcNetdev) Start(_ context.Context) error { + return nil } -func (s *ProcNetdev) GetMetricNames() []string { - res := []string{} - for _, m := range NetdevMetrics { - res = append(res, metricUniqueID("netdev", m)) - } - return res +func (s *ProcNetdev) Stop(_ context.Context) error { + return nil } -func (s *ProcNetdev) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcNetdev) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + log.Errorf("%s error, no entity found", probeName) } - return collect(ctx, ets) -} - -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) + return collect(ets) } -func collect(_ context.Context, nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { +func collect(nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) - for _, mname := range 
NetdevMetrics { - resMap[metricUniqueID("netdev", mname)] = map[uint32]uint64{} + for _, m := range NetdevMetrics { + resMap[m] = make(map[uint32]uint64) } + netdev := getAllNetdev(nslist) for nsid := range netdev { @@ -96,20 +70,16 @@ func collect(_ context.Context, nslist []*nettop.Entity) (map[string]map[uint32] continue } - for _, mname := range NetdevMetrics { - resMap[metricUniqueID("netdev", mname)][nsid] = 0 - } - for devname, devstat := range netdev[nsid] { if devname != "lo" { - resMap[metricUniqueID("netdev", RxBytes)][nsid] += devstat.RxBytes - resMap[metricUniqueID("netdev", RxErrors)][nsid] += devstat.RxErrors - resMap[metricUniqueID("netdev", TxBytes)][nsid] += devstat.TxBytes - resMap[metricUniqueID("netdev", TxErrors)][nsid] += devstat.TxErrors - resMap[metricUniqueID("netdev", RxPackets)][nsid] += devstat.RxPackets - resMap[metricUniqueID("netdev", TxPackets)][nsid] += devstat.TxPackets - resMap[metricUniqueID("netdev", RxDropped)][nsid] += devstat.RxDropped - resMap[metricUniqueID("netdev", TxDropped)][nsid] += devstat.TxDropped + resMap[RxBytes][nsid] += devstat.RxBytes + resMap[RxErrors][nsid] += devstat.RxErrors + resMap[TxBytes][nsid] += devstat.TxBytes + resMap[TxErrors][nsid] += devstat.TxErrors + resMap[RxPackets][nsid] += devstat.RxPackets + resMap[TxPackets][nsid] += devstat.TxPackets + resMap[RxDropped][nsid] += devstat.RxDropped + resMap[TxDropped][nsid] += devstat.TxDropped } } } diff --git a/pkg/exporter/probe/procnetstat/procnetstat.go b/pkg/exporter/probe/procnetstat/procnetstat.go index 2d86faed..f5d51bb7 100644 --- a/pkg/exporter/probe/procnetstat/procnetstat.go +++ b/pkg/exporter/probe/procnetstat/procnetstat.go @@ -3,67 +3,63 @@ package procnetstat import ( "bufio" "context" - "errors" "fmt" "io" "os" "strconv" "strings" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + log "github.com/sirupsen/logrus" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - - 
"golang.org/x/exp/slog" ) const ( - ModuleName = "procnetstat" // nolint - - ProtocolTCPExt = "TcpExt" - - TCPActiveOpens = "ActiveOpens" - TCPPassiveOpens = "PassiveOpens" - TCPRetransSegs = "RetransSegs" - TCPListenDrops = "ListenDrops" - TCPListenOverflows = "ListenOverflows" - TCPSynRetrans = "TCPSynRetrans" - TCPFastRetrans = "TCPFastRetrans" - TCPRetransFail = "TCPRetransFail" - TCPTimeouts = "TCPTimeouts" - - TCPAbortOnClose = "TCPAbortOnClose" - TCPAbortOnMemory = "TCPAbortOnMemory" - TCPAbortOnTimeout = "TCPAbortOnTimeout" - TCPAbortOnLinger = "TCPAbortOnLinger" - TCPAbortOnData = "TCPAbortOnData" - TCPAbortFailed = "TCPAbortFailed" - TCPACKSkippedSynRecv = "TCPACKSkippedSynRecv" - TCPACKSkippedPAWS = "TCPACKSkippedPAWS" - TCPACKSkippedSeq = "TCPACKSkippedSeq" - TCPACKSkippedFinWait2 = "TCPACKSkippedFinWait2" - TCPACKSkippedTimeWait = "TCPACKSkippedTimeWait" - TCPACKSkippedChallenge = "TCPACKSkippedChallenge" - TCPRcvQDrop = "TCPRcvQDrop" - PAWSActive = "PAWSActive" - PAWSEstab = "PAWSEstab" - EmbryonicRsts = "EmbryonicRsts" - TCPWinProbe = "TCPWinProbe" - TCPKeepAlive = "TCPKeepAlive" - TCPMTUPFail = "TCPMTUPFail" - TCPMTUPSuccess = "TCPMTUPSuccess" - TCPZeroWindowDrop = "TCPZeroWindowDrop" - TCPBacklogDrop = "TCPBacklogDrop" - PFMemallocDrop = "PFMemallocDrop" - TCPWqueueTooBig = "TCPWqueueTooBig" - - TCPMemoryPressures = "TCPMemoryPressures" - TCPMemoryPressuresChrono = "TCPMemoryPressuresChrono" + probeName = "tcpext" // nolint + + ProtocolTCPExt = "tcpext" + + TCPActiveOpens = "activeopens" + TCPPassiveOpens = "passiveopens" + TCPRetransSegs = "retranssegs" + TCPListenDrops = "listendrops" + TCPListenOverflows = "listenoverflows" + TCPSynRetrans = "tcpsynretrans" + TCPFastRetrans = "tcpfastretrans" + TCPRetransFail = "tcpretransfail" + TCPTimeouts = "tcptimeouts" + + TCPAbortOnClose = "tcpabortonclose" + TCPAbortOnMemory = "tcpabortonmemory" + TCPAbortOnTimeout = "tcpabortontimeout" + TCPAbortOnLinger = "tcpabortonlinger" + TCPAbortOnData = 
"tcpabortondata" + TCPAbortFailed = "tcpabortfailed" + TCPACKSkippedSynRecv = "tcpackskippedsynrecv" + TCPACKSkippedPAWS = "tcpackskippedpaws" + TCPACKSkippedSeq = "tcpackskippedseq" + TCPACKSkippedFinWait2 = "tcpackskippedfinwait2" + TCPACKSkippedTimeWait = "tcpackskippedtimewait" + TCPACKSkippedChallenge = "tcpackskippedchallenge" + TCPRcvQDrop = "tcprcvqdrop" + PAWSActive = "pawsactive" + PAWSEstab = "pawsestab" + EmbryonicRsts = "embryonicrsts" + TCPWinProbe = "tcpwinprobe" + TCPKeepAlive = "tcpkeepalive" + TCPMTUPFail = "tcpmtupfail" + TCPMTUPSuccess = "tcpmtupsuccess" + TCPZeroWindowDrop = "tcpzerowindowdrop" + TCPBacklogDrop = "tcpbacklogdrop" + PFMemallocDrop = "pfmemallocdrop" + TCPWqueueTooBig = "tcpwqueuetoobig" + + TCPMemoryPressures = "tcpmemorypressures" + TCPMemoryPressuresChrono = "tcpmemorypressureschrono" ) var ( - probe = &ProcNetstat{} - TCPExtMetrics = []string{TCPListenDrops, TCPListenOverflows, TCPSynRetrans, @@ -98,71 +94,60 @@ var ( TCPWqueueTooBig} ) -type ProcNetstat struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, netdevProbeCreator) } -func GetProbe() *ProcNetstat { - return probe -} +func netdevProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcNetstat{} -func (s *ProcNetstat) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, TCPExtMetrics, p.CollectOnce) -func (s *ProcNetstat) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcNetstat) Ready() bool { - // determine by if default snmp file was ready - if _, err := os.Stat("/proc/net/netstat"); os.IsNotExist(err) { - return false - } - return true +type ProcNetstat struct { } -func (s *ProcNetstat) Name() string { - return ModuleName +func (s *ProcNetstat) Start(_ context.Context) error { + return nil } -func (s *ProcNetstat) GetMetricNames() []string { - res := []string{} - for _, m := range TCPExtMetrics { - 
res = append(res, metricUniqueID("tcpext", m)) - } - return res +func (s *ProcNetstat) Stop(_ context.Context) error { + return nil } -func (s *ProcNetstat) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcNetstat) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") - return nil, errors.New("no entity to collect") + log.Errorf("%s error, no entity found", probeName) } - return collect(ctx, ets) + return collect(ets) } -func collect(ctx context.Context, nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { +func collect(nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) + for _, stat := range TCPExtMetrics { - resMap[metricUniqueID("tcpext", stat)] = map[uint32]uint64{} + resMap[stat] = make(map[uint32]uint64) } for _, et := range nslist { stats, err := getNetstatByPid(uint32(et.GetPid())) if err != nil { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + log.Errorf("%s failed collect pid %d, err: %v", probeName, et.GetPid(), err) continue } - slog.Ctx(ctx).Debug("collect", "mod", ModuleName, "netns", et.GetNetns(), "stats", stats) + extstats := stats[ProtocolTCPExt] for _, stat := range TCPExtMetrics { if _, ok := extstats[stat]; ok { data, err := strconv.ParseUint(extstats[stat], 10, 64) if err != nil { - slog.Ctx(ctx).Warn("collect", "mod", ModuleName, "ignore", stat, "err", err) + log.Errorf("%s failed parse stat %s, pid: %d err: %v", probeName, stat, et.GetPid(), err) continue } - resMap[metricUniqueID("tcpext", stat)][uint32(et.GetNetns())] += data + resMap[stat][uint32(et.GetNetns())] += data } } } @@ -170,10 +155,6 @@ func collect(ctx context.Context, nslist []*nettop.Entity) (map[string]map[uint3 return resMap, nil } -func metricUniqueID(subject string, m string) string { - return 
fmt.Sprintf("%s%s", subject, strings.ToLower(m)) -} - func getNetstatByPid(pid uint32) (map[string]map[string]string, error) { resMap := make(map[string]map[string]string) netstatpath := fmt.Sprintf("/proc/%d/net/netstat", pid) @@ -214,14 +195,14 @@ func parseNetStats(r io.Reader, fileName string) (map[string]map[string]string, scanner.Scan() valueParts := strings.Split(scanner.Text(), " ") // Remove trailing :. - protocol := nameParts[0][:len(nameParts[0])-1] + protocol := strings.ToLower(nameParts[0][:len(nameParts[0])-1]) netStats[protocol] = map[string]string{} if len(nameParts) != len(valueParts) { return nil, fmt.Errorf("mismatch field count mismatch in %s: %s", fileName, protocol) } for i := 1; i < len(nameParts); i++ { - netStats[protocol][nameParts[i]] = valueParts[i] + netStats[protocol][strings.ToLower(nameParts[i])] = valueParts[i] } } diff --git a/pkg/exporter/probe/procsnmp/procsnmp.go b/pkg/exporter/probe/procsnmp/procsnmp.go index 1cfb81db..d7dad8b1 100644 --- a/pkg/exporter/probe/procsnmp/procsnmp.go +++ b/pkg/exporter/probe/procsnmp/procsnmp.go @@ -9,12 +9,11 @@ import ( "strconv" "strings" "sync" - - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "time" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - - "golang.org/x/exp/slog" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + log "github.com/sirupsen/logrus" ) const ( @@ -29,38 +28,40 @@ const ( ProtocolUDPLite = "UdpLite" // metrics of tcp - TCPActiveOpens = "ActiveOpens" - TCPPassiveOpens = "PassiveOpens" - TCPRetransSegs = "RetransSegs" - TCPListenDrops = "ListenDrops" - TCPListenOverflows = "ListenOverflows" - TCPSynRetrans = "TCPSynRetrans" - TCPFastRetrans = "TCPFastRetrans" - TCPRetransFail = "TCPRetransFail" - TCPTimeouts = "TCPTimeouts" - TCPAttemptFails = "AttemptFails" - TCPEstabResets = "EstabResets" - TCPCurrEstab = "CurrEstab" - TCPInSegs = "InSegs" - TCPOutSegs = "OutSegs" - TCPInErrs = "InErrs" - TCPOutRsts = "OutRsts" + TCPActiveOpens = "activeopens" + 
TCPPassiveOpens = "passiveopens" + TCPRetransSegs = "retranssegs" + TCPListenDrops = "listendrops" + TCPListenOverflows = "listenoverflows" + TCPSynRetrans = "tcpsynretrans" + TCPFastRetrans = "tcpfastretrans" + TCPRetransFail = "tcpretransfail" + TCPTimeouts = "tcptimeouts" + TCPAttemptFails = "attemptfails" + TCPEstabResets = "estabresets" + TCPCurrEstab = "currestab" + TCPInSegs = "insegs" + TCPOutSegs = "outsegs" + TCPInErrs = "inerrs" + TCPOutRsts = "outrsts" // metrics of udp - UDPInDatagrams = "InDatagrams" - UDPNoPorts = "NoPorts" - UDPInErrors = "InErrors" - UDPOutDatagrams = "OutDatagrams" - UDPRcvbufErrors = "RcvbufErrors" - UDPSndbufErrors = "SndbufErrors" - UDPInCsumErrors = "InCsumErrors" - UDPIgnoredMulti = "IgnoredMulti" + UDPInDatagrams = "indatagrams" + UDPNoPorts = "noports" + UDPInErrors = "inerrors" + UDPOutDatagrams = "outdatagrams" + UDPRcvbufErrors = "rcvbuferrors" + UDPSndbufErrors = "sndbuferrors" + UDPInCsumErrors = "incsumerrors" + UDPIgnoredMulti = "ignoredmulti" //metrics of ip - IPInNoRoutes = "InNoRoutes" - IPInTruncatedPkts = "InTruncatedPkts" + IPInNoRoutes = "innoroutes" + IPInTruncatedPkts = "intruncatedpkts" - ModuleName = "procsnmp" // nolint + TCP = "tcp" + UDP = "udp" + IP = "ip" ) var ( @@ -68,75 +69,87 @@ var ( UDPStatMetrics = []string{UDPInDatagrams, UDPNoPorts, UDPInErrors, UDPOutDatagrams, UDPRcvbufErrors, UDPSndbufErrors, UDPInCsumErrors, UDPIgnoredMulti} IPMetrics = []string{IPInNoRoutes, IPInTruncatedPkts} - probe *ProcSNMP - once sync.Once + metricsMap = map[string][]string{ + TCP: TCPStatMetrcis, + UDP: UDPStatMetrics, + IP: IPMetrics, + } + + cache = &snmpCache{ + cache: make(map[string]map[string]map[uint32]uint64), + } ) -func GetProbe() *ProcSNMP { - once.Do(func() { - probe = &ProcSNMP{} - }) - return probe +func init() { + probe.MustRegisterMetricsProbe(TCP, newSnmpProbeCreator(TCP)) + probe.MustRegisterMetricsProbe(UDP, newSnmpProbeCreator(UDP)) + probe.MustRegisterMetricsProbe(IP, newSnmpProbeCreator(IP)) } 
-type ProcSNMP struct { +func newSnmpProbeCreator(probeName string) probe.MetricsProbeCreator { + return func(args map[string]interface{}) (probe.MetricsProbe, error) { + p := &procSNMP{ + name: probeName, + } + metrics := metricsMap[probeName] + batchMetrics := probe.NewLegacyBatchMetrics(probeName, metrics, p.CollectOnce) + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil + } +} + +type procSNMP struct { + name string } -func (s *ProcSNMP) Close(_ proto.ProbeType) error { +func (s *procSNMP) Start(_ context.Context) error { return nil } -func (s *ProcSNMP) Start(_ context.Context, _ proto.ProbeType) { +func (s *procSNMP) Stop(_ context.Context) error { + return nil } -func (s *ProcSNMP) Ready() bool { - // determine by if default snmp file was ready - if _, err := os.Stat("/proc/net/snmp"); os.IsNotExist(err) { - return false - } - return true +func (s *procSNMP) CollectOnce() (map[string]map[uint32]uint64, error) { + return cache.get(s.name) } -func (s *ProcSNMP) Name() string { - return ModuleName +type snmpCache struct { + cache map[string]map[string]map[uint32]uint64 + err error + last time.Time + lock sync.Mutex } -func (s *ProcSNMP) GetMetricNames() []string { - res := []string{} - for _, m := range TCPStatMetrcis { - res = append(res, fmt.Sprintf("tcp%s", strings.ToLower(m))) - } - for _, m := range UDPStatMetrics { - res = append(res, fmt.Sprintf("udp%s", strings.ToLower(m))) - } - for _, m := range IPMetrics { - res = append(res, fmt.Sprintf("ip%s", strings.ToLower(m))) +func (c *snmpCache) get(name string) (map[string]map[uint32]uint64, error) { + c.lock.Lock() + defer c.lock.Unlock() + + if c.err != nil { + return nil, c.err } - return res -} -func (s *ProcSNMP) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { - ets := nettop.GetAllEntity() - if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + if time.Since(c.last) > time.Second*2 { + c.reload() } - return collect(ctx, 
ets) + + return c.cache[name], nil } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", strings.ToLower(subject), strings.ToLower(m)) +func (c *snmpCache) reload() { + c.cache, c.err = collect() + c.last = time.Now() } -func collect(ctx context.Context, entitys []*nettop.Entity) (map[string]map[uint32]uint64, error) { - res := map[string]map[uint32]uint64{} - for _, m := range TCPStatMetrcis { - res[metricUniqueID("tcp", m)] = map[uint32]uint64{} - } - for _, m := range UDPStatMetrics { - res[metricUniqueID("udp", m)] = map[uint32]uint64{} - } - for _, m := range IPMetrics { - res[metricUniqueID("ip", m)] = map[uint32]uint64{} +func collect() (map[string]map[string]map[uint32]uint64, error) { + entitys := nettop.GetAllEntity() + + res := make(map[string]map[string]map[uint32]uint64) + + for proto, metricsList := range metricsMap { + res[proto] = make(map[string]map[uint32]uint64) + for _, metrics := range metricsList { + res[proto][metrics] = make(map[uint32]uint64) + } } for _, et := range entitys { @@ -146,22 +159,21 @@ func collect(ctx context.Context, entitys []*nettop.Entity) (map[string]map[uint stats, err := getNetstatByPid(pid) if err != nil { - slog.Ctx(ctx).Debug("get netstat failed", "pid", pid, "nsinum", nsinum, "err", err) + log.Errorf("%s failed get netstat, pid: %d, nsinum: %d, err: %v", "snmp", pid, nsinum, err) continue } for proto, stat := range stats { for k, v := range stat { - mkey := metricUniqueID(proto, k) - slog.Ctx(ctx).Debug("store metric", "metric", mkey, "pid", pid, "nsinum", nsinum, "value", v) data, err := strconv.ParseInt(v, 10, 64) if err != nil { - slog.Ctx(ctx).Debug("parse netstat value", "metric", mkey, "pid", pid, "nsinum", nsinum, "value", v, "err", err) + log.Errorf("%s failed parse netstat value, pid: %d, nsinum: %d, key: %s value: %s, err: %v", "snmp", pid, nsinum, k, v, err) continue } // ignore unaware metric - if _, ok := res[mkey]; ok { - res[mkey][uint32(nsinum)] = uint64(data) + + if 
_, ok := res[proto][k]; ok { + res[proto][k][uint32(nsinum)] = uint64(data) } } } @@ -211,14 +223,14 @@ func parseNetStats(r io.Reader, fileName string) (map[string]map[string]string, scanner.Scan() valueParts := strings.Split(scanner.Text(), " ") // Remove trailing :. - protocol := nameParts[0][:len(nameParts[0])-1] + protocol := strings.ToLower(nameParts[0][:len(nameParts[0])-1]) netStats[protocol] = map[string]string{} if len(nameParts) != len(valueParts) { return nil, fmt.Errorf("mismatch field count mismatch in %s: %s", fileName, protocol) } for i := 1; i < len(nameParts); i++ { - netStats[protocol][nameParts[i]] = valueParts[i] + netStats[protocol][strings.ToLower(nameParts[i])] = valueParts[i] } } diff --git a/pkg/exporter/probe/procsock/procsock.go b/pkg/exporter/probe/procsock/procsock.go index 371a670d..2af129a3 100644 --- a/pkg/exporter/probe/procsock/procsock.go +++ b/pkg/exporter/probe/procsock/procsock.go @@ -7,7 +7,7 @@ import ( "errors" "fmt" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "io" "os" @@ -20,60 +20,43 @@ import ( ) const ( - TCPSockInuse = "Inuse" - TCPSockOrphan = "Orphan" - TCPSockTimewait = "TW" - TCPSockeAlloc = "Alloc" - TCPSockeMem = "Mem" - - ModuleName = "procsock" + TCPSockInuse = "inuse" + TCPSockOrphan = "orphan" + TCPSockTimewait = "tw" + TCPSockeAlloc = "alloc" + TCPSockeMem = "mem" ) var ( TCPSockStatMetrics = []string{TCPSockInuse, TCPSockOrphan, TCPSockTimewait, TCPSockeAlloc, TCPSockeMem} - probe = &ProcSock{} + probeName = "sock" ) -func GetProbe() *ProcSock { - return probe -} - -func (s *ProcSock) Close(_ proto.ProbeType) error { - return nil +func init() { + probe.MustRegisterMetricsProbe(probeName, sockProbeCreator) } -func (s *ProcSock) Start(_ context.Context, _ proto.ProbeType) { -} +func sockProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcSock{} -func (s *ProcSock) Ready() bool { - // determine by if default snmp file was 
ready - if _, err := os.Stat("/proc/net/sockstat"); os.IsNotExist(err) { - return false - } - return true -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, TCPSockStatMetrics, p.CollectOnce) -func (s *ProcSock) Name() string { - return ModuleName + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcSock) GetMetricNames() []string { - res := []string{} - for _, m := range TCPSockStatMetrics { - res = append(res, metricUniqueID("sock", m)) - } - return res +func (s *ProcSock) CollectOnce() (map[string]map[uint32]uint64, error) { + return collect() } -func (s *ProcSock) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { - return collect(ctx) +type ProcSock struct { } -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) +func (s *ProcSock) Start(_ context.Context) error { + return nil } -type ProcSock struct { +func (s *ProcSock) Stop(_ context.Context) error { + return nil } type tcpsockstat struct { @@ -84,10 +67,10 @@ type tcpsockstat struct { Mem int } -func collect(_ context.Context) (resMap map[string]map[uint32]uint64, err error) { +func collect() (resMap map[string]map[uint32]uint64, err error) { resMap = make(map[string]map[uint32]uint64) for _, stat := range TCPSockStatMetrics { - resMap[metricUniqueID("sock", stat)] = map[uint32]uint64{} + resMap[stat] = map[uint32]uint64{} } // for _, nslogic := range nslist { @@ -107,11 +90,11 @@ func collect(_ context.Context) (resMap map[string]map[uint32]uint64, err error) return resMap, err } nsinum := uint32(nettop.InitNetns) - resMap[metricUniqueID("sock", TCPSockInuse)][nsinum] = uint64(skstat.InUse) - resMap[metricUniqueID("sock", TCPSockOrphan)][nsinum] = uint64(skstat.Orphan) - resMap[metricUniqueID("sock", TCPSockTimewait)][nsinum] = uint64(skstat.TW) - resMap[metricUniqueID("sock", TCPSockeAlloc)][nsinum] = uint64(skstat.Alloc) - resMap[metricUniqueID("sock", TCPSockeMem)][nsinum] = 
uint64(skstat.Mem) + resMap[TCPSockInuse][nsinum] = uint64(skstat.InUse) + resMap[TCPSockOrphan][nsinum] = uint64(skstat.Orphan) + resMap[TCPSockTimewait][nsinum] = uint64(skstat.TW) + resMap[TCPSockeAlloc][nsinum] = uint64(skstat.Alloc) + resMap[TCPSockeMem][nsinum] = uint64(skstat.Mem) return } diff --git a/pkg/exporter/probe/procsoftnet/procsoftnet.go b/pkg/exporter/probe/procsoftnet/procsoftnet.go index be959b51..123243ea 100644 --- a/pkg/exporter/probe/procsoftnet/procsoftnet.go +++ b/pkg/exporter/probe/procsoftnet/procsoftnet.go @@ -5,13 +5,12 @@ import ( "context" "fmt" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "io" "os" "strconv" "strings" - "sync" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" @@ -19,80 +18,61 @@ import ( ) const ( - SNProcessed = "Processed" - SNDropped = "Dropped" + SNProcessed = "processed" + SNDropped = "dropped" - ModuleName = "procsoftnet" + probeName = "softnet" ) var ( - SoftnetMetrics = []string{SNProcessed, SNDropped} - once = sync.Once{} - probe *ProcSoftnet + softnetMetrics = []string{SNProcessed, SNDropped} ) -type ProcSoftnet struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, softNetProbeCreator) } -func GetProbe() *ProcSoftnet { - once.Do(func() { - probe = &ProcSoftnet{} - }) - return probe -} +func softNetProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcSoftnet{} -func (s *ProcSoftnet) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, softnetMetrics, p.CollectOnce) -func (s *ProcSoftnet) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcSoftnet) Ready() bool { - if _, err := os.Stat("/proc/net/softnet_stat"); os.IsNotExist(err) { - return false - } - return true +type ProcSoftnet struct { } -func (s *ProcSoftnet) Name() string { - return ModuleName +func (s *ProcSoftnet) Start(_ 
context.Context) error { + return nil } -func (s *ProcSoftnet) GetMetricNames() []string { - res := []string{} - for _, m := range SoftnetMetrics { - res = append(res, metricUniqueID("softnet", m)) - } - return res +func (s *ProcSoftnet) Stop(_ context.Context) error { + return nil } -func (s *ProcSoftnet) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcSoftnet) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + slog.Info("collect", "mod", probeName, "ignore", "no entity found") } - return collect(ctx, ets) -} - -func metricUniqueID(subject string, m string) string { - return fmt.Sprintf("%s%s", subject, strings.ToLower(m)) + return collect(ets) } -func collect(_ context.Context, nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { +func collect(nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) - for idx := range SoftnetMetrics { - resMap[metricUniqueID("softnet", SoftnetMetrics[idx])] = map[uint32]uint64{} + for _, m := range softnetMetrics { + resMap[m] = map[uint32]uint64{} } - for idx := range nslist { - stat, err := getSoftnetStatByPid(uint32(nslist[idx].GetPid())) + for _, ns := range nslist { + stat, err := getSoftnetStatByPid(uint32(ns.GetPid())) if err != nil { continue } - for indx := range SoftnetMetrics { - resMap[metricUniqueID("softnet", SoftnetMetrics[indx])][uint32(nslist[idx].GetNetns())] = stat[SoftnetMetrics[indx]] + for _, m := range softnetMetrics { + resMap[m][uint32(ns.GetNetns())] = stat[m] } } return resMap, nil @@ -118,9 +98,9 @@ func getSoftnetStatByPid(pid uint32) (map[string]uint64, error) { } res := map[string]uint64{} - for idx := range sns { - res[SNProcessed] += uint64(sns[idx].Processed) - res[SNDropped] += uint64(sns[idx].Dropped) + for _, ns := range sns { + res[SNProcessed] += 
uint64(ns.Processed) + res[SNDropped] += uint64(ns.Dropped) } return res, nil diff --git a/pkg/exporter/probe/proctcpsummary/proctcp.go b/pkg/exporter/probe/proctcpsummary/proctcp.go index c2e1f93a..e37fc31b 100644 --- a/pkg/exporter/probe/proctcpsummary/proctcp.go +++ b/pkg/exporter/probe/proctcpsummary/proctcp.go @@ -5,28 +5,26 @@ import ( "context" "encoding/hex" "fmt" - - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "io" "net" "os" "strconv" "strings" - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + log "github.com/sirupsen/logrus" - "golang.org/x/exp/slog" + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" ) const ( ModuleName = "proctcpsummary" - TCPEstablishedConn = "tcpsummarytcpestablishedconn" - TCPTimewaitConn = "tcpsummarytcptimewaitconn" - TCPTXQueue = "tcpsummarytcptxqueue" - TCPRXQueue = "tcpsummarytcprxqueue" - TCPListenBacklog = "tcpsummarytcplistenbacklog" + TCPEstablishedConn = "tcpestablishedconn" + TCPTimewaitConn = "tcptimewaitconn" + TCPTXQueue = "tcptxqueue" + TCPRXQueue = "tcprxqueue" + TCPListenBacklog = "tcplistenbacklog" // st mapping of tcp state /*TCPEstablished:1 TCP_SYN_SENT:2 @@ -70,49 +68,42 @@ type ( ) var ( - probe = &ProcTCP{} TCPSummaryMetrics = []string{TCPEstablishedConn, TCPTimewaitConn, TCPTXQueue, TCPRXQueue} + probeName = "tcpsummary" ) -type ProcTCP struct { +func init() { + probe.MustRegisterMetricsProbe(probeName, softNetProbeCreator) } -func GetProbe() *ProcTCP { - return probe -} +func softNetProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &ProcTCP{} -func (s *ProcTCP) Close(_ proto.ProbeType) error { - return nil -} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, TCPSummaryMetrics, p.CollectOnce) -func (s *ProcTCP) Start(_ context.Context, _ proto.ProbeType) { + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func (s *ProcTCP) Ready() bool { - // determine by if default tcp file was ready - if _, 
err := os.Stat("/proc/net/tcp"); os.IsNotExist(err) { - return false - } - return true +type ProcTCP struct { } -func (s *ProcTCP) Name() string { - return ModuleName +func (s *ProcTCP) Start(_ context.Context) error { + return nil } -func (s *ProcTCP) GetMetricNames() []string { - return TCPSummaryMetrics +func (s *ProcTCP) Stop(_ context.Context) error { + return nil } -func (s *ProcTCP) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (s *ProcTCP) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllEntity() if len(ets) == 0 { - slog.Ctx(ctx).Info("collect", "mod", ModuleName, "ignore", "no entity found") + log.Infof("failed collect tcp summary, no entity found") } - return collect(ctx, ets), nil + return collect(ets), nil } -func collect(ctx context.Context, pidlist []*nettop.Entity) map[string]map[uint32]uint64 { +func collect(pidlist []*nettop.Entity) map[string]map[uint32]uint64 { resMap := make(map[string]map[uint32]uint64) for idx := range TCPSummaryMetrics { @@ -120,14 +111,15 @@ func collect(ctx context.Context, pidlist []*nettop.Entity) map[string]map[uint3 } for idx := range pidlist { - summary, err := newNetTCP(fmt.Sprintf("/proc/%d/net/tcp", pidlist[idx].GetPid())) + path := fmt.Sprintf("/proc/%d/net/tcp", pidlist[idx].GetPid()) + summary, err := newNetTCP(path) if err != nil { - slog.Ctx(ctx).Warn("collect tcp", "mod", ModuleName, "err", err, "pid", pidlist[idx]) + log.Warnf("failed collect tcp, path %s, err: %v", path, err) continue } summary6, err := newNetTCP(fmt.Sprintf("/proc/%d/net/tcp6", pidlist[idx].GetPid())) if err != nil { - slog.Ctx(ctx).Warn("collect tcp6", "mod", ModuleName, "err", err, "pid", pidlist[idx]) + log.Warnf("failed collect tcp6, path %s, err: %v", path, err) continue } est, tw := summary.getEstTwCount() diff --git a/pkg/exporter/probe/proto.go b/pkg/exporter/probe/proto.go new file mode 100644 index 00000000..b6e5a042 --- /dev/null +++ b/pkg/exporter/probe/proto.go @@ -0,0 
+1,151 @@ +package probe + +import ( + "context" + "errors" + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + ErrProbeNotExists = errors.New("probe not exists") + ErrProbeAlreadyExists = errors.New("probe already exists") + ErrInvalidProbeState = errors.New("invalid probe state") +) + +type Type uint8 + +type EventType string + +const ( + ProbeTypeMetrics = iota + ProbeTypeEvent + ProbeTypeCount +) + +func (p Type) String() string { + switch p { + case ProbeTypeMetrics: + return "metrics" + case ProbeTypeEvent: + return "event" + default: + return "" + } +} + +type State uint8 + +const ( + ProbeStateStopped = iota + ProbeStateStarting + ProbeStateRunning + ProbeStateStopping + ProbeStateFailed +) + +func (ps State) String() string { + switch ps { + case ProbeStateStopped: + return "Stopped" + case ProbeStateRunning: + return "Running" + case ProbeStateStarting: + return "Starting" + case ProbeStateStopping: + return "Stopping" + } + return "" +} + +type RawEvent struct { + Netns uint32 + EventType string + EventBody string +} + +type Label struct { + Name string `json:"name"` + Value string `json:"value"` +} +type Event struct { + Timestamp int64 `json:"timestamp"` + Type EventType `json:"type"` + Labels []Label `json:"labels"` + Message string `json:"msg"` +} + +type Probe interface { + Start(ctx context.Context) error + Stop(ctx context.Context) error + State() State + Name() string +} + +type MetricsProbe interface { + Probe + prometheus.Collector +} + +type EventProbe interface { + Probe +} + +type SimpleProbe interface { + Start(ctx context.Context) error + Stop(ctx context.Context) error +} + +type simpleProbe struct { + name string + state State + inner SimpleProbe + lock sync.Mutex +} + +func (s *simpleProbe) Start(ctx context.Context) error { + if s.state != ProbeStateStopped { + return ErrInvalidProbeState + } + + s.lock.Lock() + defer s.lock.Unlock() + s.state = ProbeStateStarting + if err := s.inner.Start(ctx); err != nil { + 
s.state = ProbeStateFailed + return err + } + s.state = ProbeStateRunning + return nil +} + +func (s *simpleProbe) Stop(ctx context.Context) error { + if s.state != ProbeStateRunning { + return ErrInvalidProbeState + } + + s.lock.Lock() + defer s.lock.Unlock() + if err := s.inner.Stop(ctx); err != nil { + s.state = ProbeStateFailed + return err + } + s.state = ProbeStateStopped + return nil +} + +func (s *simpleProbe) State() State { + return s.state +} + +func (s *simpleProbe) Name() string { + return s.name +} + +func NewProbe(name string, probe SimpleProbe) Probe { + return &simpleProbe{ + name: name, + inner: probe, + } +} diff --git a/pkg/exporter/probe/tracebiolatency/tracebiolatency.go b/pkg/exporter/probe/tracebiolatency/tracebiolatency.go index 7c973a30..d6a8140d 100644 --- a/pkg/exporter/probe/tracebiolatency/tracebiolatency.go +++ b/pkg/exporter/probe/tracebiolatency/tracebiolatency.go @@ -6,205 +6,151 @@ import ( "encoding/binary" "errors" "fmt" - "log" - "sync" + "time" "unsafe" "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - + "github.com/alibaba/kubeskoop/pkg/exporter/probe" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" + log "github.com/sirupsen/logrus" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_biolat_event_t bpf ../../../../bpf/tracebiolatency.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 var ( - ModuleName = "insp_biolatency" // nolint - - probe = &BiolatencyProbe{once: sync.Once{}} - links = []link.Link{} - events = []string{"BIOLAT_10MS", "BIOLAT_100MS"} - - perfReader *perf.Reader + probeName = "biolatency" ) -type BiolatencyProbe struct { - enable bool - once sync.Once - sub chan<- proto.RawEvent - mtx sync.Mutex -} - -func GetProbe() 
*BiolatencyProbe { - return probe +func init() { + probe.MustRegisterEventProbe(probeName, bioLatencyProbeCreator) } -func (p *BiolatencyProbe) Name() string { - return ModuleName -} - -// Register register sub chan to get perf events -func (p *BiolatencyProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver - - return nil +func bioLatencyProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &BiolatencyProbe{ + sink: sink, + } + return probe.NewEventProbe(probeName, p), nil } -func (p *BiolatencyProbe) Ready() bool { - return p.enable +type BiolatencyProbe struct { + sink chan<- *probe.Event + objs bpfObjects + links []link.Link + reader *perf.Reader } -func (p *BiolatencyProbe) Close(_ proto.ProbeType) error { - if p.enable { - for _, link := range links { - link.Close() - } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} +func (p *BiolatencyProbe) Start(_ context.Context) error { + log.Debugf("start probe %s", probeName) + if err := p.loadAndAttachBPF(); err != nil { + _ = p.cleanup() + return err } - if perfReader != nil { - perfReader.Close() - perfReader = nil + var err error + p.reader, err = perf.NewReader(p.objs.InspBiolatEvts, int(unsafe.Sizeof(bpfInspBiolatEntryT{}))) + if err != nil { + _ = p.cleanup() + return err } - return nil -} + go p.perf() -func (p *BiolatencyProbe) GetEventNames() []string { - return events + // 开始针对perf事件进行读取 + return nil } -func (p *BiolatencyProbe) Start(ctx context.Context, _ proto.ProbeType) { - p.once.Do(func() { - err := start() - if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return - } - p.enable = true - }) - - if !p.enable { - // if load failed, do not start process - return - } - - slog.Debug("start probe", "module", ModuleName) - if perfReader == nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", "perf reader not ready") - return - } - - // 
开始针对perf事件进行读取 +func (p *BiolatencyProbe) perf() { for { - record, err := perfReader.Read() + record, err := p.reader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Infof("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Infof("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Infof("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } - // 解析perf事件信息,输出为proto.RawEvent + // 解析perf事件信息,输出为proto.Event var event bpfInspBiolatEventT // Parse the ringbuf event entry into a bpfEvent structure. if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Infof("%s failed parsing event, err: %v", probeName, err) continue } pid := event.Pid if et, err := nettop.GetEntityByPid(int(pid)); err != nil || et == nil { - slog.Ctx(ctx).Warn("unspecified event", "pid", pid, "task", bpfutil.GetCommString(event.Target)) + log.Warnf("%s got unspecified event, pid: %d, task %s", probeName, pid, bpfutil.GetCommString(event.Target)) continue } - rawevt := proto.RawEvent{ - EventType: "BIOLAT_10MS", - EventBody: fmt.Sprintf("%s %d latency %s", bpfutil.GetCommString(event.Target), event.Pid, bpfutil.GetHumanTimes(event.Latency)), + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Type: "BIOLAT_10MS", + Message: fmt.Sprintf("%s %d latency %s", bpfutil.GetCommString(event.Target), event.Pid, bpfutil.GetHumanTimes(event.Latency)), } - // 分发给注册的dispatcher,其余逻辑由框架完成 - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt - } + log.Errorf("sink event to 
channel") + p.sink <- evt } } -func start() error { +func (p *BiolatencyProbe) Stop(_ context.Context) error { + return p.cleanup() +} + +func (p *BiolatencyProbe) cleanup() error { + if p.reader != nil { + p.reader.Close() + } + + for _, link := range p.links { + link.Close() + } + + p.objs.Close() + + return nil +} + +func (p *BiolatencyProbe) loadAndAttachBPF() error { // 准备动作 if err := rlimit.RemoveMemlock(); err != nil { log.Fatal(err) } + p.links = nil + opts := ebpf.CollectionOptions{} opts.Programs = ebpf.ProgramOptions{ KernelTypes: bpfutil.LoadBTFSpecOrNil(), } - objs := bpfObjects{} // Load pre-compiled programs and maps into the kernel. - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %s", err.Error()) } - linkcreate, err := link.Kprobe("blk_account_io_start", objs.BiolatStart, nil) + linkcreate, err := link.Kprobe("blk_account_io_start", p.objs.BiolatStart, nil) if err != nil { return fmt.Errorf("link blk_account_io_start: %s", err.Error()) } - links = append(links, linkcreate) - linkdone, err := link.Kprobe("blk_account_io_done", objs.BiolatFinish, nil) + p.links = append(p.links, linkcreate) + + linkdone, err := link.Kprobe("blk_account_io_done", p.objs.BiolatFinish, nil) if err != nil { return fmt.Errorf("link blk_account_io_done: %s", err.Error()) } - links = append(links, linkdone) - reader, err := perf.NewReader(objs.InspBiolatEvts, int(unsafe.Sizeof(bpfInspBiolatEntryT{}))) - if err != nil { - return fmt.Errorf("perf new reader failed: %s", err.Error()) - } - perfReader = reader + p.links = append(p.links, linkdone) return nil - - // for { - // record, err := reader.Read() - // if err != nil { - // if errors.Is(err, ringbuf.ErrClosed) { - // log.Println("received signal, exiting..") - // return err - // } - // log.Printf("reading from reader: %s", err) - // continue - // } - - // if record.LostSamples != 0 { - // log.Printf("Perf event ring buffer 
full, dropped %d samples", record.LostSamples) - // continue - // } - - // var event bpfInspBiolatEventT - // // Parse the ringbuf event entry into a bpfEvent structure. - // if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - // log.Printf("parsing event: %s", err) - // continue - // } - - // fmt.Printf("%-10s %-6d %-6s\n", bpfutil.GetCommString(event.Target), event.Pid, bpfutil.GetHumanTimes(event.Latency)) - // } } diff --git a/pkg/exporter/probe/tracekernel/tracekernel.go b/pkg/exporter/probe/tracekernel/tracekernel.go index cd24bb2e..ce08279c 100644 --- a/pkg/exporter/probe/tracekernel/tracekernel.go +++ b/pkg/exporter/probe/tracekernel/tracekernel.go @@ -9,17 +9,19 @@ import ( "math/bits" "strings" "sync" + "time" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_kl_event_t bpf ../../../../bpf/kernellatency.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 @@ -42,153 +44,188 @@ const ( TXKERNEL_SLOW_METRIC = "kernellatency_txslow" RXKERNEL_SLOW100MS_METRIC = "kernellatency_rxslow100ms" TXKERNEL_SLOW100MS_METRIC = "kernellatency_txslow100ms" + + probeTypeEvent = 0 + probeTypeMetrics = 1 ) var ( - ModuleName = "insp_kernellatency" // nolint - probe = &KernelLatencyProbe{once: sync.Once{}, mtx: sync.Mutex{}, enabledProbes: map[proto.ProbeType]bool{}} - objs = bpfObjects{} - links = []link.Link{} - - events = []string{RXKERNEL_SLOW, TXKERNEL_SLOW} - metrics = []string{RXKERNEL_SLOW_METRIC, 
RXKERNEL_SLOW100MS_METRIC, TXKERNEL_SLOW_METRIC, TXKERNEL_SLOW100MS_METRIC} - metricsMap = map[string]map[uint32]uint64{} + metrics = []string{RXKERNEL_SLOW_METRIC, RXKERNEL_SLOW100MS_METRIC, TXKERNEL_SLOW_METRIC, TXKERNEL_SLOW100MS_METRIC} + probeName = "kernellatency" + latencyProbe = &kernelLatencyProbe{ + metricsMap: make(map[string]map[uint32]uint64), + } ) -func GetProbe() *KernelLatencyProbe { - return probe +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) +} + +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, metrics, p.CollectOnce) + + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func init() { - for m := range metrics { - metricsMap[metrics[m]] = map[uint32]uint64{} +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + sink: sink, } + return probe.NewEventProbe(probeName, p), nil } -type KernelLatencyProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool +type metricsProbe struct { } -func (p *KernelLatencyProbe) Name() string { - return ModuleName +func (p *metricsProbe) Start(ctx context.Context) error { + return latencyProbe.start(ctx, probe.ProbeTypeMetrics) } -func (p *KernelLatencyProbe) Ready() bool { - return p.enable +func (p *metricsProbe) Stop(ctx context.Context) error { + return latencyProbe.stop(ctx, probe.ProbeTypeMetrics) } -func (p *KernelLatencyProbe) GetEventNames() []string { - return events +func (p *metricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + return latencyProbe.copyMetricsMap(), nil } -func (p *KernelLatencyProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil +type eventProbe struct { + sink chan<- *probe.Event +} + 
+func (e *eventProbe) Start(ctx context.Context) error { + err := latencyProbe.start(ctx, probe.ProbeTypeEvent) + if err != nil { + return err } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil + latencyProbe.sink = e.sink + return nil +} + +func (e *eventProbe) Stop(ctx context.Context) error { + return latencyProbe.stop(ctx, probe.ProbeTypeEvent) +} + +type kernelLatencyProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [2]int + lock sync.Mutex + perfReader *perf.Reader + metricsMap map[string]map[uint32]uint64 + metricsLock sync.RWMutex +} + +func (p *kernelLatencyProbe) stop(_ context.Context, probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil + + p.refcnt[probeType]-- + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() } + return nil +} - for _, link := range links { +func (p *kernelLatencyProbe) cleanup() error { + if p.perfReader != nil { + p.perfReader.Close() + } + + for _, link := range p.links { link.Close() } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - metricsMap = map[string]map[uint32]uint64{} - delete(p.enabledProbes, probeType) - return nil -} + p.links = nil -func (p *KernelLatencyProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver + p.objs.Close() return nil } -func (p *KernelLatencyProbe) GetMetricNames() []string { - return metrics +func (p *kernelLatencyProbe) copyMetricsMap() map[string]map[uint32]uint64 { + p.metricsLock.RLock() + defer p.metricsLock.RUnlock() + return probe.CopyLegacyMetricsMap(p.metricsMap) } -func (p *KernelLatencyProbe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { - return metricsMap, nil +func (p *kernelLatencyProbe) totalReferenceCountLocked() int { + var c int + for 
_, n := range p.refcnt { + c += n + } + return c } -func (p *KernelLatencyProbe) Start(ctx context.Context, probeType proto.ProbeType) { - if p.enable { - p.enabledProbes[probeType] = true - return - } +func (p *kernelLatencyProbe) start(_ context.Context, probeType probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() + + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return fmt.Errorf("%s failed load bpf: %w", probeName, err) + } - p.once.Do(func() { - err := loadSync() + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspKlatencyEvent, int(unsafe.Sizeof(bpfInspKlEventT{}))) if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return + log.Errorf("%s failed create perf reader, err: %v", probeName, err) + _ = p.cleanup() + return fmt.Errorf("%s failed create bpf reader: %w", probeName, err) } - p.enable = true - }) - if !p.enable { - // if load failed, do not start process - return + go p.perfLoop() } - p.enabledProbes[probeType] = true - go p.startRX(ctx) - // go p.startTX(ctx) + return nil } -func (p *KernelLatencyProbe) updateMetrics(netns uint32, metric string) { - p.mtx.Lock() - defer p.mtx.Unlock() - if _, ok := metricsMap[metric]; ok { - metricsMap[metric][netns]++ +func (p *kernelLatencyProbe) updateMetrics(netns uint32, metrics string) { + p.metricsLock.Lock() + defer p.metricsLock.Unlock() + if _, ok := p.metricsMap[metrics]; !ok { + p.metricsMap[metrics] = make(map[uint32]uint64) } -} -func (p *KernelLatencyProbe) startRX(ctx context.Context) { - reader, err := perf.NewReader(objs.bpfMaps.InspKlatencyEvent, int(unsafe.Sizeof(bpfInspKlEventT{}))) - if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", err) - return - } + p.metricsMap[metrics][netns]++ +} +func (p *kernelLatencyProbe) perfLoop() { for { - record, err := 
reader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Errorf("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Warnf("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Warnf("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } var event bpfInspKlEventT if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Errorf("%s failed parsing event, err: %v", probeName, err) continue } - rawevt := proto.RawEvent{ - Netns: event.SkbMeta.Netns, + + netns := event.SkbMeta.Netns + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Labels: probe.LagacyEventLabels(netns), } /* #define RX_KLATENCY 1 @@ -197,7 +234,7 @@ func (p *KernelLatencyProbe) startRX(ctx context.Context) { tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), bits.ReverseBytes16(event.Tuple.Dport)) switch event.Direction { case 1: - rawevt.EventType = RXKERNEL_SLOW + evt.Type = RXKERNEL_SLOW latency := []string{fmt.Sprintf("latency:%s", bpfutil.GetHumanTimes(event.Latency))} if event.Point2 > event.Point1 { latency = append(latency, fmt.Sprintf("PREROUTING:%s", bpfutil.GetHumanTimes(event.Point2-event.Point1))) @@ -208,10 +245,10 @@ func (p *KernelLatencyProbe) startRX(ctx context.Context) { if 
event.Point4 > event.Point3 && event.Point3 != 0 { latency = append(latency, fmt.Sprintf("LOCAL_IN:%s", bpfutil.GetHumanTimes(event.Point4-event.Point3))) } - rawevt.EventBody = fmt.Sprintf("%s %s", tuple, strings.Join(latency, " ")) - p.updateMetrics(rawevt.Netns, RXKERNEL_SLOW_METRIC) + evt.Message = fmt.Sprintf("%s %s", tuple, strings.Join(latency, " ")) + p.updateMetrics(netns, RXKERNEL_SLOW_METRIC) case 2: - rawevt.EventType = TXKERNEL_SLOW + evt.Type = TXKERNEL_SLOW latency := []string{fmt.Sprintf("latency=%s", bpfutil.GetHumanTimes(event.Latency))} if event.Point3 > event.Point1 && event.Point1 != 0 { latency = append(latency, fmt.Sprintf("LOCAL_OUT=%s", bpfutil.GetHumanTimes(event.Point3-event.Point1))) @@ -219,21 +256,21 @@ func (p *KernelLatencyProbe) startRX(ctx context.Context) { if event.Point4 > event.Point3 && event.Point3 != 0 { latency = append(latency, fmt.Sprintf("POSTROUTING=%s", bpfutil.GetHumanTimes(event.Point4-event.Point3))) } - rawevt.EventBody = fmt.Sprintf("%s %s", tuple, strings.Join(latency, " ")) - p.updateMetrics(rawevt.Netns, TXKERNEL_SLOW_METRIC) + evt.Message = fmt.Sprintf("%s %s", tuple, strings.Join(latency, " ")) + p.updateMetrics(netns, TXKERNEL_SLOW_METRIC) default: - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "ignore", event) + log.Infof("%s failed parsing event %s, ignore", probeName, util.ToJSONString(evt)) continue } - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt } } } -func loadSync() error { +func (p *kernelLatencyProbe) loadAndAttachBPF() error { // 准备动作 if err := rlimit.RemoveMemlock(); err != nil { return err @@ -246,69 +283,69 @@ func loadSync() error { } // 获取Loaded的程序/map的fd信息 - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %v", err) } - 
progrcv, err := link.Kprobe(HOOK_IPRCV, objs.KlatencyIpRcv, &link.KprobeOptions{}) + progrcv, err := link.Kprobe(HOOK_IPRCV, p.objs.KlatencyIpRcv, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPRCV: %s", err.Error()) } - links = append(links, progrcv) + p.links = append(p.links, progrcv) - progrcvfin, err := link.Kprobe(HOOK_IPRCVFIN, objs.KlatencyIpRcvFinish, &link.KprobeOptions{}) + progrcvfin, err := link.Kprobe(HOOK_IPRCVFIN, p.objs.KlatencyIpRcvFinish, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPRCVFIN: %s", err.Error()) } - links = append(links, progrcvfin) + p.links = append(p.links, progrcvfin) - proglocal, err := link.Kprobe(HOOK_IPLOCAL, objs.KlatencyIpLocalDeliver, &link.KprobeOptions{}) + proglocal, err := link.Kprobe(HOOK_IPLOCAL, p.objs.KlatencyIpLocalDeliver, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPRCV: %s", err.Error()) } - links = append(links, proglocal) + p.links = append(p.links, proglocal) - proglocalfin, err := link.Kprobe(HOOK_IPLOCALFIN, objs.KlatencyIpLocalDeliverFinish, &link.KprobeOptions{}) + proglocalfin, err := link.Kprobe(HOOK_IPLOCALFIN, p.objs.KlatencyIpLocalDeliverFinish, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPLOCALFIN: %s", err.Error()) } - links = append(links, proglocalfin) + p.links = append(p.links, proglocalfin) - progxmit, err := link.Kprobe(HOOK_IPXMIT, objs.KlatencyIpQueueXmit, &link.KprobeOptions{}) + progxmit, err := link.Kprobe(HOOK_IPXMIT, p.objs.KlatencyIpQueueXmit, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPXMIT: %s", err.Error()) } - links = append(links, progxmit) + p.links = append(p.links, progxmit) - proglocalout, err := link.Kprobe(HOOK_IPLOCALOUT, objs.KlatencyIpLocal, &link.KprobeOptions{}) + proglocalout, err := link.Kprobe(HOOK_IPLOCALOUT, p.objs.KlatencyIpLocal, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPLOCALOUT: %s", err.Error()) } - links = 
append(links, proglocalout) + p.links = append(p.links, proglocalout) - progoutput, err := link.Kprobe(HOOK_IPOUTPUT, objs.KlatencyIpOutput, &link.KprobeOptions{}) + progoutput, err := link.Kprobe(HOOK_IPOUTPUT, p.objs.KlatencyIpOutput, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPOUTPUT: %s", err.Error()) } - links = append(links, progoutput) + p.links = append(p.links, progoutput) - progfin, err := link.Kprobe(HOOK_IPOUTPUTFIN, objs.KlatencyIpFinishOutput2, &link.KprobeOptions{}) + progfin, err := link.Kprobe(HOOK_IPOUTPUTFIN, p.objs.KlatencyIpFinishOutput2, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link HOOK_IPOUTPUTFIN: %s", err.Error()) } - links = append(links, progfin) + p.links = append(p.links, progfin) - progkfree, err := link.Kprobe("kfree_skb", objs.ReportKfree, &link.KprobeOptions{}) + progkfree, err := link.Kprobe("kfree_skb", p.objs.ReportKfree, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link kfree_skb: %s", err.Error()) } - links = append(links, progkfree) + p.links = append(p.links, progkfree) - progconsume, err := link.Kprobe("consume_skb", objs.ReportConsume, &link.KprobeOptions{}) + progconsume, err := link.Kprobe("consume_skb", p.objs.ReportConsume, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link consume_skb: %s", err.Error()) } - links = append(links, progconsume) + p.links = append(p.links, progconsume) return nil } diff --git a/pkg/exporter/probe/tracenetiftxlatency/tracenetiftxlatency.go b/pkg/exporter/probe/tracenetiftxlatency/tracenetiftxlatency.go index b0c7c14e..89d8972d 100644 --- a/pkg/exporter/probe/tracenetiftxlatency/tracenetiftxlatency.go +++ b/pkg/exporter/probe/tracenetiftxlatency/tracenetiftxlatency.go @@ -6,21 +6,21 @@ import ( "encoding/binary" "errors" "fmt" - "log" "math/bits" "sync" + "time" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - 
"github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) // nolint @@ -32,80 +32,167 @@ const ( ) var ( - ModuleName = "insp_netiftxlat" // nolint + metrics = []string{TXLAT_QDISC_SLOW, TXLAT_NETDEV_SLOW} + probeName = "netiftxlat" + _netifTxlatencyProbe = &netifTxlatencyProbe{} +) - probe = &NetifTxlatencyProbe{once: sync.Once{}, enabledProbes: map[proto.ProbeType]bool{}} - links = []link.Link{} - events = []string{"TXLAT_QDISC_100MS", "TXLAT_NETDEV_100MS"} - metrics = []string{TXLAT_QDISC_SLOW, TXLAT_NETDEV_SLOW} - metricsMap = map[string]map[uint32]uint64{} +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) +} - perfReader *perf.Reader -) +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, metrics, p.CollectOnce) -func GetProbe() *NetifTxlatencyProbe { - return probe + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func init() { - for m := range metrics { - metricsMap[metrics[m]] = map[uint32]uint64{} +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + sink: sink, + } + return probe.NewEventProbe(probeName, p), nil +} + +type metricsProbe struct { +} + +func (p *metricsProbe) Start(_ context.Context) error { + return _netifTxlatencyProbe.start(probe.ProbeTypeMetrics) +} + +func (p *metricsProbe) Stop(_ context.Context) error { + return _netifTxlatencyProbe.stop(probe.ProbeTypeMetrics) +} + +func (p *metricsProbe) 
CollectOnce() (map[string]map[uint32]uint64, error) { + return _netifTxlatencyProbe.copyMetricsMap(), nil +} + +type eventProbe struct { + sink chan<- *probe.Event +} + +func (e *eventProbe) Start(_ context.Context) error { + err := _netifTxlatencyProbe.start(probe.ProbeTypeEvent) + if err != nil { + return err } + + _netifTxlatencyProbe.sink = e.sink + return nil } -type NetifTxlatencyProbe struct { - enable bool - once sync.Once - sub chan<- proto.RawEvent - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool +func (e *eventProbe) Stop(_ context.Context) error { + return _netifTxlatencyProbe.stop(probe.ProbeTypeEvent) +} + +type netifTxlatencyProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [probe.ProbeTypeCount]int + lock sync.Mutex + perfReader *perf.Reader + metricsMap map[string]map[uint32]uint64 + metricsLock sync.RWMutex +} + +func (p *netifTxlatencyProbe) stop(probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) + } + + p.refcnt[probeType]-- + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() + } + return nil +} + +func (p *netifTxlatencyProbe) cleanup() error { + if p.perfReader != nil { + p.perfReader.Close() + } + + for _, link := range p.links { + link.Close() + } + + p.links = nil + + p.objs.Close() + + return nil } -func (p *NetifTxlatencyProbe) Name() string { - return ModuleName +func (p *netifTxlatencyProbe) copyMetricsMap() map[string]map[uint32]uint64 { + p.metricsLock.RLock() + defer p.metricsLock.RUnlock() + return probe.CopyLegacyMetricsMap(p.metricsMap) } -func (p *NetifTxlatencyProbe) Start(ctx context.Context, probeType proto.ProbeType) { - if p.enable { - p.enabledProbes[probeType] = true - return +func (p *netifTxlatencyProbe) totalReferenceCountLocked() int { + var c int + for _, n := range p.refcnt { + c += n } + return c +} + +func (p *netifTxlatencyProbe) start(probeType 
probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() + + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return fmt.Errorf("%s failed load bpf: %w", probeName, err) + } - // 将eBPF程序进行link - p.once.Do(func() { - err := start() + // 初始化map的读接口 + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspSklatEvent, int(unsafe.Sizeof(bpfInspNftxlatEventT{}))) if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return + log.Errorf("%s failed create perf reader, err: %v", probeName, err) + return err } - p.enable = true - }) - if !p.enable { - // if load failed, do not start process - return + go p.perfLoop() } - p.enabledProbes[probeType] = true - slog.Debug("start probe", "module", ModuleName) - if perfReader == nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", "perf reader not ready") - return + return nil +} + +func (p *netifTxlatencyProbe) updateMetrics(netns uint32, metrics string) { + p.metricsLock.Lock() + defer p.metricsLock.Unlock() + if _, ok := p.metricsMap[metrics]; !ok { + p.metricsMap[metrics] = make(map[uint32]uint64) } - // 开始针对perf事件进行读取 + + p.metricsMap[metrics][netns]++ +} + +func (p *netifTxlatencyProbe) perfLoop() { for { - record, err := perfReader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Errorf("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Warnf("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Warnf("%s perf event ring buffer full, drop: %d", probeName, 
record.LostSamples) continue } @@ -113,115 +200,44 @@ func (p *NetifTxlatencyProbe) Start(ctx context.Context, probeType proto.ProbeTy var event bpfInspNftxlatEventT // Parse the ringbuf event entry into a bpfEvent structure. if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Errorf("%s failed parsing event, err: %v", probeName, err) continue } - rawevt := proto.RawEvent{ - Netns: event.SkbMeta.Netns, + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Labels: probe.LagacyEventLabels(event.SkbMeta.Netns), } tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), bits.ReverseBytes16(event.Tuple.Dport)) - rawevt.EventBody = fmt.Sprintf("%s latency:%s", tuple, bpfutil.GetHumanTimes(event.Latency)) + evt.Message = fmt.Sprintf("%s latency:%s", tuple, bpfutil.GetHumanTimes(event.Latency)) /*#define THRESH #define ACTION_QDISC 1 #define ACTION_XMIT 2 */ if event.Type == 1 { - rawevt.EventType = "NETIFTXLAT_QDISC" - p.updateMetrics(rawevt.Netns, TXLAT_QDISC_SLOW) + evt.Type = "NETIFTXLAT_QDISC" + p.updateMetrics(event.SkbMeta.Netns, TXLAT_QDISC_SLOW) } else if event.Type == 2 { - rawevt.EventType = "NETIFTXLAT_XMIT" - p.updateMetrics(rawevt.Netns, TXLAT_NETDEV_SLOW) + evt.Type = "NETIFTXLAT_XMIT" + p.updateMetrics(event.SkbMeta.Netns, TXLAT_NETDEV_SLOW) } // 分发给注册的dispatcher,其余逻辑由框架完成 - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt } } -} - -// Register register sub chan to get perf events -func (p 
*NetifTxlatencyProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver - - return nil -} - -func (p *NetifTxlatencyProbe) Ready() bool { - return p.enable -} - -func (p *NetifTxlatencyProbe) Close(probeType proto.ProbeType) error { - if perfReader != nil { - perfReader.Close() - perfReader = nil - } - - if !p.enable { - return nil - } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil - } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil - } - - for _, link := range links { - link.Close() - } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - metricsMap = map[string]map[uint32]uint64{} - - return nil } -func (p *NetifTxlatencyProbe) updateMetrics(netns uint32, metric string) { - p.mtx.Lock() - defer p.mtx.Unlock() - if _, ok := metricsMap[metric]; ok { - metricsMap[metric][netns]++ - } +func (p *netifTxlatencyProbe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { + p.metricsLock.RLock() + defer p.metricsLock.RUnlock() + return probe.CopyLegacyMetricsMap(p.metricsMap), nil } -func (p *NetifTxlatencyProbe) GetEventNames() []string { - return events -} - -func (p *NetifTxlatencyProbe) GetMetricNames() []string { - return metrics -} - -func (p *NetifTxlatencyProbe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { - ets := nettop.GetAllEntity() - resMap := map[string]map[uint32]uint64{} - - for metric, v := range metricsMap { - resMap[metric] = make(map[uint32]uint64) - for _, et := range ets { - if et != nil { - nsinum := et.GetNetns() - if v, ok := v[uint32(nsinum)]; ok { - resMap[metric][uint32(nsinum)] = v - } else { - // if no kernel latency event recorded, set value to 0 - resMap[metric][uint32(nsinum)] = 0 - } - } - } - } - return resMap, nil -} - -func start() error { +func (p *netifTxlatencyProbe) loadAndAttachBPF() error { // 准备动作 if err := rlimit.RemoveMemlock(); err != nil { log.Fatal(err) @@ -234,36 +250,28 
@@ func start() error { } // 获取Loaded的程序/map的fd信息 - objs := bpfObjects{} - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %v", err) } // 执行link操作,保存rawfd - progQueue, err := link.Tracepoint("net", "net_dev_queue", objs.NetDevQueue, &link.TracepointOptions{}) - if err != nil { - return err - } - links = append(links, progQueue) - - progStart, err := link.Tracepoint("net", "net_dev_start_xmit", objs.NetDevStartXmit, &link.TracepointOptions{}) + progQueue, err := link.Tracepoint("net", "net_dev_queue", p.objs.NetDevQueue, &link.TracepointOptions{}) if err != nil { return err } - links = append(links, progStart) + p.links = append(p.links, progQueue) - progXmit, err := link.Tracepoint("net", "net_dev_xmit", objs.NetDevXmit, &link.TracepointOptions{}) + progStart, err := link.Tracepoint("net", "net_dev_start_xmit", p.objs.NetDevStartXmit, &link.TracepointOptions{}) if err != nil { return err } - links = append(links, progXmit) + p.links = append(p.links, progStart) - // 初始化map的读接口 - reader, err := perf.NewReader(objs.bpfMaps.InspSklatEvent, int(unsafe.Sizeof(bpfInspNftxlatEventT{}))) + progXmit, err := link.Tracepoint("net", "net_dev_xmit", p.objs.NetDevXmit, &link.TracepointOptions{}) if err != nil { return err } - perfReader = reader + p.links = append(p.links, progXmit) return nil } diff --git a/pkg/exporter/probe/tracenetsoftirq/tracenetsoftirq.go b/pkg/exporter/probe/tracenetsoftirq/tracenetsoftirq.go index ebf7525a..6c4a8dde 100644 --- a/pkg/exporter/probe/tracenetsoftirq/tracenetsoftirq.go +++ b/pkg/exporter/probe/tracenetsoftirq/tracenetsoftirq.go @@ -7,17 +7,19 @@ import ( "errors" "fmt" "sync" + "time" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + 
"github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_softirq_event_t bpf ../../../../bpf/net_softirq.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 @@ -29,132 +31,178 @@ const ( ) var ( - ModuleName = "insp_netsoftirq" // nolint - probe = &NetSoftirqProbe{once: sync.Once{}, mtx: sync.Mutex{}, enabledProbes: map[proto.ProbeType]bool{}} - objs = bpfObjects{} - links = []link.Link{} - metricsMap = map[string]map[uint32]uint64{} - - events = []string{"NETSOFTIRQ_SCHED_SLOW", "NETSOFTIRQ_SCHED_100MS", "NETSOFTIRQ_EXCUTE_SLOW", "NETSOFTIRQ_EXCUTE_100MS"} - metrics = []string{NETSOFTIRQ_SCHED_SLOW, NETSOFTIRQ_SCHED_100MS, NETSOFTIRQ_EXCUTE_SLOW, NETSOFTIRQ_EXCUTE_100MS} + metrics = []string{NETSOFTIRQ_SCHED_SLOW, NETSOFTIRQ_SCHED_100MS, NETSOFTIRQ_EXCUTE_SLOW, NETSOFTIRQ_EXCUTE_100MS} + probeName = "netsoftirq" + _netSoftirqProbe = &netSoftirqProbe{} ) -func GetProbe() *NetSoftirqProbe { - return probe +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) } -func init() { - for m := range metrics { - metricsMap[metrics[m]] = map[uint32]uint64{ - 0: 0, - } +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, metrics, p.CollectOnce) + + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil +} + +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + sink: sink, } + return probe.NewEventProbe(probeName, p), nil } -type NetSoftirqProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes 
map[proto.ProbeType]bool +type metricsProbe struct { } -func (p *NetSoftirqProbe) Name() string { - return ModuleName +func (p *metricsProbe) Start(_ context.Context) error { + return _netSoftirqProbe.start(probe.ProbeTypeMetrics) } -func (p *NetSoftirqProbe) Ready() bool { - return p.enable +func (p *metricsProbe) Stop(_ context.Context) error { + return _netSoftirqProbe.stop(probe.ProbeTypeMetrics) } -func (p *NetSoftirqProbe) GetEventNames() []string { - return events +func (p *metricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + return _netSoftirqProbe.copyMetricsMap(), nil } -func (p *NetSoftirqProbe) GetMetricNames() []string { - return metrics +type eventProbe struct { + sink chan<- *probe.Event } -func (p *NetSoftirqProbe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { - return metricsMap, nil +func (e *eventProbe) Start(_ context.Context) error { + err := _netSoftirqProbe.start(probe.ProbeTypeEvent) + if err != nil { + return err + } + + _netSoftirqProbe.sink = e.sink + return nil +} + +func (e *eventProbe) Stop(_ context.Context) error { + return _netSoftirqProbe.stop(probe.ProbeTypeEvent) +} + +type netSoftirqProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [probe.ProbeTypeCount]int + lock sync.Mutex + perfReader *perf.Reader + metricsMap map[string]map[uint32]uint64 + metricsLock sync.RWMutex } -func (p *NetSoftirqProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil +func (p *netSoftirqProbe) stop(probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil + p.refcnt[probeType]-- + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil + return nil +} + +func (p *netSoftirqProbe) cleanup() error { + if 
p.perfReader != nil { + p.perfReader.Close() } - for _, link := range links { + for _, link := range p.links { link.Close() } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - metricsMap = map[string]map[uint32]uint64{} - delete(p.enabledProbes, probeType) + p.links = nil + + p.objs.Close() + return nil } -func (p *NetSoftirqProbe) Start(ctx context.Context, probeType proto.ProbeType) { - if p.enable { - p.enabledProbes[probeType] = true - return +func (p *netSoftirqProbe) copyMetricsMap() map[string]map[uint32]uint64 { + p.metricsLock.RLock() + defer p.metricsLock.RUnlock() + return probe.CopyLegacyMetricsMap(p.metricsMap) +} + +func (p *netSoftirqProbe) totalReferenceCountLocked() int { + var c int + for _, n := range p.refcnt { + c += n } + return c +} + +func (p *netSoftirqProbe) start(probeType probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() - p.once.Do(func() { - err := loadSync() + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return fmt.Errorf("%s failed load bpf: %w", probeName, err) + } + + // 初始化map的读接口 + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspSoftirqEvents, int(unsafe.Sizeof(bpfInspSoftirqEventT{}))) if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return + log.Errorf("%s failed create perf reader, err: %v", probeName, err) + return err } - p.enable = true - }) - if !p.enable { - // if load failed, do not start process - return + go p.perfLoop() } - p.enabledProbes[probeType] = true - reader, err := perf.NewReader(objs.bpfMaps.InspSoftirqEvents, int(unsafe.Sizeof(bpfInspSoftirqEventT{}))) - if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", err) - return + return nil +} + +func (p *netSoftirqProbe) updateMetrics(metrics string) { + p.metricsLock.Lock() + defer 
p.metricsLock.Unlock() + if _, ok := p.metricsMap[metrics]; !ok { + p.metricsMap[metrics] = make(map[uint32]uint64) } + p.metricsMap[metrics][0]++ +} + +func (p *netSoftirqProbe) perfLoop() { for { - record, err := reader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Errorf("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Warnf("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Warnf("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } var event bpfInspSoftirqEventT if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Errorf("%s failed parsing event, err: %v", probeName, err) continue } - rawevt := proto.RawEvent{ - Netns: 0, + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), } /* @@ -164,51 +212,35 @@ func (p *NetSoftirqProbe) Start(ctx context.Context, probeType proto.ProbeType) switch event.Phase { case 1: if event.Latency > 100000000 { - rawevt.EventType = "NETSOFTIRQ_SCHED_100MS" + evt.Type = "NETSOFTIRQ_SCHED_100MS" p.updateMetrics(NETSOFTIRQ_SCHED_100MS) } else { - rawevt.EventType = "NETSOFTIRQ_SCHED_SLOW" + evt.Type = "NETSOFTIRQ_SCHED_SLOW" p.updateMetrics(NETSOFTIRQ_SCHED_SLOW) } case 2: if event.Latency > 100000000 { - rawevt.EventType = "NETSOFTIRQ_EXCUTE_100MS" + evt.Type = "NETSOFTIRQ_EXCUTE_100MS" p.updateMetrics(NETSOFTIRQ_EXCUTE_100MS) } else { - rawevt.EventType = "NETSOFTIRQ_EXCUTE_SLOW" + evt.Type = "NETSOFTIRQ_EXCUTE_SLOW" p.updateMetrics(NETSOFTIRQ_EXCUTE_SLOW) } default: - slog.Ctx(ctx).Info("parsing 
event", "module", ModuleName, "ignore", event) + log.Infof("%s failed parsing event, phase: %d", probeName, event.Phase) continue } - rawevt.EventBody = fmt.Sprintf("cpu=%d pid=%d latency=%s ", event.Cpu, event.Pid, bpfutil.GetHumanTimes(event.Latency)) - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt + evt.Message = fmt.Sprintf("cpu=%d pid=%d latency=%s ", event.Cpu, event.Pid, bpfutil.GetHumanTimes(event.Latency)) + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt } } } -func (p *NetSoftirqProbe) updateMetrics(k string) { - p.mtx.Lock() - defer p.mtx.Unlock() - if _, ok := metricsMap[k]; ok { - metricsMap[k][0]++ - } -} - -func (p *NetSoftirqProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver - - return nil -} - -func loadSync() error { +func (p *netSoftirqProbe) loadAndAttachBPF() error { // 准备动作 if err := rlimit.RemoveMemlock(); err != nil { return err @@ -221,27 +253,27 @@ func loadSync() error { } // 获取Loaded的程序/map的fd信息 - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %v", err) } - prograise, err := link.Tracepoint("irq", "softirq_raise", objs.TraceSoftirqRaise, &link.TracepointOptions{}) + prograise, err := link.Tracepoint("irq", "softirq_raise", p.objs.TraceSoftirqRaise, &link.TracepointOptions{}) if err != nil { return fmt.Errorf("link softirq_raise: %s", err.Error()) } - links = append(links, prograise) + p.links = append(p.links, prograise) - progentry, err := link.Tracepoint("irq", "softirq_entry", objs.TraceSoftirqEntry, &link.TracepointOptions{}) + progentry, err := link.Tracepoint("irq", "softirq_entry", p.objs.TraceSoftirqEntry, &link.TracepointOptions{}) if err != nil { return fmt.Errorf("link softirq_entry: %s", err.Error()) } - links = append(links, progentry) + p.links = 
append(p.links, progentry) - progexit, err := link.Tracepoint("irq", "softirq_exit", objs.TraceSoftirqExit, &link.TracepointOptions{}) + progexit, err := link.Tracepoint("irq", "softirq_exit", p.objs.TraceSoftirqExit, &link.TracepointOptions{}) if err != nil { - return fmt.Errorf("link softirq_exit: %s", err.Error()) + return fmt.Errorf("link softirq_exit: %w", err) } - links = append(links, progexit) + p.links = append(p.links, progexit) return nil } diff --git a/pkg/exporter/probe/tracepacketloss/packetloss.go b/pkg/exporter/probe/tracepacketloss/packetloss.go index e237867a..efaa988f 100644 --- a/pkg/exporter/probe/tracepacketloss/packetloss.go +++ b/pkg/exporter/probe/tracepacketloss/packetloss.go @@ -6,6 +6,11 @@ import ( "encoding/binary" "errors" "fmt" + "time" + + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" "math/bits" "strings" @@ -14,27 +19,23 @@ import ( "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" - "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_pl_event_t -type insp_pl_metric_t bpf ../../../../bpf/packetloss.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 // nolint const ( - ModuleName = "insp_packetloss" - PACKETLOSS_ABNORMAL = "packetloss_abnormal" - PACKETLOSS_TOTAL = "packetloss_total" - PACKETLOSS_NETFILTER = "packetloss_netfilter" - PACKETLOSS_TCPSTATEM = "packetloss_tcpstatm" - PACKETLOSS_TCPRCV = "packetloss_tcprcv" - PACKETLOSS_TCPHANDLE = "packetloss_tcphandle" + PACKETLOSS_ABNORMAL = "abnormal" + PACKETLOSS_TOTAL = "total" + PACKETLOSS_NETFILTER = "netfilter" + PACKETLOSS_TCPSTATEM = "tcpstatm" + PACKETLOSS_TCPRCV = 
"tcprcv" + PACKETLOSS_TCPHANDLE = "tcphandle" PACKETLOSS = "PACKETLOSS" ) @@ -48,120 +49,156 @@ var ( tcprcvSymbol = "tcp_v4_rcv" tcpdorcvSymbol = "tcp_v4_do_rcv" - probe = &PacketLossProbe{enabledProbes: map[proto.ProbeType]bool{}} - objs = bpfObjects{} - links = []link.Link{} - events = []string{PACKETLOSS} + probeName = "packetloss" packetLossMetrics = []string{PACKETLOSS_TCPHANDLE, PACKETLOSS_TCPRCV, PACKETLOSS_ABNORMAL, PACKETLOSS_TOTAL, PACKETLOSS_NETFILTER, PACKETLOSS_TCPSTATEM} + + _packetLossProbe = &packetLossProbe{} ) func init() { - // sk_stream_kill_queues: skb moved to sock rqueue and then will be cleanup by this symbol - ignore("sk_stream_kill_queues") - // tcp_v4_rcv: skb of ingress stream will pass the symbol. - // ignore("tcp_v4_rcv") - // ignore("tcp_v6_rcv") - // // tcp_v4_do_rcv: skb drop when check CSUMERRORS - // ignore("tcp_v4_do_rcv") - // skb_queue_purge netlink recv function - ignore("skb_queue_purge") - ignore("nfnetlink_rcv_batch") - // unix_stream_connect unix stream io is not cared - ignore("unix_stream_connect") - ignore("skb_release_data") - - // kfree_skb_list: free skb batch - useless("kfree_skb_list") - // kfree_skb_reason: wrapper of kfree_skb in newer kernel - useless("kfree_skb_reason") - // kfree_skb.part - useless("kfree_skb.part.0") - // kfree_skb - useless("kfree_skb") + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) } -func ignore(sym string) { - ignoreSymbolList[sym] = struct{}{} +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetricsWithUnderscore(probeName, packetLossMetrics, p.CollectOnce) + + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } -func useless(sym string) { - uselessSymbolList[sym] = struct{}{} +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + 
sink: sink, + } + return probe.NewEventProbe(probeName, p), nil } -func GetProbe() *PacketLossProbe { - return probe +type metricsProbe struct { } -type PacketLossProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool +func (p *metricsProbe) Start(ctx context.Context) error { + return _packetLossProbe.start(ctx, probe.ProbeTypeMetrics) } -func (p *PacketLossProbe) Name() string { - return ModuleName +func (p *metricsProbe) Stop(ctx context.Context) error { + return _packetLossProbe.stop(ctx, probe.ProbeTypeMetrics) } -func (p *PacketLossProbe) Ready() bool { - return p.enable +func (p *metricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + return _packetLossProbe.collect() } -func (p *PacketLossProbe) GetEventNames() []string { - return events +type eventProbe struct { + sink chan<- *probe.Event } -func (p *PacketLossProbe) GetMetricNames() []string { - res := []string{} - for _, m := range packetLossMetrics { - res = append(res, strings.ToLower(m)) +func (e *eventProbe) Start(ctx context.Context) error { + err := _packetLossProbe.start(ctx, probe.ProbeTypeEvent) + if err != nil { + return err } - return res + + _packetLossProbe.sink = e.sink + return nil +} + +func (e *eventProbe) Stop(ctx context.Context) error { + return _packetLossProbe.stop(ctx, probe.ProbeTypeEvent) +} + +type packetLossProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [probe.ProbeTypeCount]int + lock sync.Mutex + perfReader *perf.Reader } -func (p *PacketLossProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil +func (p *packetLossProbe) stop(_ context.Context, probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil + p.refcnt[probeType]-- + + if p.refcnt[probe.ProbeTypeEvent] 
== 0 { + if p.perfReader != nil { + p.perfReader.Close() + } } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil + + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() } + return nil +} - for _, link := range links { +func (p *packetLossProbe) cleanup() error { + for _, link := range p.links { link.Close() } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - delete(p.enabledProbes, probeType) + p.links = nil + + p.objs.Close() + return nil } -func (p *PacketLossProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver +func (p *packetLossProbe) totalReferenceCountLocked() int { + var c int + for _, n := range p.refcnt { + c += n + } + return c +} + +func (p *packetLossProbe) start(ctx context.Context, probeType probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() + + if p.refcnt[probeType] != 0 { + return fmt.Errorf("%s(%s) has already started", probeName, probeType) + } + + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return + } + + if p.refcnt[probe.ProbeTypeEvent] == 1 { + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspPlEvent, int(unsafe.Sizeof(bpfInspPlEventT{}))) + if err != nil { + log.Errorf("%s error create perf reader, err: %v", probeName, err) + _ = p.stop(ctx, probeType) + return + } + go p.perfLoop() + } + } return nil } -func (p *PacketLossProbe) Collect(ctx context.Context) (map[string]map[uint32]uint64, error) { +func (p *packetLossProbe) collect() (map[string]map[uint32]uint64, error) { + //TODO metrics of packetloss should be counter, not gauge. 
+ // we should create metrics from events resMap := make(map[string]map[uint32]uint64) for _, metric := range packetLossMetrics { resMap[metric] = make(map[uint32]uint64) } - m := objs.bpfMaps.InspPlMetric + m := p.objs.bpfMaps.InspPlMetric if m == nil { - slog.Ctx(ctx).Warn("get metric map with nil", "module", ModuleName) + log.Warnf("%s get metric map with nil", probeName) return nil, nil } var ( @@ -197,7 +234,7 @@ func (p *PacketLossProbe) Collect(ctx context.Context) (map[string]map[uint32]ui sym, err := bpfutil.GetSymPtFromBpfLocation(key.Location) if err != nil { - slog.Ctx(ctx).Warn("get sym failed", "err", err, "module", ModuleName, "location", key.Location) + log.Warnf("%s get sym failed, location: %x, err: %v", probeName, key.Location, err) continue } @@ -229,69 +266,70 @@ func (p *PacketLossProbe) Collect(ctx context.Context) (map[string]map[uint32]ui return resMap, nil } -func (p *PacketLossProbe) Start(ctx context.Context, probeType proto.ProbeType) { - if p.enable { - p.enabledProbes[probeType] = true - return +func (p *packetLossProbe) loadAndAttachBPF() error { + // Allow the current process to lock memory for eBPF resources. + if err := rlimit.RemoveMemlock(); err != nil { + return fmt.Errorf("remove limit failed: %s", err.Error()) } - p.once.Do(func() { - err := loadSync() - if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return - } - p.enable = true - }) + opts := ebpf.CollectionOptions{} - if !p.enable { - // if load failed, do not start process - return + opts.Programs = ebpf.ProgramOptions{ + KernelTypes: bpfutil.LoadBTFSpecOrNil(), + } + + // Load pre-compiled programs and maps into the kernel. 
+ if err := loadBpfObjects(&p.objs, &opts); err != nil { + return fmt.Errorf("loading objects: %s", err.Error()) } - p.enabledProbes[probeType] = true - reader, err := perf.NewReader(objs.bpfMaps.InspPlEvent, int(unsafe.Sizeof(bpfInspPlEventT{}))) + pl, err := link.Tracepoint("skb", "kfree_skb", p.objs.KfreeSkb, &link.TracepointOptions{}) if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", err) - return + return fmt.Errorf("link tracepoint kfree_skb failed: %s", err.Error()) } + p.links = append(p.links, pl) + return nil +} +func (p *packetLossProbe) perfLoop() { for { anothor_loop: - record, err := reader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Infof("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Errorf("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Warnf("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } var event bpfInspPlEventT if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Errorf("%s failed parsing event, err: %v", probeName, err) continue } // filter netlink/unixsock/tproxy packet if event.Tuple.Dport == 0 && event.Tuple.Sport == 0 { continue } - rawevt := proto.RawEvent{ - Netns: event.SkbMeta.Netns, - EventType: PACKETLOSS, + + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Type: PACKETLOSS, + Labels: probe.LagacyEventLabels(event.SkbMeta.Netns), } tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", 
bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), bits.ReverseBytes16(event.Tuple.Dport)) - stacks, err := bpfutil.GetSymsByStack(uint32(event.StackId), objs.InspPlStack) + stacks, err := bpfutil.GetSymsByStack(uint32(event.StackId), p.objs.InspPlStack) if err != nil { - slog.Ctx(ctx).Warn("get sym by stack with", "module", ModuleName, "err", err) + log.Warnf("%s failed get sym by stack, err: %v", probeName, err) continue } strs := []string{} @@ -307,35 +345,11 @@ func (p *PacketLossProbe) Start(ctx context.Context, probeType proto.ProbeType) stackStr := strings.Join(strs, " ") - rawevt.EventBody = fmt.Sprintf("%s stacktrace:%s", tuple, stackStr) - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt - } - } -} + evt.Message = fmt.Sprintf("%s stacktrace:%s", tuple, stackStr) -func loadSync() error { - // Allow the current process to lock memory for eBPF resources. - if err := rlimit.RemoveMemlock(); err != nil { - return fmt.Errorf("remove limit failed: %s", err.Error()) - } - - opts := ebpf.CollectionOptions{} - - opts.Programs = ebpf.ProgramOptions{ - KernelTypes: bpfutil.LoadBTFSpecOrNil(), - } - - // Load pre-compiled programs and maps into the kernel. 
- if err := loadBpfObjects(&objs, &opts); err != nil { - return fmt.Errorf("loading objects: %s", err.Error()) - } - - pl, err := link.Tracepoint("skb", "kfree_skb", objs.KfreeSkb, &link.TracepointOptions{}) - if err != nil { - return fmt.Errorf("link tracepoint kfree_skb failed: %s", err.Error()) + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt + } } - links = append(links, pl) - return nil } diff --git a/pkg/exporter/probe/tracesocketlatency/socketlatency.go b/pkg/exporter/probe/tracesocketlatency/socketlatency.go index 1a4b407a..00c21cfb 100644 --- a/pkg/exporter/probe/tracesocketlatency/socketlatency.go +++ b/pkg/exporter/probe/tracesocketlatency/socketlatency.go @@ -7,19 +7,20 @@ import ( "errors" "fmt" "math/bits" - "strings" "sync" + "time" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_sklat_metric_t -type insp_sklat_event_t bpf ../../../../bpf/socketlatency.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 @@ -55,103 +56,183 @@ const ( ) var ( - probe = &SocketLatencyProbe{once: sync.Once{}, mtx: sync.Mutex{}, enabledProbes: map[proto.ProbeType]bool{}} - objs = bpfObjects{} - links = []link.Link{} - events = []string{SOCKETLAT_READSLOW, SOCKETLAT_SENDSLOW} - + probeName = "socketlatency" + _socketLatency = &socketLatencyProbe{} socketlatencyMetrics = []string{READ100MS, READ1MS, WRITE100MS, WRITE1MS} ) -func GetProbe() *SocketLatencyProbe { - return probe +func init() { + 
probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) +} + +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, socketlatencyMetrics, p.CollectOnce) + + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil +} + +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + sink: sink, + } + return probe.NewEventProbe(probeName, p), nil +} + +type metricsProbe struct { +} + +func (p *metricsProbe) Start(_ context.Context) error { + return _socketLatency.start(probe.ProbeTypeMetrics) } -type SocketLatencyProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool +func (p *metricsProbe) Stop(_ context.Context) error { + return _socketLatency.stop(probe.ProbeTypeMetrics) } -func (p *SocketLatencyProbe) Name() string { - return ModuleName +func (p *metricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + return _socketLatency.collect() } -func (p *SocketLatencyProbe) Ready() bool { - return p.enable +type eventProbe struct { + sink chan<- *probe.Event } -func (p *SocketLatencyProbe) GetEventNames() []string { - return events +func (e *eventProbe) Start(_ context.Context) error { + err := _socketLatency.start(probe.ProbeTypeEvent) + if err != nil { + return err + } + + _socketLatency.sink = e.sink + return nil } -func (p *SocketLatencyProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil +func (e *eventProbe) Stop(_ context.Context) error { + return _socketLatency.stop(probe.ProbeTypeEvent) +} + +type socketLatencyProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [probe.ProbeTypeCount]int + lock sync.Mutex + perfReader *perf.Reader +} + +func (p *socketLatencyProbe) 
stop(probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil + p.refcnt[probeType]-- + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil + return nil +} + +func (p *socketLatencyProbe) cleanup() error { + if p.perfReader != nil { + p.perfReader.Close() } - for _, link := range links { + for _, link := range p.links { link.Close() } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - delete(p.enabledProbes, probeType) + p.links = nil + + p.objs.Close() + return nil } -func (p *SocketLatencyProbe) GetMetricNames() []string { - res := []string{} - for _, m := range socketlatencyMetrics { - res = append(res, strings.ToLower(m)) +func (p *socketLatencyProbe) totalReferenceCountLocked() int { + var c int + for _, n := range p.refcnt { + c += n } - return res + return c } -func (p *SocketLatencyProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver +func (p *socketLatencyProbe) start(probeType probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() + + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return + } + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspSklatEvents, int(unsafe.Sizeof(bpfInspSklatEventT{}))) + if err != nil { + log.Warnf("%s failed create new perf reader, err: %v", probeName, err) + return + } + + go p.perfLoop() + } return nil } -func (p *SocketLatencyProbe) Start(ctx context.Context, probeType proto.ProbeType) { - // metric and events both start probe - if p.enable { - p.enabledProbes[probeType] = true - return - } - p.once.Do(func() { - err := loadSync() 
+func (p *socketLatencyProbe) perfLoop() { + for { + record, err := p.perfReader.Read() if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return + if errors.Is(err, ringbuf.ErrClosed) { + log.Infof("%s received signal, exiting..", probeName) + return + } + log.Infof("%s failed reading from reader, err: %v", probeName, err) + continue } - p.enable = true - }) - if !p.enable { - // if load failed, do not start process - return - } - p.enabledProbes[probeType] = true + if record.LostSamples != 0 { + log.Infof("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) + continue + } - p.startEventPoll(ctx) + var event bpfInspSklatEventT + if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { + log.Infof("%s failed parsing event, err: %v", probeName, err) + continue + } + // filter netlink/unixsock/tproxy packet + if event.Tuple.Dport == 0 && event.Tuple.Sport == 0 { + continue + } + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Labels: probe.LagacyEventLabels(event.SkbMeta.Netns), + } + /* + #define ACTION_READ 1 + #define ACTION_WRITE 2 + */ + if event.Direction == ACTION_READ { + evt.Type = SOCKETLAT_READSLOW + } else if event.Direction == ACTION_WRITE { + evt.Type = SOCKETLAT_SENDSLOW + } + + tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), bits.ReverseBytes16(event.Tuple.Dport)) + evt.Message = fmt.Sprintf("%s latency=%s", tuple, bpfutil.GetHumanTimes(event.Latency)) + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt + } + } } -func (p *SocketLatencyProbe) Collect(_ context.Context) (map[string]map[uint32]uint64, error) { 
+func (p *socketLatencyProbe) collect() (map[string]map[uint32]uint64, error) { res := map[string]map[uint32]uint64{} for _, mtr := range socketlatencyMetrics { res[mtr] = map[uint32]uint64{} @@ -193,62 +274,7 @@ func (p *SocketLatencyProbe) Collect(_ context.Context) (map[string]map[uint32]u return res, nil } -func (p *SocketLatencyProbe) startEventPoll(ctx context.Context) { - reader, err := perf.NewReader(objs.bpfMaps.InspSklatEvents, int(unsafe.Sizeof(bpfInspSklatEventT{}))) - if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", err) - return - } - - for { - record, err := reader.Read() - if err != nil { - if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) - return - } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) - continue - } - - if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) - continue - } - - var event bpfInspSklatEventT - if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) - continue - } - // filter netlink/unixsock/tproxy packet - if event.Tuple.Dport == 0 && event.Tuple.Sport == 0 { - continue - } - rawevt := proto.RawEvent{ - Netns: event.SkbMeta.Netns, - } - /* - #define ACTION_READ 1 - #define ACTION_WRITE 2 - */ - if event.Direction == ACTION_READ { - rawevt.EventType = SOCKETLAT_READSLOW - } else if event.Direction == ACTION_WRITE { - rawevt.EventType = SOCKETLAT_SENDSLOW - } - - tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), 
bits.ReverseBytes16(event.Tuple.Dport)) - rawevt.EventBody = fmt.Sprintf("%s latency=%s", tuple, bpfutil.GetHumanTimes(event.Latency)) - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt - } - } - -} - -func loadSync() error { +func (p *socketLatencyProbe) loadAndAttachBPF() error { // Allow the current process to lock memory for eBPF resources. if err := rlimit.RemoveMemlock(); err != nil { return fmt.Errorf("remove limit failed: %s", err.Error()) @@ -261,47 +287,47 @@ func loadSync() error { } // Load pre-compiled programs and maps into the kernel. - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %s", err.Error()) } - linkcreate, err := link.Kprobe("inet_ehash_nolisten", objs.SockCreate, nil) + linkcreate, err := link.Kprobe("inet_ehash_nolisten", p.objs.SockCreate, nil) if err != nil { return fmt.Errorf("link inet_ehash_nolisten: %s", err.Error()) } - links = append(links, linkcreate) + p.links = append(p.links, linkcreate) - linkreceive, err := link.Kprobe("sock_def_readable", objs.SockReceive, nil) + linkreceive, err := link.Kprobe("sock_def_readable", p.objs.SockReceive, nil) if err != nil { return fmt.Errorf("link sock_def_readable: %s", err.Error()) } - links = append(links, linkreceive) + p.links = append(p.links, linkreceive) - linkread, err := link.Kprobe("tcp_cleanup_rbuf", objs.SockRead, nil) + linkread, err := link.Kprobe("tcp_cleanup_rbuf", p.objs.SockRead, nil) if err != nil { return fmt.Errorf("link tcp_cleanup_rbuf: %s", err.Error()) } - links = append(links, linkread) + p.links = append(p.links, linkread) - linkwrite, err := link.Kprobe("tcp_sendmsg_locked", objs.SockWrite, nil) + linkwrite, err := link.Kprobe("tcp_sendmsg_locked", p.objs.SockWrite, nil) if err != nil { return fmt.Errorf("link tcp_sendmsg_locked: %s", err.Error()) } - links = append(links, linkwrite) + p.links = append(p.links, 
linkwrite) - linksend, err := link.Kprobe("tcp_write_xmit", objs.SockSend, nil) + linksend, err := link.Kprobe("tcp_write_xmit", p.objs.SockSend, nil) if err != nil { return fmt.Errorf("link tcp_write_xmit: %s", err.Error()) } - links = append(links, linksend) + p.links = append(p.links, linksend) - linkdestroy, err := link.Kprobe("tcp_done", objs.SockDestroy, nil) + linkdestroy, err := link.Kprobe("tcp_done", p.objs.SockDestroy, nil) if err != nil { return fmt.Errorf("link tcp_done: %s", err.Error()) } - links = append(links, linkdestroy) + p.links = append(p.links, linkdestroy) - err = bpfutil.MustPin(objs.InspSklatMetric, ModuleName) + err = bpfutil.MustPin(p.objs.InspSklatMetric, probeName) if err != nil { return fmt.Errorf("pin map %s failed: %s", ModuleName, err.Error()) } diff --git a/pkg/exporter/probe/tracetcpreset/tracetcpreset.go b/pkg/exporter/probe/tracetcpreset/tracetcpreset.go index a482c404..e1112ab6 100644 --- a/pkg/exporter/probe/tracetcpreset/tracetcpreset.go +++ b/pkg/exporter/probe/tracetcpreset/tracetcpreset.go @@ -6,21 +6,21 @@ import ( "encoding/binary" "errors" "fmt" - "math/bits" - "sync" "syscall" + "time" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type insp_tcpreset_event_t bpf ../../../../bpf/tcpreset.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 @@ -34,158 +34,134 @@ const ( ) var ( - ModuleName = "insp_tcpreset" // nolint - objs = bpfObjects{} - probe = &TCPResetProbe{once: sync.Once{}, mtx: sync.Mutex{}, 
enabledProbes: map[proto.ProbeType]bool{}} - links = []link.Link{} - - events = []string{TCPRESET_NOSOCK, TCPRESET_ACTIVE, TCPRESET_PROCESS, TCPRESET_RECEIVE} + probeName = "tcpreset" ) -func GetProbe() *TCPResetProbe { - return probe +func init() { + probe.MustRegisterEventProbe(probeName, eventProbeCreator) } -type TCPResetProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool -} - -func (p *TCPResetProbe) Name() string { - return ModuleName -} - -func (p *TCPResetProbe) Ready() bool { - return p.enable -} - -func (p *TCPResetProbe) GetEventNames() []string { - return events -} - -func (p *TCPResetProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil - } - - if _, ok := p.enabledProbes[probeType]; !ok { - return nil - } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &tcpResetProbe{ + sink: sink, } - - for _, link := range links { - link.Close() - } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - - delete(p.enabledProbes, probeType) - return nil + return probe.NewEventProbe(probeName, p), nil } -func (p *TCPResetProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver - - return nil +type tcpResetProbe struct { + sink chan<- *probe.Event + objs bpfObjects + links []link.Link + perfReader *perf.Reader } -func (p *TCPResetProbe) Start(ctx context.Context, probeType proto.ProbeType) { - if p.enable { - p.enabledProbes[probeType] = true - return - } - - p.once.Do(func() { - err := loadSync() - if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) - return - } - p.enable = true - }) - - if !p.enable { - // if load failed, do not start process +func (p *tcpResetProbe) Start(_ context.Context) (err error) { + err = 
p.loadAndAttachBPF() + if err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() return } - p.enabledProbes[probeType] = true - reader, err := perf.NewReader(objs.bpfMaps.InspTcpresetEvents, int(unsafe.Sizeof(bpfInspTcpresetEventT{}))) + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspTcpresetEvents, int(unsafe.Sizeof(bpfInspTcpresetEventT{}))) if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", err) + log.Errorf("%s failed create new perf reader, err: %v", probeName, err) + _ = p.cleanup() return } + go p.perfLoop() + return +} + +func (p *tcpResetProbe) perfLoop() { for { - record, err := reader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Infof("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Errorf("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Infof("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } var event bpfInspTcpresetEventT if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Infof("%s failed parsing event, err: %v", probeName, err) continue } - rawevt := proto.RawEvent{ - Netns: 0, - } - /* #define RESET_NOSOCK 1 #define RESET_ACTIVE 2 #define RESET_PROCESS 4 #define RESET_RECEIVE 8 */ + + var eventType probe.EventType + switch event.Type { case 1: - rawevt.EventType = TCPRESET_NOSOCK + eventType = TCPRESET_NOSOCK case 2: - rawevt.EventType = TCPRESET_ACTIVE + eventType = TCPRESET_ACTIVE case 4: - rawevt.EventType = 
TCPRESET_PROCESS + eventType = TCPRESET_PROCESS case 8: - rawevt.EventType = TCPRESET_RECEIVE + eventType = TCPRESET_RECEIVE default: - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "ignore", event) + log.Infof("%s got invalid perf event type %d, data: %s", probeName, event.Type, util.ToJSONString(event)) + continue } - rawevt.Netns = event.SkbMeta.Netns if event.Tuple.L3Proto == syscall.ETH_P_IPV6 { - slog.Ctx(ctx).Debug("ignore event of ipv6 proto") + log.Infof("%s ignore event of ipv6 proto", probeName) continue } + + evt := &probe.Event{ + Timestamp: time.Now().UnixNano(), + Type: eventType, + Labels: probe.LagacyEventLabels(event.SkbMeta.Netns), + } + tuple := fmt.Sprintf("protocol=%s saddr=%s sport=%d daddr=%s dport=%d ", bpfutil.GetProtoStr(event.Tuple.L4Proto), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Saddr))), bits.ReverseBytes16(event.Tuple.Sport), bpfutil.GetAddrStr(event.Tuple.L3Proto, *(*[16]byte)(unsafe.Pointer(&event.Tuple.Daddr))), bits.ReverseBytes16(event.Tuple.Dport)) stateStr := bpfutil.GetSkcStateStr(event.State) - rawevt.EventBody = fmt.Sprintf("%s state:%s ", tuple, stateStr) - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt + evt.Message = fmt.Sprintf("%s state:%s ", tuple, stateStr) + if p.sink != nil { + log.Debugf("%s sink event: %s", probeName, util.ToJSONString(evt)) + p.sink <- evt } } } -func loadSync() error { +func (p *tcpResetProbe) Stop(_ context.Context) error { + return p.cleanup() +} + +func (p *tcpResetProbe) cleanup() error { + if p.perfReader != nil { + p.perfReader.Close() + } + + for _, link := range p.links { + link.Close() + } + p.links = nil + + p.objs.Close() + + return nil + +} + +func (p *tcpResetProbe) loadAndAttachBPF() error { // 准备动作 if err := rlimit.RemoveMemlock(); err != nil { return err @@ -198,27 +174,27 @@ func loadSync() error { } // 获取Loaded的程序/map的fd信息 - if err := loadBpfObjects(&objs, &opts); err != 
nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %v", err) } - progsend, err := link.Kprobe("tcp_v4_send_reset", objs.TraceSendreset, &link.KprobeOptions{}) + progsend, err := link.Kprobe("tcp_v4_send_reset", p.objs.TraceSendreset, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link tcp_v4_send_reset: %s", err.Error()) } - links = append(links, progsend) + p.links = append(p.links, progsend) - progactive, err := link.Kprobe("tcp_send_active_reset", objs.TraceSendactive, &link.KprobeOptions{}) + progactive, err := link.Kprobe("tcp_send_active_reset", p.objs.TraceSendactive, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link tcp_send_active_reset: %s", err.Error()) } - links = append(links, progactive) + p.links = append(p.links, progactive) - kprecv, err := link.Tracepoint("tcp", "tcp_receive_reset", objs.InspRstrx, nil) + kprecv, err := link.Tracepoint("tcp", "tcp_receive_reset", p.objs.InspRstrx, nil) if err != nil { return err } - links = append(links, kprecv) + p.links = append(p.links, kprecv) return nil } diff --git a/pkg/exporter/probe/tracevirtcmdlat/tracevirtcmdlat.go b/pkg/exporter/probe/tracevirtcmdlat/tracevirtcmdlat.go index 7b68d118..446ab962 100644 --- a/pkg/exporter/probe/tracevirtcmdlat/tracevirtcmdlat.go +++ b/pkg/exporter/probe/tracevirtcmdlat/tracevirtcmdlat.go @@ -9,164 +9,199 @@ import ( "sync" "unsafe" - "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" - "github.com/alibaba/kubeskoop/pkg/exporter/proto" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" + "github.com/alibaba/kubeskoop/pkg/exporter/util" + log "github.com/sirupsen/logrus" + "github.com/alibaba/kubeskoop/pkg/exporter/bpfutil" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" - "golang.org/x/exp/slog" ) //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang -cflags $BPF_CFLAGS -type 
insp_virtcmdlat_event_t bpf ../../../../bpf/virtcmdlatency.c -- -I../../../../bpf/headers -D__TARGET_ARCH_x86 const ( - ModuleName = "insp_virtcmdlatency" // nolint - VIRTCMD100MS = "virtcmdlatency100ms" VIRTCMD = "virtcmdlatency" VIRTCMDEXCUTE = "VIRTCMDEXCUTE" - fn = "virtnet_send_command" + fn = "virtnet_send_command" + probeName = "virtcmdLatency" ) var ( - probe = &VirtcmdLatencyProbe{once: sync.Once{}, mtx: sync.Mutex{}, enabledProbes: map[proto.ProbeType]bool{}} - objs = bpfObjects{} - links = []link.Link{} - events = []string{VIRTCMDEXCUTE} - metrics = []string{VIRTCMD100MS, VIRTCMD} - - metricsMap = map[string]map[uint32]uint64{} + metrics = []string{VIRTCMD100MS, VIRTCMD} + _virtcmdLatencyProbe = &virtcmdLatencyProbe{} ) -func GetProbe() *VirtcmdLatencyProbe { - return probe +func init() { + probe.MustRegisterMetricsProbe(probeName, metricsProbeCreator) + probe.MustRegisterEventProbe(probeName, eventProbeCreator) } -func init() { - for m := range metrics { - metricsMap[metrics[m]] = map[uint32]uint64{ - 0: 0, - } +func metricsProbeCreator(_ map[string]interface{}) (probe.MetricsProbe, error) { + p := &metricsProbe{} + batchMetrics := probe.NewLegacyBatchMetrics(probeName, metrics, p.CollectOnce) + + return probe.NewMetricsProbe(probeName, p, batchMetrics), nil +} + +func eventProbeCreator(sink chan<- *probe.Event, _ map[string]interface{}) (probe.EventProbe, error) { + p := &eventProbe{ + sink: sink, } + return probe.NewEventProbe(probeName, p), nil } -type VirtcmdLatencyProbe struct { - enable bool - sub chan<- proto.RawEvent - once sync.Once - mtx sync.Mutex - enabledProbes map[proto.ProbeType]bool +type metricsProbe struct { } -func (p *VirtcmdLatencyProbe) Name() string { - return ModuleName +func (p *metricsProbe) Start(ctx context.Context) error { + return _virtcmdLatencyProbe.start(ctx, probe.ProbeTypeMetrics) } -func (p *VirtcmdLatencyProbe) Ready() bool { - return p.enable +func (p *metricsProbe) Stop(ctx context.Context) error { + return 
_virtcmdLatencyProbe.stop(ctx, probe.ProbeTypeMetrics) } -func (p *VirtcmdLatencyProbe) GetMetricNames() []string { - return metrics +func (p *metricsProbe) CollectOnce() (map[string]map[uint32]uint64, error) { + return _virtcmdLatencyProbe.copyMetricsMap(), nil } -func (p *VirtcmdLatencyProbe) GetEventNames() []string { - return events +type eventProbe struct { + sink chan<- *probe.Event } -func (p *VirtcmdLatencyProbe) Close(probeType proto.ProbeType) error { - if !p.enable { - return nil +func (e *eventProbe) Start(ctx context.Context) error { + err := _virtcmdLatencyProbe.start(ctx, probe.ProbeTypeEvent) + if err != nil { + return err } - if _, ok := p.enabledProbes[probeType]; !ok { - return nil + _virtcmdLatencyProbe.sink = e.sink + return nil +} + +func (e *eventProbe) Stop(ctx context.Context) error { + return _virtcmdLatencyProbe.stop(ctx, probe.ProbeTypeEvent) +} + +type virtcmdLatencyProbe struct { + objs bpfObjects + links []link.Link + sink chan<- *probe.Event + refcnt [probe.ProbeTypeCount]int + lock sync.Mutex + perfReader *perf.Reader + metricsMap map[string]map[uint32]uint64 + metricsLock sync.RWMutex +} + +func (p *virtcmdLatencyProbe) stop(_ context.Context, probeType probe.Type) error { + p.lock.Lock() + defer p.lock.Unlock() + if p.refcnt[probeType] == 0 { + return fmt.Errorf("probe %s never start", probeType) } - if len(p.enabledProbes) > 1 { - delete(p.enabledProbes, probeType) - return nil + + p.refcnt[probeType]-- + if p.totalReferenceCountLocked() == 0 { + return p.cleanup() } + return nil +} - for _, link := range links { +func (p *virtcmdLatencyProbe) cleanup() error { + if p.perfReader != nil { + p.perfReader.Close() + } + + for _, link := range p.links { link.Close() } - links = []link.Link{} - p.enable = false - p.once = sync.Once{} - metricsMap = map[string]map[uint32]uint64{} - delete(p.enabledProbes, probeType) + p.links = nil + + p.objs.Close() + return nil } -func (p *VirtcmdLatencyProbe) Collect(_ context.Context) 
(map[string]map[uint32]uint64, error) { - return metricsMap, nil -} +func (p *virtcmdLatencyProbe) updateMetrics(metrics string) { + p.metricsLock.Lock() + defer p.metricsLock.Unlock() + if _, ok := p.metricsMap[metrics]; !ok { + p.metricsMap[metrics] = make(map[uint32]uint64) + } -func (p *VirtcmdLatencyProbe) Register(receiver chan<- proto.RawEvent) error { - p.mtx.Lock() - defer p.mtx.Unlock() - p.sub = receiver + p.metricsMap[metrics][0]++ +} - return nil +func (p *virtcmdLatencyProbe) copyMetricsMap() map[string]map[uint32]uint64 { + p.metricsLock.RLock() + defer p.metricsLock.RUnlock() + return probe.CopyLegacyMetricsMap(p.metricsMap) } -func (p *VirtcmdLatencyProbe) Start(ctx context.Context, probeType proto.ProbeType) { - // metric and events both start probe - if p.enable { - p.enabledProbes[probeType] = true - return +func (p *virtcmdLatencyProbe) totalReferenceCountLocked() int { + var c int + for _, n := range p.refcnt { + c += n } - p.once.Do(func() { - err := loadSync() + return c +} + +func (p *virtcmdLatencyProbe) start(_ context.Context, probeType probe.Type) (err error) { + p.lock.Lock() + defer p.lock.Unlock() + + p.refcnt[probeType]++ + if p.totalReferenceCountLocked() == 1 { + if err = p.loadAndAttachBPF(); err != nil { + log.Errorf("%s failed load and attach bpf, err: %v", probeName, err) + _ = p.cleanup() + return + } + p.perfReader, err = perf.NewReader(p.objs.bpfMaps.InspVirtcmdlatEvents, int(unsafe.Sizeof(bpfInspVirtcmdlatEventT{}))) if err != nil { - slog.Ctx(ctx).Warn("start", "module", ModuleName, "err", err) + log.Warnf("%s failed create new perf reader, err: %v", probeName, err) return } - p.enable = true - }) - if !p.enable { - // if load failed, do not start process - return + go p.perfLoop() } - p.enabledProbes[probeType] = true - reader, err := perf.NewReader(objs.bpfMaps.InspVirtcmdlatEvents, int(unsafe.Sizeof(bpfInspVirtcmdlatEventT{}))) - if err != nil { - slog.Ctx(ctx).Warn("start new perf reader", "module", ModuleName, "err", 
err) - return - } + return nil +} +func (p *virtcmdLatencyProbe) perfLoop() { for { - record, err := reader.Read() + record, err := p.perfReader.Read() if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - slog.Ctx(ctx).Info("received signal, exiting..", "module", ModuleName) + log.Infof("%s received signal, exiting..", probeName) return } - slog.Ctx(ctx).Info("reading from reader", "module", ModuleName, "err", err) + log.Infof("%s failed reading from reader, err: %v", probeName, err) continue } if record.LostSamples != 0 { - slog.Ctx(ctx).Info("Perf event ring buffer full", "module", ModuleName, "drop samples", record.LostSamples) + log.Infof("%s perf event ring buffer full, drop: %d", probeName, record.LostSamples) continue } var event bpfInspVirtcmdlatEventT if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - slog.Ctx(ctx).Info("parsing event", "module", ModuleName, "err", err) + log.Infof("%s failed parsing event, err: %v", probeName, err) continue } - rawevt := proto.RawEvent{ - Netns: 0, - EventType: VIRTCMDEXCUTE, + evt := &probe.Event{ + Type: VIRTCMDEXCUTE, } if event.Latency > 100000000 { @@ -175,23 +210,15 @@ func (p *VirtcmdLatencyProbe) Start(ctx context.Context, probeType proto.ProbeTy p.updateMetrics(VIRTCMD) } - rawevt.EventBody = fmt.Sprintf("cpu=%d pid=%d latency=%s", event.Cpu, event.Pid, bpfutil.GetHumanTimes(event.Latency)) - if p.sub != nil { - slog.Ctx(ctx).Debug("broadcast event", "module", ModuleName) - p.sub <- rawevt + evt.Message = fmt.Sprintf("cpu=%d pid=%d latency=%s", event.Cpu, event.Pid, bpfutil.GetHumanTimes(event.Latency)) + if p.sink != nil { + log.Debugf("%s sink event %s", probeName, util.ToJSONString(evt)) + p.sink <- evt } } } -func (p *VirtcmdLatencyProbe) updateMetrics(k string) { - p.mtx.Lock() - defer p.mtx.Unlock() - if _, ok := metricsMap[k]; ok { - metricsMap[k][0]++ - } -} - -func loadSync() error { +func (p *virtcmdLatencyProbe) loadAndAttachBPF() error { if err := 
rlimit.RemoveMemlock(); err != nil { return fmt.Errorf("remove limit failed: %s", err.Error()) } @@ -203,21 +230,21 @@ func loadSync() error { } // Load pre-compiled programs and maps into the kernel. - if err := loadBpfObjects(&objs, &opts); err != nil { + if err := loadBpfObjects(&p.objs, &opts); err != nil { return fmt.Errorf("loading objects: %s", err.Error()) } - linkentry, err := link.Kprobe(fn, objs.TraceVirtcmd, &link.KprobeOptions{}) + linkentry, err := link.Kprobe(fn, p.objs.TraceVirtcmd, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link %s: %s", fn, err.Error()) } - links = append(links, linkentry) + p.links = append(p.links, linkentry) - linkexit, err := link.Kretprobe(fn, objs.TraceVirtcmdret, &link.KprobeOptions{}) + linkexit, err := link.Kretprobe(fn, p.objs.TraceVirtcmdret, &link.KprobeOptions{}) if err != nil { return fmt.Errorf("link ret %s: %s", fn, err.Error()) } - links = append(links, linkexit) + p.links = append(p.links, linkexit) return nil } diff --git a/pkg/exporter/proto/inspector.pb.go b/pkg/exporter/proto/inspector.pb.go deleted file mode 100644 index 75540cca..00000000 --- a/pkg/exporter/proto/inspector.pb.go +++ /dev/null @@ -1,643 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.21.12 -// source: inspector.proto - -package proto - -import ( - reflect "reflect" - sync "sync" - - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. 
- _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type WatchRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - Filter *Meta `protobuf:"bytes,2,opt,name=filter,proto3" json:"filter,omitempty"` -} - -func (x *WatchRequest) Reset() { - *x = WatchRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *WatchRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*WatchRequest) ProtoMessage() {} - -func (x *WatchRequest) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use WatchRequest.ProtoReflect.Descriptor instead. 
-func (*WatchRequest) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{0} -} - -func (x *WatchRequest) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *WatchRequest) GetFilter() *Meta { - if x != nil { - return x.Filter - } - return nil -} - -type WatchReply struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - Event *Event `protobuf:"bytes,2,opt,name=event,proto3" json:"event,omitempty"` -} - -func (x *WatchReply) Reset() { - *x = WatchReply{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *WatchReply) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*WatchReply) ProtoMessage() {} - -func (x *WatchReply) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use WatchReply.ProtoReflect.Descriptor instead. 
-func (*WatchReply) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{1} -} - -func (x *WatchReply) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *WatchReply) GetEvent() *Event { - if x != nil { - return x.Event - } - return nil -} - -type QueryMetricRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - Filter *Meta `protobuf:"bytes,2,opt,name=filter,proto3" json:"filter,omitempty"` -} - -func (x *QueryMetricRequest) Reset() { - *x = QueryMetricRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *QueryMetricRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*QueryMetricRequest) ProtoMessage() {} - -func (x *QueryMetricRequest) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use QueryMetricRequest.ProtoReflect.Descriptor instead. 
-func (*QueryMetricRequest) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{2} -} - -func (x *QueryMetricRequest) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *QueryMetricRequest) GetFilter() *Meta { - if x != nil { - return x.Filter - } - return nil -} - -type QueryMetricResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - Metrics []*Metric `protobuf:"bytes,2,rep,name=metrics,proto3" json:"metrics,omitempty"` -} - -func (x *QueryMetricResponse) Reset() { - *x = QueryMetricResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *QueryMetricResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*QueryMetricResponse) ProtoMessage() {} - -func (x *QueryMetricResponse) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[3] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use QueryMetricResponse.ProtoReflect.Descriptor instead. 
-func (*QueryMetricResponse) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{3} -} - -func (x *QueryMetricResponse) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *QueryMetricResponse) GetMetrics() []*Metric { - if x != nil { - return x.Metrics - } - return nil -} - -type Meta struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Node string `protobuf:"bytes,1,opt,name=node,proto3" json:"node,omitempty"` - Pod string `protobuf:"bytes,2,opt,name=pod,proto3" json:"pod,omitempty"` - Namespace string `protobuf:"bytes,3,opt,name=namespace,proto3" json:"namespace,omitempty"` - Netns string `protobuf:"bytes,4,opt,name=netns,proto3" json:"netns,omitempty"` -} - -func (x *Meta) Reset() { - *x = Meta{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *Meta) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Meta) ProtoMessage() {} - -func (x *Meta) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[4] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Meta.ProtoReflect.Descriptor instead. 
-func (*Meta) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{4} -} - -func (x *Meta) GetNode() string { - if x != nil { - return x.Node - } - return "" -} - -func (x *Meta) GetPod() string { - if x != nil { - return x.Pod - } - return "" -} - -func (x *Meta) GetNamespace() string { - if x != nil { - return x.Namespace - } - return "" -} - -func (x *Meta) GetNetns() string { - if x != nil { - return x.Netns - } - return "" -} - -type Metric struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Meta *Meta `protobuf:"bytes,1,opt,name=meta,proto3" json:"meta,omitempty"` - Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` - Value float32 `protobuf:"fixed32,3,opt,name=value,proto3" json:"value,omitempty"` -} - -func (x *Metric) Reset() { - *x = Metric{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *Metric) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Metric) ProtoMessage() {} - -func (x *Metric) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[5] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Metric.ProtoReflect.Descriptor instead. 
-func (*Metric) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{5} -} - -func (x *Metric) GetMeta() *Meta { - if x != nil { - return x.Meta - } - return nil -} - -func (x *Metric) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *Metric) GetValue() float32 { - if x != nil { - return x.Value - } - return 0 -} - -type Event struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Meta *Meta `protobuf:"bytes,1,opt,name=meta,proto3" json:"meta,omitempty"` - Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` - Value string `protobuf:"bytes,3,opt,name=value,proto3" json:"value,omitempty"` -} - -func (x *Event) Reset() { - *x = Event{} - if protoimpl.UnsafeEnabled { - mi := &file_inspector_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *Event) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Event) ProtoMessage() {} - -func (x *Event) ProtoReflect() protoreflect.Message { - mi := &file_inspector_proto_msgTypes[6] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Event.ProtoReflect.Descriptor instead. 
-func (*Event) Descriptor() ([]byte, []int) { - return file_inspector_proto_rawDescGZIP(), []int{6} -} - -func (x *Event) GetMeta() *Meta { - if x != nil { - return x.Meta - } - return nil -} - -func (x *Event) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *Event) GetValue() string { - if x != nil { - return x.Value - } - return "" -} - -var File_inspector_proto protoreflect.FileDescriptor - -var file_inspector_proto_rawDesc = []byte{ - 0x0a, 0x0f, 0x69, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x05, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x47, 0x0a, 0x0c, 0x57, 0x61, 0x74, 0x63, - 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x23, 0x0a, 0x06, - 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x4d, 0x65, 0x74, 0x61, 0x52, 0x06, 0x66, 0x69, 0x6c, 0x74, 0x65, - 0x72, 0x22, 0x44, 0x0a, 0x0a, 0x57, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, - 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, - 0x61, 0x6d, 0x65, 0x12, 0x22, 0x0a, 0x05, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x52, 0x05, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x22, 0x4d, 0x0a, 0x12, 0x51, 0x75, 0x65, 0x72, 0x79, - 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, - 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, - 0x65, 0x12, 0x23, 0x0a, 0x06, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x0b, 0x32, 0x0b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x4d, 0x65, 0x74, 0x61, 0x52, 0x06, - 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x22, 0x52, 0x0a, 
0x13, 0x51, 0x75, 0x65, 0x72, 0x79, 0x4d, - 0x65, 0x74, 0x72, 0x69, 0x63, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x12, 0x0a, - 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, - 0x65, 0x12, 0x27, 0x0a, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x18, 0x02, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, - 0x63, 0x52, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x22, 0x60, 0x0a, 0x04, 0x4d, 0x65, - 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x6f, 0x64, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x03, 0x70, 0x6f, 0x64, 0x12, 0x1c, 0x0a, 0x09, 0x6e, 0x61, 0x6d, 0x65, - 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x6e, 0x61, 0x6d, - 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x6e, 0x65, 0x74, 0x6e, 0x73, 0x18, - 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6e, 0x65, 0x74, 0x6e, 0x73, 0x22, 0x53, 0x0a, 0x06, - 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x12, 0x1f, 0x0a, 0x04, 0x6d, 0x65, 0x74, 0x61, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x4d, 0x65, 0x74, - 0x61, 0x52, 0x04, 0x6d, 0x65, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, - 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x22, 0x52, 0x0a, 0x05, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x1f, 0x0a, 0x04, 0x6d, 0x65, - 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x2e, 0x4d, 0x65, 0x74, 0x61, 0x52, 0x04, 0x6d, 0x65, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, 0x6e, - 0x61, 0x6d, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 
0x12, - 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x32, 0x8d, 0x01, 0x0a, 0x09, 0x69, 0x6e, 0x73, 0x70, 0x65, 0x63, - 0x74, 0x6f, 0x72, 0x12, 0x38, 0x0a, 0x0a, 0x57, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, - 0x74, 0x12, 0x13, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x57, 0x61, 0x74, 0x63, 0x68, 0x52, - 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x11, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x57, - 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x30, 0x01, 0x12, 0x46, 0x0a, - 0x0b, 0x51, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x12, 0x19, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x51, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1a, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, - 0x51, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x52, 0x65, 0x73, 0x70, 0x6f, - 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x09, 0x5a, 0x07, 0x2e, 0x3b, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_inspector_proto_rawDescOnce sync.Once - file_inspector_proto_rawDescData = file_inspector_proto_rawDesc -) - -func file_inspector_proto_rawDescGZIP() []byte { - file_inspector_proto_rawDescOnce.Do(func() { - file_inspector_proto_rawDescData = protoimpl.X.CompressGZIP(file_inspector_proto_rawDescData) - }) - return file_inspector_proto_rawDescData -} - -var file_inspector_proto_msgTypes = make([]protoimpl.MessageInfo, 7) -var file_inspector_proto_goTypes = []interface{}{ - (*WatchRequest)(nil), // 0: proto.WatchRequest - (*WatchReply)(nil), // 1: proto.WatchReply - (*QueryMetricRequest)(nil), // 2: proto.QueryMetricRequest - (*QueryMetricResponse)(nil), // 3: proto.QueryMetricResponse - (*Meta)(nil), // 4: proto.Meta - (*Metric)(nil), // 5: proto.Metric - (*Event)(nil), // 6: proto.Event -} -var file_inspector_proto_depIdxs = []int32{ 
- 4, // 0: proto.WatchRequest.filter:type_name -> proto.Meta - 6, // 1: proto.WatchReply.event:type_name -> proto.Event - 4, // 2: proto.QueryMetricRequest.filter:type_name -> proto.Meta - 5, // 3: proto.QueryMetricResponse.metrics:type_name -> proto.Metric - 4, // 4: proto.Metric.meta:type_name -> proto.Meta - 4, // 5: proto.Event.meta:type_name -> proto.Meta - 0, // 6: proto.inspector.WatchEvent:input_type -> proto.WatchRequest - 2, // 7: proto.inspector.QueryMetric:input_type -> proto.QueryMetricRequest - 1, // 8: proto.inspector.WatchEvent:output_type -> proto.WatchReply - 3, // 9: proto.inspector.QueryMetric:output_type -> proto.QueryMetricResponse - 8, // [8:10] is the sub-list for method output_type - 6, // [6:8] is the sub-list for method input_type - 6, // [6:6] is the sub-list for extension type_name - 6, // [6:6] is the sub-list for extension extendee - 0, // [0:6] is the sub-list for field type_name -} - -func init() { file_inspector_proto_init() } -func file_inspector_proto_init() { - if File_inspector_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_inspector_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*WatchRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*WatchReply); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*QueryMetricRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*QueryMetricResponse); i { - case 0: - 
return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*Meta); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*Metric); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_inspector_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*Event); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_inspector_proto_rawDesc, - NumEnums: 0, - NumMessages: 7, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_inspector_proto_goTypes, - DependencyIndexes: file_inspector_proto_depIdxs, - MessageInfos: file_inspector_proto_msgTypes, - }.Build() - File_inspector_proto = out.File - file_inspector_proto_rawDesc = nil - file_inspector_proto_goTypes = nil - file_inspector_proto_depIdxs = nil -} diff --git a/pkg/exporter/proto/inspector.proto b/pkg/exporter/proto/inspector.proto deleted file mode 100644 index a3068847..00000000 --- a/pkg/exporter/proto/inspector.proto +++ /dev/null @@ -1,49 +0,0 @@ -syntax = "proto3"; - -package proto; - -option go_package = ".;proto"; - -service inspector { - rpc WatchEvent(WatchRequest) returns (stream WatchReply) {} - rpc QueryMetric(QueryMetricRequest) returns (QueryMetricResponse) {} -} - -message WatchRequest { - string name = 1; - Meta filter = 2; -} - -message WatchReply { - string name = 1; - Event event = 2; -} - -message 
QueryMetricRequest { - string name = 1; - Meta filter = 2; -} - -message QueryMetricResponse { - string name = 1; - repeated Metric metrics = 2; -} - -message Meta { - string node = 1; - string pod = 2; - string namespace = 3; - string netns = 4; -} - -message Metric { - Meta meta = 1; - string name = 2; - float value = 3; -} - -message Event { - Meta meta = 1; - string name = 2; - string value = 3; -} diff --git a/pkg/exporter/proto/inspector_grpc.pb.go b/pkg/exporter/proto/inspector_grpc.pb.go deleted file mode 100644 index 92a96b8e..00000000 --- a/pkg/exporter/proto/inspector_grpc.pb.go +++ /dev/null @@ -1,170 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.2.0 -// - protoc v3.5.0 -// source: inspector.proto - -package proto - -import ( - context "context" - - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// InspectorClient is the client API for Inspector service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. 
-type InspectorClient interface { - WatchEvent(ctx context.Context, in *WatchRequest, opts ...grpc.CallOption) (Inspector_WatchEventClient, error) - QueryMetric(ctx context.Context, in *QueryMetricRequest, opts ...grpc.CallOption) (*QueryMetricResponse, error) -} - -type inspectorClient struct { - cc grpc.ClientConnInterface -} - -func NewInspectorClient(cc grpc.ClientConnInterface) InspectorClient { - return &inspectorClient{cc} -} - -func (c *inspectorClient) WatchEvent(ctx context.Context, in *WatchRequest, opts ...grpc.CallOption) (Inspector_WatchEventClient, error) { - stream, err := c.cc.NewStream(ctx, &Inspector_ServiceDesc.Streams[0], "/proto.inspector/WatchEvent", opts...) - if err != nil { - return nil, err - } - x := &inspectorWatchEventClient{stream} - if err := x.ClientStream.SendMsg(in); err != nil { - return nil, err - } - if err := x.ClientStream.CloseSend(); err != nil { - return nil, err - } - return x, nil -} - -type Inspector_WatchEventClient interface { - Recv() (*WatchReply, error) - grpc.ClientStream -} - -type inspectorWatchEventClient struct { - grpc.ClientStream -} - -func (x *inspectorWatchEventClient) Recv() (*WatchReply, error) { - m := new(WatchReply) - if err := x.ClientStream.RecvMsg(m); err != nil { - return nil, err - } - return m, nil -} - -func (c *inspectorClient) QueryMetric(ctx context.Context, in *QueryMetricRequest, opts ...grpc.CallOption) (*QueryMetricResponse, error) { - out := new(QueryMetricResponse) - err := c.cc.Invoke(ctx, "/proto.inspector/QueryMetric", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// InspectorServer is the server API for Inspector service. 
-// All implementations must embed UnimplementedInspectorServer -// for forward compatibility -type InspectorServer interface { - WatchEvent(*WatchRequest, Inspector_WatchEventServer) error - QueryMetric(context.Context, *QueryMetricRequest) (*QueryMetricResponse, error) - mustEmbedUnimplementedInspectorServer() -} - -// UnimplementedInspectorServer must be embedded to have forward compatible implementations. -type UnimplementedInspectorServer struct { -} - -func (UnimplementedInspectorServer) WatchEvent(*WatchRequest, Inspector_WatchEventServer) error { - return status.Errorf(codes.Unimplemented, "method WatchEvent not implemented") -} -func (UnimplementedInspectorServer) QueryMetric(context.Context, *QueryMetricRequest) (*QueryMetricResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method QueryMetric not implemented") -} -func (UnimplementedInspectorServer) mustEmbedUnimplementedInspectorServer() {} - -// UnsafeInspectorServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to InspectorServer will -// result in compilation errors. 
-type UnsafeInspectorServer interface { - mustEmbedUnimplementedInspectorServer() -} - -func RegisterInspectorServer(s grpc.ServiceRegistrar, srv InspectorServer) { - s.RegisterService(&Inspector_ServiceDesc, srv) -} - -func _Inspector_WatchEvent_Handler(srv interface{}, stream grpc.ServerStream) error { - m := new(WatchRequest) - if err := stream.RecvMsg(m); err != nil { - return err - } - return srv.(InspectorServer).WatchEvent(m, &inspectorWatchEventServer{stream}) -} - -type Inspector_WatchEventServer interface { - Send(*WatchReply) error - grpc.ServerStream -} - -type inspectorWatchEventServer struct { - grpc.ServerStream -} - -func (x *inspectorWatchEventServer) Send(m *WatchReply) error { - return x.ServerStream.SendMsg(m) -} - -func _Inspector_QueryMetric_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(QueryMetricRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(InspectorServer).QueryMetric(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/proto.inspector/QueryMetric", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(InspectorServer).QueryMetric(ctx, req.(*QueryMetricRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// Inspector_ServiceDesc is the grpc.ServiceDesc for Inspector service. 
-// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var Inspector_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "proto.inspector", - HandlerType: (*InspectorServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "QueryMetric", - Handler: _Inspector_QueryMetric_Handler, - }, - }, - Streams: []grpc.StreamDesc{ - { - StreamName: "WatchEvent", - Handler: _Inspector_WatchEvent_Handler, - ServerStreams: true, - }, - }, - Metadata: "inspector.proto", -} diff --git a/pkg/exporter/proto/proto.go b/pkg/exporter/proto/proto.go deleted file mode 100644 index 9aba8ca1..00000000 --- a/pkg/exporter/proto/proto.go +++ /dev/null @@ -1,39 +0,0 @@ -package proto - -import ( - "context" -) - -//go:generate protoc --go_out=. ./inspector.proto - -type ProbeType string - -var ( - ProbeTypeMetrics ProbeType = "metrics" - ProbeTypeEvent ProbeType = "event" -) - -type RawEvent struct { - Netns uint32 - EventType string - EventBody string -} - -type Probe interface { - Start(ctx context.Context, probeType ProbeType) - Close(probeType ProbeType) error - Ready() bool - Name() string -} - -type MetricProbe interface { - Probe - GetMetricNames() []string - Collect(ctx context.Context) (map[string]map[uint32]uint64, error) -} - -type EventProbe interface { - Probe - GetEventNames() []string - Register(receiver chan<- RawEvent) error -} diff --git a/pkg/exporter/sink/file.go b/pkg/exporter/sink/file.go new file mode 100644 index 00000000..d8ff80a4 --- /dev/null +++ b/pkg/exporter/sink/file.go @@ -0,0 +1,41 @@ +package sink + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/alibaba/kubeskoop/pkg/exporter/probe" +) + +func NewFileSink(path string) (*FileSink, error) { + file, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return nil, fmt.Errorf("failed open file %s, err: %w", path, err) + } + + return &FileSink{ + file: file, + }, nil +} + +type FileSink struct { + 
file *os.File +} + +func (f *FileSink) Write(event *probe.Event) error { + data, err := json.Marshal(event) + if err != nil { + return fmt.Errorf("failed marshal event, err: %w", err) + } + _, err = f.file.Write(data) + _, _ = f.file.Write([]byte{0x0a}) + + if err != nil { + return fmt.Errorf("failed sink event to file %s, err: %w", f.file.Name(), err) + } + return nil + +} + +var _ Sink = &FileSink{} diff --git a/pkg/exporter/sink/loki.go b/pkg/exporter/sink/loki.go new file mode 100644 index 00000000..97eb5c07 --- /dev/null +++ b/pkg/exporter/sink/loki.go @@ -0,0 +1,75 @@ +package sink + +import ( + "encoding/json" + "fmt" + "net/url" + "strings" + "time" + + log "github.com/sirupsen/logrus" + + "github.com/afiskon/promtail-client/promtail" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" +) + +func NewLokiSink(addr string, node string) (*LokiSink, error) { + url, err := buildURL(addr) + if err != nil { + return nil, fmt.Errorf("failed parse addr, not a valild url, err: %w", err) + } + log.Infof("create loki client with url %s", url) + + labels := `{instance = "%s",job = "kubeskoop"}` + conf := promtail.ClientConfig{ + PushURL: url, + Labels: fmt.Sprintf(labels, node), + BatchWait: 5 * time.Second, + BatchEntriesNumber: 10000, + SendLevel: promtail.DEBUG, + PrintLevel: promtail.DEBUG, + } + client, err := promtail.NewClientProto(conf) + if err != nil { + return nil, fmt.Errorf("failed create loki client, err: %s", err) + } + return &LokiSink{ + client: client, + }, nil +} + +func buildURL(addr string) (string, error) { + if !strings.HasPrefix(addr, "http://") || !strings.HasPrefix(addr, "https://") { + addr = "http://" + addr + } + u, err := url.Parse(addr) + if err != nil { + return "", err + } + + if u.Path == "" { + u.Path = "/api/prom/push" + } + + if u.Port() == "" { + u.Host = fmt.Sprintf("%s:%s", u.Hostname(), "3100") + } + + return u.String(), nil +} + +type LokiSink struct { + client promtail.Client +} + +func (l *LokiSink) Write(event *probe.Event) 
error { + data, err := json.Marshal(event) + if err != nil { + return fmt.Errorf("failed marshal event, err: %w", err) + } + + l.client.Infof(string(data)) + return nil +} + +var _ Sink = &LokiSink{} diff --git a/pkg/exporter/sink/sink.go b/pkg/exporter/sink/sink.go new file mode 100644 index 00000000..d27b1f56 --- /dev/null +++ b/pkg/exporter/sink/sink.go @@ -0,0 +1,35 @@ +package sink + +import ( + "fmt" + + "github.com/alibaba/kubeskoop/pkg/exporter/nettop" + "github.com/alibaba/kubeskoop/pkg/exporter/probe" +) + +const ( + Stderr = "stderr" + File = "file" + Loki = "loki" +) + +type Sink interface { + Write(event *probe.Event) error +} + +func CreateSink(name string, args interface{}) (Sink, error) { + //TODO create with register and reflect + argsMap, _ := args.(map[string]interface{}) + + switch name { + case Stderr: + return NewStderrSink(), nil + case Loki: + addr := argsMap["addr"].(string) + return NewLokiSink(addr, nettop.GetNodeName()) + case File: + path := argsMap["path"].(string) + return NewFileSink(path) + } + return nil, fmt.Errorf("unknown sink type %s", name) +} diff --git a/pkg/exporter/sink/stderr.go b/pkg/exporter/sink/stderr.go new file mode 100644 index 00000000..f1afd22c --- /dev/null +++ b/pkg/exporter/sink/stderr.go @@ -0,0 +1,28 @@ +package sink + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/alibaba/kubeskoop/pkg/exporter/probe" +) + +type StderrSink struct { +} + +func NewStderrSink() *StderrSink { + return &StderrSink{} +} + +func (s StderrSink) Write(event *probe.Event) error { + data, err := json.Marshal(event) + if err != nil { + return fmt.Errorf("failed marshal event, err: %w", err) + } + + fmt.Fprintf(os.Stderr, "event: %s\n", string(data)) + return nil +} + +var _ Sink = &StderrSink{} diff --git a/pkg/exporter/util/util.go b/pkg/exporter/util/util.go new file mode 100644 index 00000000..77a5e5bb --- /dev/null +++ b/pkg/exporter/util/util.go @@ -0,0 +1,16 @@ +package util + +import ( + "encoding/json" + + log 
"github.com/sirupsen/logrus" +) + +func ToJSONString(v interface{}) string { + data, err := json.Marshal(v) + if err != nil { + log.Errorf("error marshal json: %v", err) + return "" + } + return string(data) +}