diff --git a/CHANGELOG.md b/CHANGELOG.md index bd037692..7439021d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,21 +1,37 @@ ## [Unreleased] +## [3.15.0] - 2024-05-09 + +### Fixed + +- CPU usage sampler could not be disabled (#259) +- Example config out-of-date (#260) + +### Added + +- Example Grafana/Prometheus dashboard (#239) +- Off-CPU time added to the scheduler runqueue sampler (#240) +- Filesystem sampler added and tracks the number of open file descriptors (#242) +- Log level can now be set in the config (#246) +- Network interface statistics for tx/rx errors (drop, crc, ...) (#247) +- BPF can be enabled/disabled per-sampler or globally. (#258) + ## [3.14.2] - 2024-04-18 -## Fixed +### Fixed - CPU usage for soft and hard irq was incorrectly reported. (#236) ## [3.14.1] - 2024-04-16 -## Fixed +### Fixed - CPU usage reporting via BPF would report CPU as always idle on some systems. (#233) ## [3.14.0] - 2024-04-03 -## Changed +### Changed - metriken crates updated which changes the msgpack output. (#224) @@ -25,7 +41,7 @@ ## [3.13.0] - 2024-04-01 -## Changed +### Changed - Memory sampler was reporting memory usage stats in KiB, but with bytes for the unit metadata. This change corrects the sampler to report memory usage in @@ -34,62 +50,62 @@ ## [3.12.0] - 2024-03-28 -## Added +### Added - MacOS cpu usage sampling. (#203) - Metric unit annotations are added and exposed as metadata. - Logs version number on startup. (#213) -## Fixed +### Fixed - Incorrect summary stats (percentiles) were reported in version 3.10.2, 3.10.3, and 3.11.0. (#216) ## [3.11.0] - 2024-03-25 -## Changed +### Changed - Refactored the scheduler and syscall BPF samplers to reduce overheads. (#193 #195) -## Added +### Added - BlockIO thoughput and operation metrics using BPF. (#198) - Network throughput and packet metrics using BPF. (#200) -## Fixed +### Fixed - Online CPU detection for CPU usage sampler needed a trimmed string. 
(#194) ## [3.10.3] - 2024-03-20 -## Fixed +### Fixed - Fixes an incorrect calculation of the number of online CPUs in the BPF-based CPU usage sampler. ## [3.10.2] - 2024-03-20 -## Fixed +### Fixed - Fixes a panic in the CPU perf event sampler due to a divide-by-zero. This occurs when there are no active perf event groups. (#185) ## [3.10.1] - 2024-03-20 -## Fixed +### Fixed - Fixes per-CPU idle time accounting in the BPF-based sampler. Starting in release 3.9.0 these metrics incorrectly report no idle time. (#181) ## [3.10.0] - 2024-03-19 -## Added +### Added - Additional system information fields including kernel version, CPU frequency details, network queues, and IRQ affinity. (#100) -## Fixed +### Fixed - Fixes a panic on some systems when perf counter initialization has failed. This bug was introduced in 3.9.0. (#175) @@ -98,7 +114,7 @@ ## [3.9.0] - 2024-03-15 -## Added +### Added - CPU usage metrics are now collected via BPF when available. (#165) - Perf event sampler can now initialize when only some counters are available. @@ -106,12 +122,12 @@ ## [3.8.0] - 2024-03-04 -## Added +### Added - Allows Rezolus to run on MacOS though sampler support is limited. - Provides msgpack exposition format as a more efficient exposition format. -## Fixed +### Fixed - Updates of various direct dependencies. @@ -230,7 +246,8 @@ - Rewritten implementation of Rezolus using libbpf-rs and perf-event2 to provide a more modern approach to BPF and Perf Event instrumentation. 
-[unreleased]: https://github.com/iopsystems/rezolus/compare/v3.14.2...HEAD +[unreleased]: https://github.com/iopsystems/rezolus/compare/v3.15.0...HEAD +[3.15.0]: https://github.com/iopsystems/rezolus/compare/v3.14.2...v3.15.0 [3.14.2]: https://github.com/iopsystems/rezolus/compare/v3.14.1...v3.14.2 [3.14.1]: https://github.com/iopsystems/rezolus/compare/v3.14.0...v3.14.1 [3.14.0]: https://github.com/iopsystems/rezolus/compare/v3.13.0...v3.14.0 diff --git a/Cargo.lock b/Cargo.lock index 3574f852..63b5b678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,47 +77,48 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = 
"anstyle-wincon" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", "windows-sys 0.52.0", @@ -131,9 +132,9 @@ checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" [[package]] name = "async-compression" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07dbbf24db18d609b1462965249abdf49129ccad073ec257da372adc83259c60" +checksum = "4e9eabd7a98fe442131a17c316bd9349c43695e49e730c3c8e12cfb5f4da2693" dependencies = [ "brotli", "flate2", @@ -145,9 +146,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" @@ -193,9 +194,9 @@ dependencies = [ [[package]] name = "brotli" -version = "4.0.0" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "125740193d7fee5cc63ab9e16c2fdc4e07c74ba755cc53b327d6ea029e9fc569" +checksum = "19483b140a7ac7174d34b5a581b406c64f84da5409d3e09cf4fff604f9270e67" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -204,9 +205,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "3.0.0" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65622a320492e09b5e0ac436b14c54ff68199bac392d0e89a6832c4518eea525" +checksum = "e6221fe77a248b9117d431ad93761222e1cf8ff282d9d1d5d9f53d6299a1cf76" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -276,9 +277,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.94" +version = "1.0.97" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" +checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4" [[package]] name = "cfg-if" @@ -354,9 +355,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "core-foundation-sys" @@ -429,9 +430,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" +checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" [[package]] name = "deranged" @@ -452,6 +453,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "either" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" + [[package]] name = "encoding_rs" version = "0.8.34" @@ -500,15 +507,15 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "flate2" -version = "1.0.28" +version = "1.0.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" dependencies = [ "crc32fast", "miniz_oxide", @@ -619,9 +626,9 @@ dependencies = [ [[package]] 
name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "headers" @@ -676,9 +683,9 @@ dependencies = [ [[package]] name = "histogram" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4d3bddd75a32b17e75762f128ffc7a33158b933b6eb27424da9be4a58f30eb9" +checksum = "5bd81cb9a629d0a868f2092937332fca5fd985971d2155bf6fcc225ef8a6be2c" dependencies = [ "serde", "thiserror", @@ -808,6 +815,21 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -893,9 +915,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.153" +version = "0.2.154" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" [[package]] name = "libloading" @@ -935,9 +957,9 @@ checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ 
"autocfg", "scopeguard", @@ -966,9 +988,9 @@ dependencies = [ [[package]] name = "memmap2" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] @@ -1001,7 +1023,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc0bad9aa443621ae4972578da55b7f3c24d930cc1491551a8f989edead4762c" dependencies = [ - "histogram 0.10.0", + "histogram 0.10.1", "metriken-core", "metriken-derive", "once_cell", @@ -1033,12 +1055,12 @@ dependencies = [ [[package]] name = "metriken-exposition" -version = "0.5.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668934cb4bd14f41e8d98a447ee237c76fc19ef0b935d5127026121a096ea86e" +checksum = "0fa6cd31e1dcdad522b4599076c96b3a26973fe32ff36f32000dbcd90e16d460" dependencies = [ "chrono", - "histogram 0.10.0", + "histogram 0.10.1", "metriken 0.6.0", "rmp-serde", "serde", @@ -1125,9 +1147,9 @@ checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -1203,9 +1225,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "ouroboros" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2ba07320d39dfea882faa70554b4bd342a5f273ed59ba7c1c6b4c840492c954" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", @@ 
-1214,22 +1236,23 @@ dependencies = [ [[package]] name = "ouroboros_macro" -version = "0.17.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4c6225c69b4ca778c0aea097321a64c421cf4577b331c61b229267edabb6f8" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ "heck 0.4.1", - "proc-macro-error", + "itertools", "proc-macro2", + "proc-macro2-diagnostics", "quote", "syn 2.0.48", ] [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" dependencies = [ "lock_api", "parking_lot_core", @@ -1237,15 +1260,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -1289,7 +1312,7 @@ dependencies = [ "bitflags 2.4.2", "c-enum 0.1.2", "libc", - "memmap2 0.9.3", + "memmap2 0.9.4", "perf-event-data", "perf-event-open-sys2", ] @@ -1406,36 +1429,25 @@ dependencies = [ ] [[package]] -name = "proc-macro-error" -version = "1.0.4" +name = "proc-macro2" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", + "unicode-ident", ] [[package]] -name = "proc-macro-error-attr" 
-version = "1.0.4" +name = "proc-macro2-diagnostics" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", + "syn 2.0.48", "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56dea16b0a29e94408b9aa5e2940a4eedbd128a1ba20e8f7ae60fd3d465af0e" -dependencies = [ - "unicode-ident", + "yansi", ] [[package]] @@ -1479,11 +1491,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.2", ] [[package]] @@ -1517,12 +1529,12 @@ checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "rezolus" -version = "3.14.2" +version = "3.15.0" dependencies = [ "backtrace", "chrono", "clap", - "histogram 0.10.0", + "histogram 0.10.1", "humantime", "lazy_static", "libbpf-cargo", @@ -1530,7 +1542,7 @@ dependencies = [ "libbpf-sys", "libc", "linkme", - "memmap2 0.5.10", + "memmap2 0.9.4", "metriken 0.6.0", "metriken-exposition", "num_cpus", @@ -1565,9 +1577,9 @@ dependencies = [ [[package]] name = "rmp" -version = "0.8.12" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9860a6cc38ed1da53456442089b4dfa35e7cedaa326df63017af88385e6b20" +checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" dependencies = [ "byteorder", "num-traits", @@ -1576,9 +1588,9 @@ dependencies = [ [[package]] name = "rmp-serde" -version = "1.1.2" +version = "1.3.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bffea85eea980d8a74453e5d02a8d93028f3c34725de143085a844ebe953258a" +checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db" dependencies = [ "byteorder", "rmp", @@ -1593,9 +1605,9 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ "bitflags 2.4.2", "errno 0.3.8", @@ -1665,18 +1677,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.197" +version = "1.0.200" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.200" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "856f046b9400cee3c8c94ed572ecdb752444c24528c035cd35882aad6f492bcb" dependencies = [ "proc-macro2", "quote", @@ -1685,9 +1697,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "itoa", "ryu", @@ -1739,9 +1751,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -1769,9 +1781,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -1856,8 +1868,9 @@ dependencies = [ [[package]] name = "systeminfo" -version = "3.14.2" +version = "3.15.0" dependencies = [ + "log", "serde", "serde_json", "walkdir", @@ -1877,18 +1890,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" dependencies = [ "proc-macro2", "quote", @@ -1985,28 +1998,27 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", ] [[package]] name = "toml" -version = "0.7.8" +version = "0.8.12" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd79e69d3b627db300ff956027cc6c3798cef26d22526befdfcd12feeb6d2257" +checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.19.15", + "toml_edit 0.22.12", ] [[package]] @@ -2025,10 +2037,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ "indexmap", - "serde", - "serde_spanned", "toml_datetime", - "winnow", + "winnow 0.5.40", ] [[package]] @@ -2039,7 +2049,20 @@ checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" dependencies = [ "indexmap", "toml_datetime", - "winnow", + "winnow 0.5.40", +] + +[[package]] +name = "toml_edit" +version = "0.22.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3328d4f68a705b2a4498da1d580585d39a6510f98318a2cec3018a7ec61ddef" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow 0.6.6", ] [[package]] @@ -2307,11 +2330,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "winapi 0.3.9", + "windows-sys 0.52.0", ] [[package]] @@ -2470,6 +2493,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c976aaaa0e1f90dbb21e9587cdaf1d9679a1cde8875c0d6bd83ab96a208352" +dependencies = [ + "memchr", +] + [[package]] name = "wrapcenum-derive" version = "0.4.1" @@ -2482,20 +2514,26 @@ dependencies = [ "syn 2.0.48", ] 
+[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "087eca3c1eaf8c47b94d02790dd086cd594b912d2043d4de4bfdd466b3befb7c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "6f4b6c273f496d8fd4eaf18853e6b448760225dc030ff2c485a786859aea6393" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index c580ad38..06bbc374 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,35 +7,35 @@ edition = "2021" description = "High resolution systems performance telemetry agent" [workspace.package] -version = "3.14.2" +version = "3.15.0" license = "MIT OR Apache-2.0" [dependencies] systeminfo = { workspace = true } -backtrace = "0.3.69" -clap = "4.3.24" +backtrace = "0.3.71" +chrono = { version = "0.4.38", features = ["serde"] } +clap = "4.5.4" +histogram = { version = "0.10.0", features = ["serde"] } humantime = "2.1.0" lazy_static = "1.4.0" -libc = "0.2.147" -linkme = "0.3.15" +libc = "0.2.153" +linkme = "0.3.25" metriken = "0.6.0" -metriken-exposition = { version = "0.5.0", features = ["serde", "msgpack"] } -memmap2 = "0.5.10" +metriken-exposition = { version = "0.6.1", features = ["serde", "msgpack"] } +memmap2 = "0.9.4" num_cpus = "1.16.0" once_cell = "1.18.0" -ouroboros = "0.17.2" +ouroboros = "0.18.3" ringlog = "0.6.0" -serde = { version = "1.0.185", features = ["derive"] } +serde = { version = "1.0.198", features = ["derive"] } +serde_repr = "0.1.19" sysconf = "0.3.4" 
-syscall-numbers = "3.1.0" -tokio = { version = "1.32.0", features = ["full"] } -toml = "0.7.6" -walkdir = "2.3.3" -warp = { version = "0.3.6", features = ["compression"] } -serde_repr = "0.1.18" -histogram = { version = "0.10.0", features = ["serde"] } -chrono = { version = "0.4.33", features = ["serde"] } +syscall-numbers = "3.1.1" +tokio = { version = "1.37.0", features = ["full"] } +toml = "0.8.12" +walkdir = "2.5.0" +warp = { version = "0.3.7", features = ["compression"] } [target.'cfg(target_os = "linux")'.dependencies] # libbpf-sys version `1.4.0+v1.4.0` fails to compile on Amazon Linux 2 kernel 5.15 diff --git a/README.md b/README.md index 846a7d3c..64bef072 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,18 @@ Rezolus provides valuable data about systems performance and can be used to root cause production performance issues, capture better data in test environments, and provide signals for optimization efforts. +### Configuration + +Rezolus uses a TOML configuration. See `config.toml` in this project for an +example config file. + +### Dashboard + +If you are running Prometheus and Grafana for collecting and visualizing +metrics, the `dashboard.json` file is an example Grafana dashboard that +demonstrates some ways to use the collected data. This can help you get started +on your own dashboards. + ## Getting Help Join our [Discord server][discord] to ask questions and have discussions. diff --git a/build.rs b/build.rs index 291539d2..bf78e78e 100644 --- a/build.rs +++ b/build.rs @@ -10,7 +10,7 @@ mod bpf { // `SOURCES` lists all BPF programs and the sampler that contains them. // Each entry `(sampler, program)` maps to a unique path in the `samplers` // directory. 
- const SOURCES: &'static [(&str, &str)] = &[ + const SOURCES: &[(&str, &str)] = &[ ("block_io", "latency"), ("cpu", "usage"), ("network", "traffic"), diff --git a/config.toml b/config.toml index bddf4426..74df7a15 100644 --- a/config.toml +++ b/config.toml @@ -7,6 +7,10 @@ listen = "0.0.0.0:4242" # prometheus endpoint. compression = false +[log] +# Controls the log level: "error", "warn", "info", "debug", "trace" +level = "info" + [prometheus] # Controls whether the full distribution for each histogram is exposed via the # prometheus endpoint (`/metrics`). This adds a considerable number of time @@ -48,6 +52,10 @@ histogram_grouping_power = 4 # collection for that sampler. Setting the default to false requires that # individual sampler configs are used to opt-in to collection. enabled = true +# Controls whether BPF sampler will be used. When a metric can be collected +# without BPF, that sampler will be used instead. Otherwise, the sampler will +# effectively be disabled. +bpf = true # The collection interval for counter and gauge based metrics. Shorter intervals # allow for more accurately capturing bursts in the related percentile metrics. 
interval = "10ms" @@ -69,9 +77,11 @@ enabled = true [samplers.cpu_perf] enabled = true -# Instruments CPU usage by state by reading /proc/stat -[samplers.cpu_proc_stat] +# Instruments CPU usage by state with BPF or by reading /proc/stat on linux +# On macos host_processor_info() is used +[samplers.cpu_usage] enabled = true +bpf = true # Produces various nVIDIA specific GPU metrics using NVML [samplers.gpu_nvidia] @@ -85,6 +95,14 @@ enabled = true [samplers.memory_vmstat] enabled = true +# Produces network interface statistics from /sys/class/net for TX/RX errors +[samplers.network_interfaces] +enabled = true + +# Produces network traffic statistics using BPF +[samplers.network_traffic] +enabled = true + # Sample resource utilization for Rezolus itself [samplers.rezolus_rusage] enabled = true @@ -122,11 +140,8 @@ enabled = true [samplers.tcp_retransmit] enabled = true -# TCP sampler that reads from /proc/snmp -[samplers.tcp_snmp] -enabled = true - -# BPF sampler that probes TCP send and receive paths to instrument tx/rx size -# distribution, bytes, and packets. 
+# Samples TCP traffic using either a BPF sampler or /proc/net/snmp to provide +# metrics for TX/RX bytes and packets [samplers.tcp_traffic] enabled = true +bpf = true diff --git a/crates/systeminfo/Cargo.toml b/crates/systeminfo/Cargo.toml index 3877eeb4..a15f1c6d 100644 --- a/crates/systeminfo/Cargo.toml +++ b/crates/systeminfo/Cargo.toml @@ -11,6 +11,7 @@ cli = [ ] [dependencies] +log = "0.4.21" serde = { version = "1.0.188", features = ["derive"] } serde_json = { version = "1.0.107", optional = true } walkdir = "2.4.0" diff --git a/crates/systeminfo/src/hwinfo/cpufreq.rs b/crates/systeminfo/src/hwinfo/cpufreq.rs index c30ae737..11c36013 100644 --- a/crates/systeminfo/src/hwinfo/cpufreq.rs +++ b/crates/systeminfo/src/hwinfo/cpufreq.rs @@ -43,7 +43,7 @@ impl Cpufreq { let cpuinfo_cur_freq: Option<usize> = read_usize(format!( "/sys/devices/system/cpu/cpu{cpu}/cpufreq/cpuinfo_cur_freq" )) - .map_or(None, |v| Some(v)); + .ok(); let cpuinfo_transition_latency = read_usize(format!( "/sys/devices/system/cpu/cpu{cpu}/cpufreq/cpuinfo_transition_latency" ))?; diff --git a/crates/systeminfo/src/hwinfo/net.rs b/crates/systeminfo/src/hwinfo/net.rs index 3812b5d8..6dfa6fc6 100644 --- a/crates/systeminfo/src/hwinfo/net.rs +++ b/crates/systeminfo/src/hwinfo/net.rs @@ -50,6 +50,8 @@ pub struct RxQueue { fn get_interface(name: &OsStr) -> Result<Option<Interface>> { let name = name.to_str().ok_or_else(Error::invalid_interface_name)?; + debug!("discovering network interface info for: {name}"); + // skip any that aren't "up" let operstate = read_string(format!("/sys/class/net/{name}/operstate"))?; if operstate != "up" { @@ -91,35 +93,33 @@ fn get_interface(name: &OsStr) -> Result<Option<Interface>> { let entry = entry.unwrap(); if entry.file_type().is_dir() { if let Some(name) = entry.file_name().to_str() { - if name.starts_with("tx-") { - if let Ok(id) = u32::from_str_radix(&name[3..], 10) { - let xps_cpus = read_hexbitmap(entry.path().join("xps_cpus")); - let xps_rxqs = read_hexbitmap(entry.path().join("xps_rxqs")); - let 
xps = xps_cpus.len() > 0 || xps_rxqs.len() > 0; - queues.tx.push(TxQueue { - id, - xps, - xps_cpus, - xps_rxqs, - }); - } - } else if name.starts_with("rx-") { - if let Ok(id) = u32::from_str_radix(&name[3..], 10) { - let rps_cpus = read_hexbitmap(entry.path().join("rps_cpus")); - let rps_flow_cnt = read_usize(entry.path().join("rps_flow_cnt")).unwrap(); - let rps = rps_cpus.len() != 0; - queues.rx.push(RxQueue { - id, - rps, - rps_cpus, - rps_flow_cnt, - }); - } + if let Some(Ok(id)) = name.strip_prefix("tx-").map(|v| v.parse::<u32>()) { + let xps_cpus = read_hexbitmap(entry.path().join("xps_cpus")); + let xps_rxqs = read_hexbitmap(entry.path().join("xps_rxqs")); + let xps = !xps_cpus.is_empty() || !xps_rxqs.is_empty(); + queues.tx.push(TxQueue { + id, + xps, + xps_cpus, + xps_rxqs, + }); + } else if let Some(Ok(id)) = name.strip_prefix("rx-").map(|v| v.parse::<u32>()) { + let rps_cpus = read_hexbitmap(entry.path().join("rps_cpus")); + let rps_flow_cnt = read_usize(entry.path().join("rps_flow_cnt")).unwrap(); + let rps = !rps_cpus.is_empty(); + queues.rx.push(RxQueue { + id, + rps, + rps_cpus, + rps_flow_cnt, + }); } } } } + debug!("completed discovery for network interface: {name}"); + Ok(Some(Interface { name: name.to_string(), carrier, diff --git a/crates/systeminfo/src/lib.rs b/crates/systeminfo/src/lib.rs index 47b29615..c7e40bfd 100644 --- a/crates/systeminfo/src/lib.rs +++ b/crates/systeminfo/src/lib.rs @@ -4,6 +4,9 @@ #[macro_use] extern crate serde; +#[macro_use] +extern crate log; + mod error; pub mod hwinfo; diff --git a/dashboard.json b/dashboard.json new file mode 100644 index 00000000..6383bacc --- /dev/null +++ b/dashboard.json @@ -0,0 +1,5093 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.4.2" + }, + 
{ + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 50, + "panels": [], + "title": "Key Indicators", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total CPU utilization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 51, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (irate(cpu_usage{job=~\"$host\", state!=\"idle\"}[$__rate_interval])) / on(job) sum by(job) (irate(cpu_usage{job=~\"$host\", state=~\".+\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}} - correct", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "% CPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total memory occupancy", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 52, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "1 - (memory_available{job=~\"$host\"} / memory_total{job=~\"$host\"})", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "% Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "CPU utilization in kernel-mode", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 53, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (irate(cpu_usage{job=~\"$host\", state=\"system\"}[$__rate_interval])) / 
on(job) sum by(job) (irate(cpu_usage{job=~\"$host\", state=~\".+\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "% CPU System", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "CPU utilization in user-mode", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 54, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (irate(cpu_usage{job=~\"$host\", state=\"user\"}[$__rate_interval])) / on(job) sum by(job) (irate(cpu_usage{job=~\"$host\", state=~\".+\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": 
false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "% CPU User", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Nvidia GPU Utilization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 56, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_utilization{job=~\"$host\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + 
"id": 40, + "panels": [], + "title": "Rezolus", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of cores Rezolus is using, executing in system-mode", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 34, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(rezolus_cpu_usage_system{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])/1000000000", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "System Cores", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of cores Rezolus is 
using, in user-mode", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 35, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(rezolus_cpu_usage_user{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])/1000000000", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "User Cores", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 49, + "panels": [], + "title": "Block IO", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 60, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(blockio_bytes_total{job=~\"$host\",op=\"read\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}} - {{op}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Read Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 61, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(blockio_bytes_total{job=~\"$host\",op=\"write\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Write Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" 
+ } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 62, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(blockio_operations_total{job=~\"$host\",op=\"read\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}} - {{op}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Read IOPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 
63, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(blockio_operations_total{job=~\"$host\",op=\"write\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}} - {{op}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Write IOPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Distribution of approximate block IO sizes (for all block IOs)", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 47, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "bytes" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "irate(blockio_size_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + 
"fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "BlockIO Size", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Distribution of block IO latencies", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 48, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "irate(blockio_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "BlockIO Latency", + "type": "heatmap" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 41, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "id": 42, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_utilization{job=~\"$host\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "id": 44, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_memory_utilization{job=~\"$host\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 68 + }, + "id": 45, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "gpu_pcie_throughput{job=~\"$host\",direction=\"receive\"} / on(job, id) gpu_pcie_bandwidth{job=~\"$host\"}", + "hide": false, + "instant": false, + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "B" + } + ], + "title": "PCIe RX Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + 
}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 68 + }, + "id": 46, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "gpu_pcie_throughput{job=~\"$host\",direction=\"transmit\"} / on(job, id) gpu_pcie_bandwidth{job=~\"$host\"}", + "hide": false, + "instant": false, + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "B" + } + ], + "title": "PCIe TX Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 76 + }, + "id": 37, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_power_usage{job=~\"$host\"} / 1000", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 76 + }, + "id": 39, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_temperature{job=~\"$host\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + 
"useBackend": false + } + ], + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 43, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_clock{job=~\"$host\", type=\"memory\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Clock", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 84 + }, + "id": 38, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_clock{job=~\"$host\", type=\"compute\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{job}} - {{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Compute Clock", + "type": "timeseries" + } + ], + "title": "GPU", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 64, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": 
"line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 65, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(network_transmit_packets{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Transmit Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 61 + }, + "id": 66, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(network_receive_packets{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Receive Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 69 + }, + "id": 67, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(network_transmit_bytes{job=~\"$host\",percentile!~\".+\"}[$__rate_interval]) * 8", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Transmit Bandwidth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 69 + }, + "id": 68, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + 
}, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(network_receive_bytes{job=~\"$host\",percentile!~\".+\"}[$__rate_interval]) * 8", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Receive Bandwidth", + "type": "timeseries" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 33, + "title": "TCP", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 24, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_transmit_packets{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Transmit Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 25, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_receive_packets{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + 
"useBackend": false + } + ], + "title": "Receive Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 26, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_transmit_bytes{job=~\"$host\",percentile!~\".+\"}[$__rate_interval]) * 8", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Transmit Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": 
false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 27, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_receive_bytes{job=~\"$host\",percentile!~\".+\"}[$__rate_interval]) * 8", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Receive Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 32, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_transmit_retransmit{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Retransmit Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The delay from a packet being ready until it is read by the userspace application", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 69, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + 
"filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(tcp_packet_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "TCP Packet Latency", + "type": "heatmap" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 102 + }, + "id": 23, + "title": "Scheduler", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Distribution of the time tasks spent runnable, but waiting in the runqueue", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 103 + }, + "id": 21, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": 
"10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(scheduler_runqueue_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Runqueue Latency", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Distribution of the amount of time that tasks spent running on-cpu", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 103 + }, + "id": 22, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(scheduler_running_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + 
"useBackend": false + } + ], + "title": "Running", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 111 + }, + "id": 14, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 112 + }, + "id": 7, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "irate(cpu_usage_total{job=~\"$host\", state=\"system\"}[$__rate_interval]) / 1000000000", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "System", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" 
+ }, + "description": "Per-core CPU usage in system time, visualized as a heatmap", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 112 + }, + "id": 9, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "cellValues": { + "unit": "percentunit" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "max": 1, + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(id) (irate(cpu_usage{job=~\"$host\", state=\"system\"}[$__rate_interval])) / sum by(id) (irate(cpu_usage{job=~\"$host\", state=~\".+\"}[$__rate_interval]))", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "System", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { 
+ "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 120 + }, + "id": 8, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "irate(cpu_usage_total{job=~\"$host\", state=\"user\"}[$__rate_interval]) / 1000000000", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "User", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 120 + }, + "id": 10, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "cellValues": { + "unit": "percentunit" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "max": 1, + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" 
+ }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg by(id) (irate(cpu_usage{job=~\"$host\", state=\"user\"}[$__rate_interval])) / sum by(id) (irate(cpu_usage{job=~\"$host\", state=~\".+\"}[$__rate_interval]))", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "User", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 128 + }, + "id": 59, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(cpu_ipus{job=~\"$host\", id=~\".+\"})", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "IPUS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 128 + }, + "id": 20, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "cellValues": { + "unit": "short" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_ipus{job=~\"$host\", id=~\".+\"}", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "IPUS", + "type": 
"heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 136 + }, + "id": 57, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(cpu_ipkc{job=~\"$host\", id=~\".+\"})", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "IPKC", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Instructions per kilocycle per-CPU, visualized as a heatmap", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + 
}, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 136 + }, + "id": 19, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "cellValues": { + "unit": "short" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "cpu_ipkc{job=~\"$host\", id=~\".+\"}", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "IPKC", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 144 + }, + "id": 18, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "cellValues": { + "unit": "rotmhz" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + 
"rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "cpu_frequency{job=~\"$host\", id=~\".+\"}", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{id}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Frequency", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 152 + }, + "id": 13, + "panels": [], + "title": "Syscall", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 153 + }, + "id": 11, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_total{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Syscall Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 153 + }, + "id": 15, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_total_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Syscall Latency", + "type": "heatmap" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 161 + }, + "id": 2, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_read{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Read Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 
12, + "y": 161 + }, + "id": 12, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_read_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Read Latency", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 169 + }, + "id": 3, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_write{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Write Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 169 + }, + "id": 5, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": 
"irate(syscall_write_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Write Latency", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 177 + }, + "id": 1, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_lock{job=~\"$host\",percentile!~\".+\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Lock 
Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 177 + }, + "id": 6, + "interval": "1s", + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Viridis", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ns" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "irate(syscall_lock_latency_distribution_bucket{job=~\"$host\"}[$__rate_interval])", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Lock Latency", + "type": "heatmap" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(job)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "host", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + 
"sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Rezolus", + "uid": "a1872700-ffda-496b-a2da-ed5a9be7eb02", + "version": 5, + "weekStart": "" +} \ No newline at end of file diff --git a/src/common/bpf/counters.rs b/src/common/bpf/counters.rs index f2f19a73..c5f71388 100644 --- a/src/common/bpf/counters.rs +++ b/src/common/bpf/counters.rs @@ -122,7 +122,7 @@ impl<'a> Counters<'a> { let start = (cpu * self.cachelines * CACHELINE_SIZE) + (idx * std::mem::size_of::()); let value = u64::from_ne_bytes([ - self.mmap[start + 0], + self.mmap[start], self.mmap[start + 1], self.mmap[start + 2], self.mmap[start + 3], diff --git a/src/common/bpf/distribution.rs b/src/common/bpf/distribution.rs index 92bfd472..2ea17f7a 100644 --- a/src/common/bpf/distribution.rs +++ b/src/common/bpf/distribution.rs @@ -24,67 +24,72 @@ pub struct Distribution<'a> { _map: &'a libbpf_rs::Map, mmap: memmap2::MmapMut, buffer: Vec, + buckets: usize, + aligned: bool, histogram: &'static RwLockHistogram, } impl<'a> Distribution<'a> { pub fn new(map: &'a libbpf_rs::Map, histogram: &'static RwLockHistogram) -> Self { + let buckets = histogram.config().total_buckets(); + + let mmap_len = histogram_pages(buckets) * PAGE_SIZE; + let fd = map.as_fd().as_raw_fd(); let file = unsafe { std::fs::File::from_raw_fd(fd as _) }; let mmap = unsafe { memmap2::MmapOptions::new() - .len(HISTOGRAM_PAGES * PAGE_SIZE) + .len(mmap_len) .map_mut(&file) .expect("failed to mmap() bpf distribution") }; + // check the alignment + let (_prefix, data, _suffix) = unsafe { mmap.align_to::() }; + let expected_len = mmap_len / std::mem::size_of::(); + + let aligned = if data.len() == expected_len { + true + } else { + warn!("mmap region misaligned or did not have expected number of values {} != {expected_len}", data.len()); + false + }; + Self { _map: map, mmap, buffer: Vec::new(), + buckets, + aligned, histogram, } } pub fn refresh(&mut self) 
{ - // If the mmap'd region is properly aligned we can more efficiently - // update the histogram. Otherwise, fall-back to the old strategy. - - let (_prefix, buckets, _suffix) = unsafe { self.mmap.align_to::() }; - - let expected_len = HISTOGRAM_PAGES * PAGE_SIZE / 8; - - if buckets.len() == expected_len { - let _ = self.histogram.update_from(&buckets[0..HISTOGRAM_BUCKETS]); + if self.aligned { + let (_prefix, buckets, _suffix) = unsafe { self.mmap.align_to::() }; + let _ = self.histogram.update_from(&buckets[0..self.buckets]); } else { - warn!("mmap region misaligned or did not have expected number of values {} != {expected_len}", buckets.len()); - - self.buffer.resize(HISTOGRAM_BUCKETS, 0); + self.buffer.resize(self.buckets, 0); for (idx, bucket) in self.buffer.iter_mut().enumerate() { let start = idx * std::mem::size_of::(); - if start + 7 >= self.mmap.len() { + if start + std::mem::size_of::() > self.mmap.len() { break; } - let val = u64::from_ne_bytes([ - self.mmap[start + 0], - self.mmap[start + 1], - self.mmap[start + 2], - self.mmap[start + 3], - self.mmap[start + 4], - self.mmap[start + 5], - self.mmap[start + 6], - self.mmap[start + 7], - ]); + let val = u64::from_ne_bytes( + <[u8; std::mem::size_of::()]>::try_from( + &self.mmap[start..(start + std::mem::size_of::())], + ) + .unwrap(), + ); *bucket = val; } - let _ = self - .histogram - .update_from(&self.buffer[0..HISTOGRAM_BUCKETS]); + let _ = self.histogram.update_from(&self.buffer[0..self.buckets]); } } } diff --git a/src/common/bpf/histogram.h b/src/common/bpf/histogram.h index 3aaafcb5..eba4bb42 100644 --- a/src/common/bpf/histogram.h +++ b/src/common/bpf/histogram.h @@ -1,16 +1,16 @@ // Helpers for converting values to histogram indices. +#define HISTOGRAM_BUCKETS_POW_4 976 +#define HISTOGRAM_BUCKETS_POW_5 1920 +#define HISTOGRAM_BUCKETS_POW_6 3776 +#define HISTOGRAM_BUCKETS_POW_7 7424 + // Function to count leading zeros, since we cannot use the builtin CLZ from // within BPF. 
But since we also can't loop, this is implemented as a binary -// search with a maximum of 6 branches. +// search with a maximum of 6 branches. static u32 clz(u64 value) { u32 count = 0; - // quick return if value is 0 - if (!value) { - return 64; - } - // binary search to find number of leading zeros if (value & 0xFFFFFFFF00000000) { if (value & 0xFFFF000000000000) { @@ -203,25 +203,23 @@ static u32 clz(u64 value) { } else { return 63; } + + return 64; } // base-2 histogram indexing function that is compatible with Rust `histogram` -// crate for m = 0, r = 8, n = 64 this gives us the ability to store counts for -// values from 1 -> u64::MAX and uses 7424 buckets per histogram, which occupies -// 58KB of space in kernelspace (where we use 64bit counters) -static u32 value_to_index(u64 value) { - if (value == 0) { - return 0; - } - - u64 h = 63 - clz(value); - // h < r - if (h < 8) { +// crate. +// +// See the indexing logic here: +// https://github.com/pelikan-io/rustcommon/blob/main/histogram/src/config.rs +static u32 value_to_index(u64 value, u8 grouping_power) { + if (value < (2 << grouping_power)) { return value; } else { - // d = h - r + 1 - u64 d = h - 7; - // ((d + 1) * G + ((value - (1 << h)) >> (m + d))) - return ((d + 1) * 128) + ((value - (1 << h)) >> d); + u64 power = 63 - clz(value); + u64 bin = power - grouping_power + 1; + u64 offset = (value - (1 << power)) >> (power - grouping_power); + + return (bin * (1 << grouping_power) + offset); } } diff --git a/src/common/bpf/mod.rs b/src/common/bpf/mod.rs index dd36cde0..a9ad6e31 100644 --- a/src/common/bpf/mod.rs +++ b/src/common/bpf/mod.rs @@ -1,4 +1,5 @@ use super::*; +use core::time::Duration; use metriken::DynBoxedMetric; use metriken::RwLockHistogram; use ouroboros::*; @@ -18,21 +19,96 @@ pub use counters::PercpuCounters; const PAGE_SIZE: usize = 4096; const CACHELINE_SIZE: usize = 64; -/// The maximum number of CPUs supported. 
Used to make `CounterSet`s behave like +/// The maximum number of CPUs supported. Allows a normal bpf map to behave like /// per-CPU counters by packing counters into cacheline sized chunks such that /// no CPUs will share cacheline sized segments of the counter map. static MAX_CPUS: usize = 1024; -/// The number of histogram buckets based on a rustcommon histogram with the -/// parameters `grouping_power = 7` `max_value_power = 64`. -/// -/// NOTE: this *must* remain in-sync across both C and Rust components of BPF -/// code. -const HISTOGRAM_BUCKETS: usize = 7424; -const HISTOGRAM_PAGES: usize = 15; +/// Returns the next nearest whole number of pages that fits a histogram with +/// the provided number of buckets. +pub fn histogram_pages(buckets: usize) -> usize { + ((buckets * std::mem::size_of::()) + PAGE_SIZE - 1) / PAGE_SIZE +} + +/// A type that builds the userspace components of a BPF program including +/// registering counters, distributions, and initializing a map with values. +pub struct BpfBuilder { + bpf: _Bpf, +} + +impl BpfBuilder { + pub fn new(skel: T) -> Self { + Self { + bpf: _Bpf::from_skel(skel), + } + } + + pub fn build(self) -> Bpf { + Bpf { bpf: self.bpf } + } + + pub fn counters(mut self, name: &str, counters: Vec) -> Self { + self.bpf.add_counters(name, counters); + self + } + + pub fn percpu_counters( + mut self, + name: &str, + counters: Vec, + percpu: Arc, + ) -> Self { + self.bpf.add_counters_with_percpu(name, counters, percpu); + self + } + + pub fn distribution(mut self, name: &str, histogram: &'static RwLockHistogram) -> Self { + self.bpf.add_distribution(name, histogram); + self + } + + pub fn map(self, name: &str, values: &[u64]) -> Self { + let fd = self.bpf.map(name).as_fd().as_raw_fd(); + let file = unsafe { std::fs::File::from_raw_fd(fd as _) }; + let mut mmap = unsafe { + memmap2::MmapOptions::new() + .len(std::mem::size_of_val(values)) + .map_mut(&file) + .expect("failed to mmap() bpf map") + }; + + for (index, bytes) in mmap + 
.chunks_exact_mut(std::mem::size_of::()) + .enumerate() + { + let value = bytes.as_mut_ptr() as *mut u64; + unsafe { + *value = values[index]; + } + } + + let _ = mmap.flush(); + + self + } +} -#[self_referencing] pub struct Bpf { + bpf: _Bpf, +} + +impl Bpf { + pub fn refresh_counters(&mut self, elapsed: Duration) { + self.bpf.refresh_counters(elapsed.as_secs_f64()) + } + + pub fn refresh_distributions(&mut self) { + self.bpf.refresh_distributions() + } +} + +#[self_referencing] +struct _Bpf { skel: T, #[borrows(skel)] #[covariant] @@ -46,9 +122,9 @@ pub trait GetMap { fn map(&self, name: &str) -> &libbpf_rs::Map; } -impl Bpf { +impl _Bpf { pub fn from_skel(skel: T) -> Self { - BpfBuilder { + _BpfBuilder { skel, counters_builder: |_| Vec::new(), distributions_builder: |_| Vec::new(), diff --git a/src/common/interval.rs b/src/common/interval.rs new file mode 100644 index 00000000..475702ca --- /dev/null +++ b/src/common/interval.rs @@ -0,0 +1,43 @@ +use crate::Instant; +use core::time::Duration; + +pub struct Interval { + prev: Instant, + next: Instant, + period: Duration, +} + +impl Interval { + pub fn new(start: Instant, period: Duration) -> Self { + Self { + prev: start, + next: start, + period, + } + } + + /// Try to tick the interval forward to the provided instant. Returns + /// `Ok(elapsed)` if the interval has fired and `Err(())` otherwise. 
+ pub fn try_wait(&mut self, now: Instant) -> Result { + if now < self.next { + return Err(()); + } + + let next = self.next + self.period; + + // check if we have fallen behind + if next > now { + self.next = next; + } else { + // if we fell behind, don't sample again until the interval has + // elapsed + self.next = now + self.period; + } + + let elapsed = now - self.prev; + + self.prev = now; + + Ok(elapsed) + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs index 9437b1b4..20035e0f 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -4,10 +4,13 @@ pub mod bpf; pub mod classic; pub mod units; +mod interval; mod nop; use metriken::AtomicHistogram; use metriken::LazyCounter; + +pub use interval::Interval; pub use nop::Nop; pub const HISTOGRAM_GROUPING_POWER: u8 = 7; @@ -49,138 +52,6 @@ impl Counter { } } -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized -/// `metriken::Counter` given an identifier, name, and optional description. -macro_rules! counter { - ($ident:ident, $name:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: Lazy = metriken::Lazy::new(|| { - metriken::Counter::new() - }); - }; - ($ident:ident, $name:tt, $description:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: Lazy = metriken::Lazy::new(|| { - metriken::Counter::new() - }); - }; -} - -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized -/// `metriken::Gauge` given an identifier, name, and optional description. -macro_rules! 
gauge { - ($ident:ident, $name:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: Lazy = metriken::Lazy::new(|| { - metriken::Gauge::new() - }); - }; - ($ident:ident, $name:tt, $description:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: Lazy = metriken::Lazy::new(|| { - metriken::Gauge::new() - }); - }; -} - -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized -/// `metriken::AtomicHistogram` given an identifier, name, and optional -/// description. -/// -/// The histogram configuration used here can record counts for all 64bit -/// integer values with a maximum error of 0.78%. -macro_rules! histogram { - ($ident:ident, $name:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: metriken::AtomicHistogram = metriken::AtomicHistogram::new($crate::common::HISTOGRAM_GROUPING_POWER, 64); - }; - ($ident:ident, $name:tt, $description:tt) => { - #[metriken::metric( - name = $name, - description = $description, - crate = metriken - )] - pub static $ident: metriken::AtomicHistogram = metriken::AtomicHistogram::new($crate::common::HISTOGRAM_GROUPING_POWER, 64); - }; -} - -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized -/// `metriken::RwLockHistogram` given an identifier, name, and optional -/// description. -/// -/// The histogram configuration used here can record counts for all 64bit -/// integer values with a maximum error of 0.78%. -macro_rules! 
bpfhistogram { - ($ident:ident, $name:tt) => { - #[metriken::metric( - name = $name, - crate = metriken - )] - pub static $ident: metriken::RwLockHistogram = metriken::RwLockHistogram::new($crate::common::HISTOGRAM_GROUPING_POWER, 64); - }; - ($ident:ident, $name:tt, $description:tt) => { - #[metriken::metric( - name = $name, - description = $description, - crate = metriken - )] - pub static $ident: metriken::RwLockHistogram = metriken::RwLockHistogram::new($crate::common::HISTOGRAM_GROUPING_POWER, 64); - }; -} - -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized counter with a -/// histogram which will track secondly rates for the same counter. -macro_rules! counter_with_histogram { - ($counter:ident, $histogram:ident, $name:tt) => { - self::counter!($counter, $name); - self::histogram!($histogram, $name); - }; - ($counter:ident, $histogram:ident, $name:tt, $description:tt) => { - self::counter!($counter, $name, $description); - self::histogram!($histogram, $name, $description); - } -} - -#[macro_export] -#[rustfmt::skip] -/// A convenience macro for constructing a lazily initialized gauge with a -/// histogram which will track instantaneous readings for the same gauge. -macro_rules! 
gauge_with_histogram { - ($gauge:ident, $histogram:ident, $name:tt) => { - self::gauge!($gauge, $name); - self::histogram!($histogram, $name); - }; - ($gauge:ident, $histogram:ident, $name:tt, $description:tt) => { - self::gauge!($gauge, $name, $description); - self::histogram!($histogram, $name, $description); - } -} - #[macro_export] #[rustfmt::skip] /// A convenience macro for defining a top-level sampler which will contain diff --git a/src/config/mod.rs b/src/config/mod.rs index a12274c2..de0ea29f 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1,4 +1,5 @@ use crate::Duration; +use ringlog::Level; use serde::Deserialize; use std::collections::HashMap; use std::net::SocketAddr; @@ -6,11 +7,15 @@ use std::net::ToSocketAddrs; use std::path::Path; #[derive(Deserialize)] -// #[serde(deny_unknown_fields)] pub struct Config { general: General, + #[serde(default)] + log: Log, + #[serde(default)] prometheus: Prometheus, + #[serde(default)] defaults: SamplerConfig, + #[serde(default)] samplers: HashMap, } @@ -41,6 +46,10 @@ impl Config { Ok(config) } + pub fn log(&self) -> &Log { + &self.log + } + pub fn defaults(&self) -> &SamplerConfig { &self.defaults } @@ -57,16 +66,6 @@ impl Config { &self.prometheus } - #[cfg(feature = "bpf")] - pub fn bpf(&self) -> bool { - true - } - - #[cfg(not(feature = "bpf"))] - pub fn bpf(&self) -> bool { - false - } - pub fn enabled(&self, name: &str) -> bool { self.samplers .get(name) @@ -74,6 +73,13 @@ impl Config { .unwrap_or(self.defaults.enabled()) } + pub fn bpf(&self, name: &str) -> bool { + self.samplers + .get(name) + .map(|c| c.bpf()) + .unwrap_or(self.defaults.bpf()) + } + pub fn interval(&self, name: &str) -> Duration { self.samplers .get(name) @@ -118,6 +124,41 @@ impl General { } } +#[derive(Deserialize)] +pub struct Log { + #[serde(with = "LevelDef")] + #[serde(default = "log_level")] + level: Level, +} + +impl Default for Log { + fn default() -> Self { + Self { level: log_level() } + } +} + +impl Log { + pub fn 
level(&self) -> Level { + self.level + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "lowercase")] +#[serde(remote = "Level")] +#[serde(deny_unknown_fields)] +enum LevelDef { + Error, + Warn, + Info, + Debug, + Trace, +} + +fn log_level() -> Level { + Level::Info +} + #[derive(Deserialize)] pub struct Prometheus { #[serde(default = "disabled")] @@ -126,6 +167,15 @@ pub struct Prometheus { histogram_grouping_power: u8, } +impl Default for Prometheus { + fn default() -> Self { + Self { + histograms: false, + histogram_grouping_power: 4, + } + } +} + impl Prometheus { pub fn check(&self) { if !(2..=(crate::common::HISTOGRAM_GROUPING_POWER)).contains(&self.histogram_grouping_power) @@ -171,12 +221,25 @@ pub fn distribution_interval() -> String { pub struct SamplerConfig { #[serde(default = "enabled")] enabled: bool, + #[serde(default = "enabled")] + bpf: bool, #[serde(default = "interval")] interval: String, #[serde(default = "distribution_interval")] distribution_interval: String, } +impl Default for SamplerConfig { + fn default() -> Self { + Self { + enabled: true, + bpf: true, + interval: interval(), + distribution_interval: distribution_interval(), + } + } +} + impl SamplerConfig { pub fn check(&self, name: &str) { if let Err(e) = self.interval.parse::() { @@ -205,6 +268,10 @@ impl SamplerConfig { self.enabled } + pub fn bpf(&self) -> bool { + self.bpf + } + pub fn interval(&self) -> Duration { Duration::from_nanos( self.interval diff --git a/src/main.rs b/src/main.rs index 4892e336..5efc1861 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,9 @@ use backtrace::Backtrace; use clap::{Arg, Command}; use linkme::distributed_slice; +use metriken::metric; use metriken::Lazy; +use metriken::LazyCounter; use metriken_exposition::Histogram; use ringlog::*; use std::collections::HashMap; @@ -92,7 +94,11 @@ pub static PERCENTILES: &[(&str, f64)] = &[ #[distributed_slice] pub static SAMPLERS: [fn(config: &Config) -> Box] = [..]; 
-counter!(RUNTIME_SAMPLE_LOOP, "runtime/sample/loop"); +#[metric( + name = "runtime/sample/loop", + description = "The total number sampler loops executed" +)] +pub static RUNTIME_SAMPLE_LOOP: LazyCounter = LazyCounter::new(metriken::Counter::default); fn main() { // custom panic hook to terminate whole process after unwinding @@ -131,7 +137,7 @@ fn main() { // configure debug log let debug_output: Box = Box::new(Stderr::new()); - let level = Level::Info; + let level = config.log().level(); let debug_log = if level <= Level::Info { LogBuilder::new().format(ringlog::default_format) diff --git a/src/samplers/block_io/linux/latency/mod.bpf.c b/src/samplers/block_io/linux/latency/mod.bpf.c index e84e7d3d..c9281a9c 100644 --- a/src/samplers/block_io/linux/latency/mod.bpf.c +++ b/src/samplers/block_io/linux/latency/mod.bpf.c @@ -11,6 +11,7 @@ extern int LINUX_KERNEL_VERSION __kconfig; #define COUNTER_GROUP_WIDTH 8 +#define HISTOGRAM_POWER 7 #define MAX_CPUS 1024 #define REQ_OP_BITS 8 @@ -106,7 +107,7 @@ static int handle_block_rq_complete(struct request *rq, int error, unsigned int } } - idx = value_to_index(nr_bytes); + idx = value_to_index(nr_bytes, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&size, &idx); if (cnt) { @@ -121,7 +122,7 @@ static int handle_block_rq_complete(struct request *rq, int error, unsigned int if (*tsp <= ts) { delta = ts - *tsp; - idx = value_to_index(delta); + idx = value_to_index(delta, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&latency, &idx); if (cnt) { diff --git a/src/samplers/block_io/linux/latency/mod.rs b/src/samplers/block_io/linux/latency/mod.rs index afb9ec57..07dd1e67 100644 --- a/src/samplers/block_io/linux/latency/mod.rs +++ b/src/samplers/block_io/linux/latency/mod.rs @@ -13,15 +13,12 @@ mod bpf { static NAME: &str = "block_io_latency"; -use metriken::MetricBuilder; - use bpf::*; use crate::common::bpf::*; use crate::common::*; use crate::samplers::block_io::stats::*; use crate::samplers::block_io::*; -use 
crate::samplers::hwinfo::hardware_info; impl GetMap for ModSkel<'_> { fn map(&self, name: &str) -> &libbpf_rs::Map { @@ -50,7 +47,7 @@ pub struct Biolat { impl Biolat { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -61,18 +58,22 @@ impl Biolat { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} block_rq_insert() BPF instruction count: {}", + skel.progs().block_rq_insert().insn_cnt() + ); + debug!( + "{NAME} block_rq_issue() BPF instruction count: {}", + skel.progs().block_rq_issue().insn_cnt() + ); + debug!( + "{NAME} block_rq_complete() BPF instruction count: {}", + skel.progs().block_rq_complete().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - - let mut distributions = vec![("latency", &BLOCKIO_LATENCY), ("size", &BLOCKIO_SIZE)]; - - let cpus = match hardware_info() { - Ok(hwinfo) => hwinfo.get_cpus(), - Err(_) => return Err(()), - }; - let counters = vec![ Counter::new(&BLOCKIO_READ_OPS, Some(&BLOCKIO_READ_OPS_HISTOGRAM)), Counter::new(&BLOCKIO_WRITE_OPS, Some(&BLOCKIO_WRITE_OPS_HISTOGRAM)), @@ -87,11 +88,11 @@ impl Biolat { ), ]; - bpf.add_counters("counters", counters); - - for (name, histogram) in distributions.drain(..) 
{ - bpf.add_distribution(name, histogram); - } + let bpf = BpfBuilder::new(skel) + .counters("counters", counters) + .distribution("latency", &BLOCKIO_LATENCY) + .distribution("size", &BLOCKIO_SIZE) + .build(); Ok(Self { bpf, @@ -109,7 +110,7 @@ impl Biolat { return; } - let elapsed = (now - self.counter_prev).as_secs_f64(); + let elapsed = now - self.counter_prev; self.bpf.refresh_counters(elapsed); diff --git a/src/samplers/block_io/stats.rs b/src/samplers/block_io/stats.rs index d90a4ae6..d00a48e1 100644 --- a/src/samplers/block_io/stats.rs +++ b/src/samplers/block_io/stats.rs @@ -1,5 +1,4 @@ use crate::common::HISTOGRAM_GROUPING_POWER; -use crate::*; use metriken::*; #[metric( diff --git a/src/samplers/cpu/linux/perf/mod.rs b/src/samplers/cpu/linux/perf/mod.rs index 18c3df86..ef998e7e 100644 --- a/src/samplers/cpu/linux/perf/mod.rs +++ b/src/samplers/cpu/linux/perf/mod.rs @@ -1,3 +1,4 @@ +use crate::common::Interval; use crate::common::Nop; use crate::samplers::cpu::*; use metriken::{DynBoxedMetric, MetricBuilder}; @@ -30,9 +31,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "cpu_perf"; pub struct Perf { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, groups: Vec, counters: Vec>>, gauges: Vec>>, @@ -45,8 +44,6 @@ impl Perf { return Err(()); } - let now = Instant::now(); - let cpus = match hardware_info() { Ok(hwinfo) => hwinfo.get_cpus(), Err(_) => return Err(()), @@ -101,19 +98,17 @@ impl Perf { }; } - if groups.len() == 0 { + if groups.is_empty() { error!("Failed to create the perf group on any CPU"); return Err(()); } - return Ok(Self { - prev: now, - next: now, - interval: config.interval(NAME), + Ok(Self { + interval: Interval::new(Instant::now(), config.interval(NAME)), groups, counters, gauges, - }); + }) } } @@ -121,7 +116,7 @@ impl Sampler for Perf { fn sample(&mut self) { let now = Instant::now(); - if now < self.next { + if self.interval.try_wait(now).is_err() { return; } @@ -178,20 +173,5 @@ impl Sampler for 
Perf { CPU_FREQUENCY_AVERAGE.set((avg_running_frequency / nr_active_groups) as i64); CPU_CORES.set(nr_active_groups as _); } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } } diff --git a/src/samplers/cpu/linux/perf/perf_group.rs b/src/samplers/cpu/linux/perf/perf_group.rs index 868b05f6..ecd7d924 100644 --- a/src/samplers/cpu/linux/perf/perf_group.rs +++ b/src/samplers/cpu/linux/perf/perf_group.rs @@ -149,7 +149,7 @@ impl PerfGroup { continue; } - if let Ok(c) = counter.as_follower(cpu, &mut group[leader_id].as_mut().unwrap()) { + if let Ok(c) = counter.as_follower(cpu, group[leader_id].as_mut().unwrap()) { group.resize_with(*counter as usize + 1, || None); group[*counter as usize] = Some(c); } @@ -173,12 +173,12 @@ impl PerfGroup { .map(|inner| GroupData { inner }) .ok(); - return Ok(Self { + Ok(Self { cpu, leader_id, group, prev, - }); + }) } pub fn get_metrics(&mut self) -> Result { @@ -226,30 +226,30 @@ impl PerfGroup { let mut mperf = None; if let Some(Some(c)) = &self.group.get(Counter::Cycles as usize) { - cycles = current.delta(prev, &c); + cycles = current.delta(prev, c); } if let Some(Some(c)) = &self.group.get(Counter::Instructions as usize) { - instructions = current.delta(prev, &c); + instructions = current.delta(prev, c); } if let Some(Some(c)) = &self.group.get(Counter::Tsc as usize) { - tsc = current.delta(prev, &c); + tsc = current.delta(prev, c); } if let Some(Some(c)) = &self.group.get(Counter::Aperf as usize) { - aperf = current.delta(prev, &c); + aperf = current.delta(prev, c); } if let Some(Some(c)) = &self.group.get(Counter::Mperf as usize) { - mperf = current.delta(prev, &c); + mperf = current.delta(prev, c); } - let ipkc = if 
instructions.is_some() && cycles.is_some() { - if cycles.unwrap() == 0 { + let ipkc = if let (Some(instructions), Some(cycles)) = (instructions, cycles) { + if cycles == 0 { None } else { - Some(instructions.unwrap() * 1000 / cycles.unwrap()) + Some(instructions * 1000 / cycles) } } else { None diff --git a/src/samplers/cpu/linux/stats.rs b/src/samplers/cpu/linux/stats.rs index 5afc9569..3ea4585c 100644 --- a/src/samplers/cpu/linux/stats.rs +++ b/src/samplers/cpu/linux/stats.rs @@ -1,6 +1,5 @@ use super::super::stats::*; use crate::common::HISTOGRAM_GROUPING_POWER; -use crate::*; use metriken::{metric, AtomicHistogram, Counter, Gauge, LazyCounter, LazyGauge}; #[metric( diff --git a/src/samplers/cpu/linux/usage/bpf.rs b/src/samplers/cpu/linux/usage/bpf.rs index 80dbe66d..d30adceb 100644 --- a/src/samplers/cpu/linux/usage/bpf.rs +++ b/src/samplers/cpu/linux/usage/bpf.rs @@ -1,5 +1,6 @@ const ONLINE_CORES_REFRESH: Duration = Duration::from_secs(1); +#[allow(clippy::module_inception)] mod bpf { include!(concat!(env!("OUT_DIR"), "/cpu_usage.bpf.rs")); } @@ -34,21 +35,21 @@ pub struct CpuUsage { percpu_counters: Arc, sum_prev: u64, percpu_sum_prev: Vec, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, online_cores: usize, online_cores_file: std::fs::File, - online_cores_interval: Duration, - online_cores_next: Instant, + online_cores_interval: Interval, } const IDLE_CPUTIME_INDEX: usize = 5; impl CpuUsage { pub fn new(config: &Config) -> Result { + // check if sampler should be enabled + if !(config.enabled(NAME) && config.bpf(NAME)) { + return Err(()); + } + let builder = ModSkelBuilder::default(); let mut skel = builder .open() @@ -56,11 +57,14 @@ impl CpuUsage { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} cpuacct_account_field() BPF 
instruction count: {}", + skel.progs().cpuacct_account_field_kprobe().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - let mut online_cores_file = std::fs::File::open("/sys/devices/system/cpu/online") .map_err(|e| error!("couldn't open: {e}"))?; @@ -118,13 +122,9 @@ impl CpuUsage { let percpu_counters = Arc::new(percpu_counters); - bpf.add_counters_with_percpu("counters", counters, percpu_counters.clone()); - - let mut distributions = vec![]; - - for (name, histogram) in distributions.drain(..) { - bpf.add_distribution(name, histogram); - } + let bpf = BpfBuilder::new(skel) + .percpu_counters("counters", counters, percpu_counters.clone()) + .build(); let now = Instant::now(); @@ -133,29 +133,19 @@ impl CpuUsage { percpu_counters, sum_prev: 0, percpu_sum_prev: vec![0; cpus.len()], - counter_interval: config.interval(NAME), - counter_next: now, - counter_prev: now, - distribution_interval: config.distribution_interval(NAME), - distribution_next: now, - distribution_prev: now, + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), online_cores, online_cores_file, - online_cores_interval: ONLINE_CORES_REFRESH, - online_cores_next: now + ONLINE_CORES_REFRESH, + online_cores_interval: Interval::new(now, ONLINE_CORES_REFRESH), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - // get the amount of time since we last sampled - let elapsed = now - self.counter_prev; + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; // refresh the counters from the kernel-space counters - self.bpf.refresh_counters(elapsed.as_secs_f64()); + self.bpf.refresh_counters(elapsed); // get the new sum of all the counters let sum_now: u64 = sum(); @@ -185,59 +175,24 @@ impl CpuUsage { // update the 
previous sums self.sum_prev += busy_delta + idle_delta; - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } - + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } - pub fn update_online_cores(&mut self, now: Instant) { - if now < self.online_cores_next { - return; - } + pub fn update_online_cores(&mut self, now: Instant) -> Result<(), ()> { + self.online_cores_interval.try_wait(now)?; if let Ok(v) = online_cores(&mut self.online_cores_file) { self.online_cores = v; } - // determine when to update next - let next = self.online_cores_next + self.online_cores_interval; - - // check that next update time is in the future - if next > now { - self.online_cores_next = next; - } else { - self.online_cores_next = now + self.online_cores_interval; - } + Ok(()) } } @@ -260,8 +215,7 @@ fn sum() -> u64 { } fn online_cores(file: &mut std::fs::File) -> Result { - let _ = file - .rewind() + file.rewind() .map_err(|e| error!("failed to seek to start of file: {e}"))?; let mut count = 0; @@ -307,8 +261,8 @@ fn online_cores(file: &mut std::fs::File) -> Result { impl Sampler for CpuUsage { fn sample(&mut self) { let now = Instant::now(); - 
self.update_online_cores(now); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.update_online_cores(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git a/src/samplers/cpu/linux/usage/proc_stat.rs b/src/samplers/cpu/linux/usage/proc_stat.rs index 89b9c8a9..1f97dd7b 100644 --- a/src/samplers/cpu/linux/usage/proc_stat.rs +++ b/src/samplers/cpu/linux/usage/proc_stat.rs @@ -1,4 +1,4 @@ -use crate::common::Counter; +use crate::common::{Counter, Interval}; use crate::samplers::cpu::*; use crate::samplers::hwinfo::hardware_info; use metriken::DynBoxedMetric; @@ -9,9 +9,7 @@ use std::io::{Read, Seek}; use super::NAME; pub struct ProcStat { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, nanos_per_tick: u64, file: File, counters_total: Vec, @@ -25,8 +23,6 @@ impl ProcStat { return Err(()); } - let now = Instant::now(); - let cpus = match hardware_info() { Ok(hwinfo) => hwinfo.get_cpus(), Err(_) => return Err(()), @@ -90,41 +86,16 @@ impl ProcStat { counters_total, counters_percpu, nanos_per_tick, - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), }) } } impl Sampler for ProcStat { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { - return; - } - - let elapsed = (now - self.prev).as_secs_f64(); - - if self.sample_proc_stat(elapsed).is_err() { - return; + if let Ok(elapsed) = self.interval.try_wait(Instant::now()) { + let _ = self.sample_proc_stat(elapsed.as_secs_f64()); } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } } diff --git 
a/src/samplers/cpu/macos/usage/mod.rs b/src/samplers/cpu/macos/usage/mod.rs index 0d07908e..bfb6792b 100644 --- a/src/samplers/cpu/macos/usage/mod.rs +++ b/src/samplers/cpu/macos/usage/mod.rs @@ -1,7 +1,7 @@ -use crate::common::{Counter, Nop}; +use crate::common::{Counter, Interval, Nop}; use crate::samplers::cpu::*; use crate::{distributed_slice, Config, Sampler}; -use core::time::Duration; + use libc::mach_port_t; use metriken::{DynBoxedMetric, MetricBuilder}; use ringlog::error; @@ -19,9 +19,7 @@ fn init(config: &Config) -> Box { } struct CpuUsage { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, port: mach_port_t, nanos_per_tick: u64, counters_total: Vec, @@ -35,8 +33,6 @@ impl CpuUsage { return Err(()); } - let now = Instant::now(); - let cpus = num_cpus::get(); let counters_total = vec![ @@ -73,9 +69,7 @@ impl CpuUsage { let nanos_per_tick = 1_000_000_000 / (sc_clk_tck as u64); Ok(Self { - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), port: unsafe { libc::mach_host_self() }, nanos_per_tick, counters_total, @@ -86,32 +80,11 @@ impl CpuUsage { impl Sampler for CpuUsage { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { - return; - } - - let elapsed = (now - self.prev).as_secs_f64(); - - if unsafe { self.sample_processor_info(elapsed) }.is_err() { - return; - } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; + if let Ok(elapsed) = self.interval.try_wait(Instant::now()) { + unsafe { + let _ = self.sample_processor_info(elapsed.as_secs_f64()); + } } - - // mark when we last sampled - self.prev = now; } } diff --git a/src/samplers/filesystem/linux/descriptors/mod.rs 
b/src/samplers/filesystem/linux/descriptors/mod.rs new file mode 100644 index 00000000..bf328dc3 --- /dev/null +++ b/src/samplers/filesystem/linux/descriptors/mod.rs @@ -0,0 +1,18 @@ +use crate::common::Nop; +use crate::samplers::filesystem::FILESYSTEM_SAMPLERS; +use crate::{distributed_slice, Config, Sampler}; + +const NAME: &str = "filesystem_descriptors"; + +mod procfs; + +use procfs::*; + +#[distributed_slice(FILESYSTEM_SAMPLERS)] +fn init(config: &Config) -> Box { + if let Ok(s) = Procfs::new(config) { + Box::new(s) + } else { + Box::new(Nop {}) + } +} diff --git a/src/samplers/filesystem/linux/descriptors/procfs.rs b/src/samplers/filesystem/linux/descriptors/procfs.rs new file mode 100644 index 00000000..4cb315aa --- /dev/null +++ b/src/samplers/filesystem/linux/descriptors/procfs.rs @@ -0,0 +1,63 @@ +use crate::common::Interval; +use crate::samplers::filesystem::*; +use crate::{error, Config, Instant, Sampler}; +use std::fs::File; +use std::io::{Read, Seek}; + +use super::NAME; + +pub struct Procfs { + interval: Interval, + file: File, +} + +impl Procfs { + pub fn new(config: &Config) -> Result { + // check if sampler should be enabled + if !config.enabled(NAME) { + return Err(()); + } + + let file = std::fs::File::open("/proc/sys/fs/file-nr").map_err(|e| { + error!("failed to open: {e}"); + })?; + + Ok(Self { + file, + interval: Interval::new(Instant::now(), config.interval(NAME)), + }) + } +} + +impl Sampler for Procfs { + fn sample(&mut self) { + if self.interval.try_wait(Instant::now()).is_err() { + return; + } + + let _ = self.sample_procfs(); + } +} + +impl Procfs { + fn sample_procfs(&mut self) -> Result<(), std::io::Error> { + self.file.rewind()?; + + let mut data = String::new(); + self.file.read_to_string(&mut data)?; + + let mut lines = data.lines(); + + if let Some(line) = lines.next() { + let parts: Vec<&str> = line.split_whitespace().collect(); + + if parts.len() == 3 { + if let Ok(open) = parts[0].parse::() { + 
FILESYSTEM_DESCRIPTORS_OPEN.set(open); + } + } + } + + Ok(()) + } +} diff --git a/src/samplers/filesystem/linux/mod.rs b/src/samplers/filesystem/linux/mod.rs new file mode 100644 index 00000000..10aaf486 --- /dev/null +++ b/src/samplers/filesystem/linux/mod.rs @@ -0,0 +1,3 @@ +mod descriptors; + +pub mod stats; diff --git a/src/samplers/filesystem/linux/stats.rs b/src/samplers/filesystem/linux/stats.rs new file mode 100644 index 00000000..0b003bdc --- /dev/null +++ b/src/samplers/filesystem/linux/stats.rs @@ -0,0 +1,7 @@ +use metriken::{metric, Gauge, LazyGauge}; + +#[metric( + name = "filesystem/descriptors/open", + description = "The number of file descriptors currently allocated" +)] +pub static FILESYSTEM_DESCRIPTORS_OPEN: LazyGauge = LazyGauge::new(Gauge::default); diff --git a/src/samplers/filesystem/mod.rs b/src/samplers/filesystem/mod.rs new file mode 100644 index 00000000..ed447ade --- /dev/null +++ b/src/samplers/filesystem/mod.rs @@ -0,0 +1,9 @@ +use crate::*; + +sampler!(Filesystem, "filesystem", FILESYSTEM_SAMPLERS); + +#[cfg(target_os = "linux")] +mod linux; + +#[cfg(target_os = "linux")] +pub use linux::stats::*; diff --git a/src/samplers/gpu/nvidia/mod.rs b/src/samplers/gpu/nvidia/mod.rs index 9291cb33..eec31a1a 100644 --- a/src/samplers/gpu/nvidia/mod.rs +++ b/src/samplers/gpu/nvidia/mod.rs @@ -1,5 +1,6 @@ use super::stats::*; use super::*; +use crate::common::Interval; use crate::common::Nop; use metriken::{DynBoxedMetric, MetricBuilder}; use nvml_wrapper::enum_wrappers::device::*; @@ -21,9 +22,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "gpu_nvidia"; pub struct Nvidia { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, nvml: Nvml, pergpu_metrics: Vec, } @@ -69,7 +68,6 @@ impl Nvidia { return Err(()); } - let now = Instant::now(); let nvml = Nvml::init().map_err(|e| { error!("error initializing: {e}"); })?; @@ -151,9 +149,7 @@ impl Nvidia { Ok(Self { nvml, - prev: now, - next: now, - interval: 
config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), pergpu_metrics, }) } @@ -163,28 +159,13 @@ impl Sampler for Nvidia { fn sample(&mut self) { let now = Instant::now(); - if now < self.next { + if self.interval.try_wait(now).is_err() { return; } if let Err(e) = self.sample_nvml(now) { error!("error sampling: {e}"); } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } } diff --git a/src/samplers/memory/linux/proc_meminfo/mod.rs b/src/samplers/memory/linux/proc_meminfo/mod.rs index e56284be..6c6bf5e4 100644 --- a/src/samplers/memory/linux/proc_meminfo/mod.rs +++ b/src/samplers/memory/linux/proc_meminfo/mod.rs @@ -1,8 +1,8 @@ use crate::common::units::KIBIBYTES; -use crate::common::Nop; +use crate::common::{Interval, Nop}; use crate::samplers::memory::stats::*; use crate::samplers::memory::*; -use metriken::{Gauge, Lazy}; +use metriken::Gauge; use std::collections::HashMap; use std::fs::File; use std::io::{Read, Seek}; @@ -19,9 +19,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "memory_meminfo"; pub struct ProcMeminfo { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, file: File, gauges: HashMap<&'static str, &'static Gauge>, } @@ -34,8 +32,6 @@ impl ProcMeminfo { return Err(()); } - let now = Instant::now(); - let gauges: HashMap<&str, &Gauge> = HashMap::from([ ("MemTotal:", &*MEMORY_TOTAL), ("MemFree:", &*MEMORY_FREE), @@ -47,9 +43,7 @@ impl ProcMeminfo { Ok(Self { file: File::open("/proc/meminfo").expect("file not found"), gauges, - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), }) } } @@ -58,28 +52,11 @@ 
impl Sampler for ProcMeminfo { fn sample(&mut self) { let now = Instant::now(); - if now < self.next { - return; - } - - if self.sample_proc_meminfo(now).is_err() { + if self.interval.try_wait(now).is_err() { return; } - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; + let _ = self.sample_proc_meminfo(now).is_err(); } } diff --git a/src/samplers/memory/linux/proc_vmstat/mod.rs b/src/samplers/memory/linux/proc_vmstat/mod.rs index a280e63d..285c9e3d 100644 --- a/src/samplers/memory/linux/proc_vmstat/mod.rs +++ b/src/samplers/memory/linux/proc_vmstat/mod.rs @@ -1,4 +1,4 @@ -use crate::common::{Counter, Nop}; +use crate::common::{Counter, Interval, Nop}; use crate::samplers::memory::stats::*; use crate::samplers::memory::*; use std::collections::HashMap; @@ -17,9 +17,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "memory_vmstat"; pub struct ProcVmstat { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, counters: HashMap<&'static str, Counter>, file: File, } @@ -32,8 +30,6 @@ impl ProcVmstat { return Err(()); } - let now = Instant::now(); - let counters = HashMap::from([ ("numa_hit", Counter::new(&MEMORY_NUMA_HIT, None)), ("numa_miss", Counter::new(&MEMORY_NUMA_MISS, None)), @@ -49,41 +45,16 @@ impl ProcVmstat { Ok(Self { file: File::open("/proc/vmstat").expect("file not found"), counters, - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), }) } } impl Sampler for ProcVmstat { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { - return; - } - - let elapsed = (now - self.prev).as_secs_f64(); - - if 
self.sample_proc_vmstat(elapsed).is_err() { - return; + if let Ok(elapsed) = self.interval.try_wait(Instant::now()) { + let _ = self.sample_proc_vmstat(elapsed.as_secs_f64()); } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } } diff --git a/src/samplers/memory/stats.rs b/src/samplers/memory/stats.rs index d7af1df1..0f32f441 100644 --- a/src/samplers/memory/stats.rs +++ b/src/samplers/memory/stats.rs @@ -1,4 +1,3 @@ -use crate::*; use metriken::{metric, Counter, Gauge, LazyCounter, LazyGauge}; #[metric( diff --git a/src/samplers/mod.rs b/src/samplers/mod.rs index 227b4771..987ede2a 100644 --- a/src/samplers/mod.rs +++ b/src/samplers/mod.rs @@ -1,7 +1,9 @@ +pub mod hwinfo; + mod block_io; mod cpu; +mod filesystem; mod gpu; -pub mod hwinfo; mod memory; mod network; mod rezolus; diff --git a/src/samplers/network/linux/interfaces/mod.rs b/src/samplers/network/linux/interfaces/mod.rs new file mode 100644 index 00000000..594f7731 --- /dev/null +++ b/src/samplers/network/linux/interfaces/mod.rs @@ -0,0 +1,104 @@ +use crate::common::{Interval, Nop}; +use crate::samplers::hwinfo::hardware_info; +use crate::samplers::network::stats::*; +use crate::samplers::network::*; +use metriken::Counter; +use std::fs::File; +use std::io::Read; +use std::io::Seek; + +#[distributed_slice(NETWORK_SAMPLERS)] +fn init(config: &Config) -> Box { + if let Ok(s) = Interfaces::new(config) { + Box::new(s) + } else { + Box::new(Nop::new(config)) + } +} + +const NAME: &str = "network_interfaces"; + +pub struct Interfaces { + interval: Interval, + stats: Vec<(&'static Lazy, &'static str, HashMap)>, +} + +impl Interfaces { + pub fn new(config: &Config) -> Result { + // check if sampler should be 
enabled + if !config.enabled(NAME) { + return Err(()); + } + + let hwinfo = hardware_info().map_err(|e| { + error!("failed to load hardware info: {e}"); + })?; + + let mut metrics = vec![ + (&NETWORK_RX_CRC_ERRORS, "rx_crc_errors"), + (&NETWORK_RX_DROPPED, "rx_dropped"), + (&NETWORK_RX_MISSED_ERRORS, "rx_missed_errors"), + (&NETWORK_TX_DROPPED, "tx_dropped"), + ]; + + let mut stats = Vec::new(); + let mut d = String::new(); + + for (counter, stat) in metrics.drain(..) { + let mut if_stats = HashMap::new(); + + for interface in &hwinfo.network { + if interface.driver.is_none() { + continue; + } + + if let Ok(mut f) = std::fs::File::open(&format!( + "/sys/class/net/{}/statistics/{stat}", + interface.name + )) { + if f.read_to_string(&mut d).is_ok() && d.parse::().is_ok() { + if_stats.insert(interface.name.to_string(), f); + } + } + } + + stats.push((counter, stat, if_stats)); + } + + Ok(Self { + stats, + interval: Interval::new(Instant::now(), config.interval(NAME)), + }) + } +} + +impl Sampler for Interfaces { + fn sample(&mut self) { + if self.interval.try_wait(Instant::now()).is_err() { + return; + } + + let mut data = String::new(); + + 'outer: for (counter, _stat, ref mut if_stats) in &mut self.stats { + let mut sum = 0; + + for file in if_stats.values_mut() { + if file.rewind().is_ok() { + if let Err(e) = file.read_to_string(&mut data) { + error!("error reading: {e}"); + continue 'outer; + } + + if let Ok(v) = data.parse::() { + sum += v; + } else { + continue 'outer; + } + } + } + + counter.set(sum); + } + } +} diff --git a/src/samplers/network/linux/mod.rs b/src/samplers/network/linux/mod.rs index e195352c..a4645ef3 100644 --- a/src/samplers/network/linux/mod.rs +++ b/src/samplers/network/linux/mod.rs @@ -1 +1,2 @@ +mod interfaces; mod traffic; diff --git a/src/samplers/network/linux/traffic/bpf.rs b/src/samplers/network/linux/traffic/bpf.rs index b1761cb3..70564b77 100644 --- a/src/samplers/network/linux/traffic/bpf.rs +++ 
b/src/samplers/network/linux/traffic/bpf.rs @@ -1,3 +1,4 @@ +#[allow(clippy::module_inception)] mod bpf { include!(concat!(env!("OUT_DIR"), "/network_traffic.bpf.rs")); } @@ -28,18 +29,14 @@ impl GetMap for ModSkel<'_> { /// * `network/transmit/frames` pub struct NetworkTraffic { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl NetworkTraffic { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -50,11 +47,18 @@ impl NetworkTraffic { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} netif_receive_skb() BPF instruction count: {}", + skel.progs().netif_receive_skb().insn_cnt() + ); + debug!( + "{NAME} tcp_cleanup_rbuf() BPF instruction count: {}", + skel.progs().tcp_cleanup_rbuf().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - let counters = vec![ Counter::new(&NETWORK_RX_BYTES, Some(&NETWORK_RX_BYTES_HISTOGRAM)), Counter::new(&NETWORK_TX_BYTES, Some(&NETWORK_TX_BYTES_HISTOGRAM)), @@ -62,68 +66,38 @@ impl NetworkTraffic { Counter::new(&NETWORK_TX_PACKETS, Some(&NETWORK_TX_PACKETS_HISTOGRAM)), ]; - bpf.add_counters("counters", counters); + let bpf = BpfBuilder::new(skel).counters("counters", counters).build(); + + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, 
config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for NetworkTraffic { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git a/src/samplers/network/stats.rs b/src/samplers/network/stats.rs index 510bde93..9a1d5996 100644 --- a/src/samplers/network/stats.rs +++ b/src/samplers/network/stats.rs @@ -1,6 +1,5 @@ use crate::common::HISTOGRAM_GROUPING_POWER; -use crate::*; -use metriken::{metric, AtomicHistogram, Counter, Gauge, LazyCounter, LazyGauge}; +use metriken::{metric, AtomicHistogram, Counter, LazyCounter}; #[metric( name = "network/receive/bytes", @@ -17,6 +16,27 
@@ pub static NETWORK_RX_BYTES: LazyCounter = LazyCounter::new(Counter::default); pub static NETWORK_RX_BYTES_HISTOGRAM: AtomicHistogram = AtomicHistogram::new(HISTOGRAM_GROUPING_POWER, 64); +#[metric( + name = "network/receive/errors/crc", + description = "The number of packets received which had CRC errors", + metadata = { unit = "packets" } +)] +pub static NETWORK_RX_CRC_ERRORS: LazyCounter = LazyCounter::new(Counter::default); + +#[metric( + name = "network/receive/dropped", + description = "The number of packets received but not processed. Usually due to lack of resources or unsupported protocol. Does not include hardware interface buffer exhaustion.", + metadata = { unit = "packets" } +)] +pub static NETWORK_RX_DROPPED: LazyCounter = LazyCounter::new(Counter::default); + +#[metric( + name = "network/receive/errors/missed", + description = "The number of packets missed due to buffer exhaustion.", + metadata = { unit = "packets" } +)] +pub static NETWORK_RX_MISSED_ERRORS: LazyCounter = LazyCounter::new(Counter::default); + #[metric( name = "network/receive/packets", description = "The number of packets received over the network", @@ -47,6 +67,13 @@ pub static NETWORK_TX_BYTES: LazyCounter = LazyCounter::new(Counter::default); pub static NETWORK_TX_BYTES_HISTOGRAM: AtomicHistogram = AtomicHistogram::new(HISTOGRAM_GROUPING_POWER, 64); +#[metric( + name = "network/transmit/dropped", + description = "The number of packets dropped on the transmit path. 
Usually due to lack of resources.", + metadata = { unit = "packets" } +)] +pub static NETWORK_TX_DROPPED: LazyCounter = LazyCounter::new(Counter::default); + #[metric( name = "network/transmit/packets", description = "The number of packets transmitted over the network", diff --git a/src/samplers/rezolus/rusage/mod.rs b/src/samplers/rezolus/rusage/mod.rs index 50096171..d6d953d3 100644 --- a/src/samplers/rezolus/rusage/mod.rs +++ b/src/samplers/rezolus/rusage/mod.rs @@ -1,8 +1,7 @@ use super::stats::*; use super::*; use crate::common::units::{KIBIBYTES, MICROSECONDS, SECONDS}; -use crate::common::Counter; -use crate::common::Nop; +use crate::common::{Counter, Interval, Nop}; #[distributed_slice(REZOLUS_SAMPLERS)] fn init(config: &Config) -> Box { @@ -16,9 +15,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "rezolus_rusage"; pub struct Rusage { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, ru_utime: Counter, ru_stime: Counter, } @@ -30,12 +27,8 @@ impl Rusage { return Err(()); } - let now = Instant::now(); - Ok(Self { - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), ru_utime: Counter::new(&RU_UTIME, Some(&RU_UTIME_HISTOGRAM)), ru_stime: Counter::new(&RU_STIME, Some(&RU_STIME_HISTOGRAM)), }) @@ -44,30 +37,9 @@ impl Rusage { impl Sampler for Rusage { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { - return; - } - - let elapsed = (now - self.prev).as_secs_f64(); - - self.sample_rusage(elapsed); - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; + if let Ok(elapsed) = self.interval.try_wait(Instant::now()) { + self.sample_rusage(elapsed.as_secs_f64()); } - - // mark when we last 
sampled - self.prev = now; } } diff --git a/src/samplers/scheduler/linux/runqueue/mod.bpf.c b/src/samplers/scheduler/linux/runqueue/mod.bpf.c index cdb09f56..4c0f5062 100644 --- a/src/samplers/scheduler/linux/runqueue/mod.bpf.c +++ b/src/samplers/scheduler/linux/runqueue/mod.bpf.c @@ -7,7 +7,7 @@ // Rezolus. // This BPF program probes enqueue and dequeue from the scheduler runqueue -// to calculate the runqueue latency. +// to calculate the runqueue latency, running time, and off-cpu time. #include #include "../../../common/bpf/histogram.h" @@ -15,13 +15,15 @@ #include #define COUNTER_GROUP_WIDTH 8 -#define HISTOGRAM_BUCKETS 7424 +#define HISTOGRAM_POWER 7 #define MAX_CPUS 1024 #define MAX_PID 4194304 -#define IVCSW 0 #define TASK_RUNNING 0 +// counter positions +#define IVCSW 0 + /** * commit 2f064a59a1 ("sched: Change task_struct::state") changes * the name of task_struct::state to task_struct::__state @@ -45,7 +47,7 @@ static __always_inline __s64 get_task_state(void *task) return BPF_CORE_READ((struct task_struct___o *)task, state); } -// counters +// counters (see constants defined at top) struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(map_flags, BPF_F_MMAPABLE); @@ -54,6 +56,10 @@ struct { __uint(max_entries, MAX_CPUS * COUNTER_GROUP_WIDTH); } counters SEC(".maps"); +/* + * tracking structs + */ + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, MAX_PID); @@ -61,6 +67,13 @@ struct { __type(value, u64); } enqueued_at SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_PID); + __type(key, u32); + __type(value, u64); +} offcpu_at SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, MAX_PID); @@ -68,12 +81,16 @@ struct { __type(value, u64); } running_at SEC(".maps"); +/* + * histograms + */ + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, 
HISTOGRAM_BUCKETS_POW_7); } runqlat SEC(".maps"); struct { @@ -81,9 +98,17 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } running SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, u32); + __type(value, u64); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); +} offcpu SEC(".maps"); + /* record enqueue timestamp */ static __always_inline int trace_enqueue(u32 tgid, u32 pid) @@ -126,9 +151,10 @@ int handle__sched_switch(u64 *ctx) struct task_struct *prev = (struct task_struct *)ctx[1]; struct task_struct *next = (struct task_struct *)ctx[2]; - u32 pid; - u64 *tsp, delta_ns, *cnt; + u32 pid, idx; + u64 *tsp, delta_ns, *cnt, offcpu_ns; + u32 processor_id = bpf_get_smp_processor_id(); u64 ts = bpf_ktime_get_ns(); @@ -138,10 +164,10 @@ int handle__sched_switch(u64 *ctx) // prev task is moving from running // - update prev->pid enqueued_at with now - // - calculate how long prev task was running, update hist + // - calculate how long prev task was running and update hist if (get_task_state(prev) == TASK_RUNNING) { // count involuntary context switch - u32 idx = COUNTER_GROUP_WIDTH * bpf_get_smp_processor_id() + IVCSW; + idx = COUNTER_GROUP_WIDTH * processor_id + IVCSW; cnt = bpf_map_lookup_elem(&counters, &idx); if (cnt) { @@ -153,11 +179,13 @@ int handle__sched_switch(u64 *ctx) // mark when it was enqueued bpf_map_update_elem(&enqueued_at, &pid, &ts, 0); - // calculate how long it was running, increment running histogram + // calculate how long it was running and increment stats tsp = bpf_map_lookup_elem(&running_at, &pid); if (tsp && *tsp) { delta_ns = ts - *tsp; - u32 idx = value_to_index(delta_ns); + + // update histogram + idx = value_to_index(delta_ns, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&running, &idx); if (cnt) { __sync_fetch_and_add(cnt, 1); @@ -166,6 +194,12 @@ int 
handle__sched_switch(u64 *ctx) *tsp = 0; } } + + // for all tasks: track when it went off-cpu + pid = prev->pid; + + // mark off-cpu at + bpf_map_update_elem(&offcpu_at, &pid, &ts, 0); // next task has moved into running // - update next->pid running_at with now @@ -175,17 +209,39 @@ int handle__sched_switch(u64 *ctx) // update running_at bpf_map_update_elem(&running_at, &pid, &ts, 0); - // calculate how long it was enqueued, increment running histogram + // calculate how long it was enqueued and increment stats tsp = bpf_map_lookup_elem(&enqueued_at, &pid); if (tsp && *tsp) { delta_ns = ts - *tsp; - u32 idx = value_to_index(delta_ns); + + // update the histogram + idx = value_to_index(delta_ns, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&runqlat, &idx); if (cnt) { __sync_fetch_and_add(cnt, 1); } *tsp = 0; + + // calculate how long it was off-cpu, not including runqueue wait, + // and increment stats + tsp = bpf_map_lookup_elem(&offcpu_at, &pid); + if (tsp && *tsp) { + offcpu_ns = ts - *tsp; + + if (offcpu_ns > delta_ns) { + offcpu_ns = offcpu_ns - delta_ns; + + // update the histogram + idx = value_to_index(offcpu_ns, HISTOGRAM_POWER); + cnt = bpf_map_lookup_elem(&offcpu, &idx); + if (cnt) { + __sync_fetch_and_add(cnt, 1); + } + } + + *tsp = 0; + } } return 0; diff --git a/src/samplers/scheduler/linux/runqueue/mod.rs b/src/samplers/scheduler/linux/runqueue/mod.rs index faa021f9..d3860f03 100644 --- a/src/samplers/scheduler/linux/runqueue/mod.rs +++ b/src/samplers/scheduler/linux/runqueue/mod.rs @@ -38,18 +38,14 @@ impl GetMap for ModSkel<'_> { /// * `scheduler/context_switch/voluntary` pub struct Runqlat { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl Runqlat { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if 
!config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -60,84 +56,61 @@ impl Runqlat { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} handle__sched_wakeup() BPF instruction count: {}", + skel.progs().handle__sched_wakeup().insn_cnt() + ); + debug!( + "{NAME} handle__sched_wakeup_new() BPF instruction count: {}", + skel.progs().handle__sched_wakeup_new().insn_cnt() + ); + debug!( + "{NAME} handle__sched_switch() BPF instruction count: {}", + skel.progs().handle__sched_switch().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - let counters = vec![Counter::new(&SCHEDULER_IVCSW, None)]; - bpf.add_counters("counters", counters); + let bpf = BpfBuilder::new(skel) + .counters("counters", counters) + .distribution("runqlat", &SCHEDULER_RUNQUEUE_LATENCY) + .distribution("running", &SCHEDULER_RUNNING) + .distribution("offcpu", &SCHEDULER_OFFCPU) + .build(); - let mut distributions = vec![ - ("runqlat", &SCHEDULER_RUNQUEUE_LATENCY), - ("running", &SCHEDULER_RUNNING), - ]; - - for (name, histogram) in distributions.drain(..) 
{ - bpf.add_distribution(name, histogram); - } + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for Runqlat { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git 
a/src/samplers/scheduler/stats.rs b/src/samplers/scheduler/stats.rs index 954a3cef..52472ee9 100644 --- a/src/samplers/scheduler/stats.rs +++ b/src/samplers/scheduler/stats.rs @@ -1,5 +1,4 @@ use crate::common::HISTOGRAM_GROUPING_POWER; -use crate::*; use metriken::{metric, Counter, LazyCounter, RwLockHistogram}; #[metric( @@ -17,6 +16,13 @@ pub static SCHEDULER_RUNQUEUE_LATENCY: RwLockHistogram = )] pub static SCHEDULER_RUNNING: RwLockHistogram = RwLockHistogram::new(HISTOGRAM_GROUPING_POWER, 64); +#[metric( + name = "scheduler/offcpu", + description = "Distribution of the amount of time tasks were off-CPU", + metadata = { unit = "nanoseconds" } +)] +pub static SCHEDULER_OFFCPU: RwLockHistogram = RwLockHistogram::new(HISTOGRAM_GROUPING_POWER, 64); + #[metric( name = "scheduler/context_switch/involuntary", description = "The number of involuntary context switches" diff --git a/src/samplers/syscall/linux/latency/mod.bpf.c b/src/samplers/syscall/linux/latency/mod.bpf.c index 3c1e1452..1d0ead3a 100644 --- a/src/samplers/syscall/linux/latency/mod.bpf.c +++ b/src/samplers/syscall/linux/latency/mod.bpf.c @@ -18,7 +18,7 @@ #include #define COUNTER_GROUP_WIDTH 8 -#define HISTOGRAM_BUCKETS 7424 +#define HISTOGRAM_POWER 7 #define MAX_CPUS 1024 #define MAX_SYSCALL_ID 1024 #define MAX_PID 4194304 @@ -48,7 +48,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } total_latency SEC(".maps"); struct { @@ -56,7 +56,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } read_latency SEC(".maps"); struct { @@ -64,7 +64,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } write_latency SEC(".maps"); struct { @@ 
-72,7 +72,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } poll_latency SEC(".maps"); struct { @@ -80,7 +80,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } lock_latency SEC(".maps"); struct { @@ -88,7 +88,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } time_latency SEC(".maps"); struct { @@ -96,7 +96,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } sleep_latency SEC(".maps"); struct { @@ -104,7 +104,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, HISTOGRAM_BUCKETS); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } socket_latency SEC(".maps"); // provides a lookup table from syscall id to a counter index offset @@ -188,7 +188,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) *start_ts = 0; // calculate the histogram index for this latency value - idx = value_to_index(lat); + idx = value_to_index(lat, HISTOGRAM_POWER); // update the total latency histogram cnt = bpf_map_lookup_elem(&total_latency, &idx); diff --git a/src/samplers/syscall/linux/latency/mod.rs b/src/samplers/syscall/linux/latency/mod.rs index ca1a686f..d66427e7 100644 --- a/src/samplers/syscall/linux/latency/mod.rs +++ b/src/samplers/syscall/linux/latency/mod.rs @@ -13,6 +13,8 @@ mod bpf { const NAME: &str = "syscall_latency"; +const MAX_SYSCALL_ID: usize = 1024; + use bpf::*; use crate::common::bpf::*; @@ -20,8 +22,6 @@ use crate::common::*; use crate::samplers::syscall::stats::*; use crate::samplers::syscall::*; -use 
std::os::fd::{AsFd, AsRawFd, FromRawFd}; - impl GetMap for ModSkel<'_> { fn map(&self, name: &str) -> &libbpf_rs::Map { self.obj.map(name).unwrap() @@ -37,18 +37,14 @@ impl GetMap for ModSkel<'_> { /// * `syscall/total/latency` pub struct Syscall { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl Syscall { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -59,62 +55,18 @@ impl Syscall { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} sys_enter() BPF instruction count: {}", + skel.progs().sys_enter().insn_cnt() + ); + debug!( + "{NAME} sys_exit() BPF instruction count: {}", + skel.progs().sys_exit().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - - let fd = bpf.map("syscall_lut").as_fd().as_raw_fd(); - let file = unsafe { std::fs::File::from_raw_fd(fd as _) }; - let mut syscall_lut = unsafe { - memmap2::MmapOptions::new() - .len(1024 * 8) - .map_mut(&file) - .expect("failed to mmap() bpf syscall lut") - }; - - for (syscall_id, bytes) in syscall_lut.chunks_exact_mut(8).enumerate() { - let counter_offset = bytes.as_mut_ptr() as *mut u64; - if let Some(syscall_name) = syscall_numbers::native::sys_call_name(syscall_id as i64) { - let group = match syscall_name { - // read related - "pread64" | "preadv" | "preadv2" | "read" | "readv" | "recvfrom" - | "recvmmsg" | "recvmsg" => 1, - // write related - "pwrite64" | "pwritev" | "pwritev2" | "sendmmsg" | "sendmsg" | "sendto" - | "write" | "writev" => 2, - // poll/select/epoll - "epoll_create" | "epoll_create1" | "epoll_ctl" | "epoll_ctl_old" - | "epoll_pwait" | 
"epoll_pwait2" | "epoll_wait" | "epoll_wait_old" | "poll" - | "ppoll" | "ppoll_time64" | "pselect6" | "pselect6_time64" | "select" => 3, - // locking - "futex" => 4, - // time - "adjtimex" | "clock_adjtime" | "clock_getres" | "clock_gettime" - | "clock_settime" | "gettimeofday" | "settimeofday" | "time" => 5, - // sleep - "clock_nanosleep" | "nanosleep" => 6, - // socket creation and management - "accept" | "bind" | "connect" | "getpeername" | "getsockname" - | "getsockopt" | "listen" | "setsockopt" | "shutdown" | "socket" - | "socketpair" => 7, - _ => { - // no group defined for these syscalls - 0 - } - }; - unsafe { - *counter_offset = group; - } - } else { - unsafe { - *counter_offset = 0; - } - } - } - - let _ = syscall_lut.flush(); - let counters = vec![ Counter::new(&SYSCALL_TOTAL, Some(&SYSCALL_TOTAL_HISTOGRAM)), Counter::new(&SYSCALL_READ, Some(&SYSCALL_READ_HISTOGRAM)), @@ -126,83 +78,86 @@ impl Syscall { Counter::new(&SYSCALL_SOCKET, Some(&SYSCALL_SOCKET_HISTOGRAM)), ]; - bpf.add_counters("counters", counters); - - let mut distributions = vec![ - ("total_latency", &SYSCALL_TOTAL_LATENCY), - ("read_latency", &SYSCALL_READ_LATENCY), - ("write_latency", &SYSCALL_WRITE_LATENCY), - ("poll_latency", &SYSCALL_POLL_LATENCY), - ("lock_latency", &SYSCALL_LOCK_LATENCY), - ("time_latency", &SYSCALL_TIME_LATENCY), - ("sleep_latency", &SYSCALL_SLEEP_LATENCY), - ("socket_latency", &SYSCALL_SOCKET_LATENCY), - ]; + let syscall_lut: Vec = (0..MAX_SYSCALL_ID) + .map(|id| { + if let Some(syscall_name) = syscall_numbers::native::sys_call_name(id as i64) { + match syscall_name { + // read related + "pread64" | "preadv" | "preadv2" | "read" | "readv" | "recvfrom" + | "recvmmsg" | "recvmsg" => 1, + // write related + "pwrite64" | "pwritev" | "pwritev2" | "sendmmsg" | "sendmsg" | "sendto" + | "write" | "writev" => 2, + // poll/select/epoll + "epoll_create" | "epoll_create1" | "epoll_ctl" | "epoll_ctl_old" + | "epoll_pwait" | "epoll_pwait2" | "epoll_wait" | "epoll_wait_old" + | 
"poll" | "ppoll" | "ppoll_time64" | "pselect6" | "pselect6_time64" + | "select" => 3, + // locking + "futex" => 4, + // time + "adjtimex" | "clock_adjtime" | "clock_getres" | "clock_gettime" + | "clock_settime" | "gettimeofday" | "settimeofday" | "time" => 5, + // sleep + "clock_nanosleep" | "nanosleep" => 6, + // socket creation and management + "accept" | "bind" | "connect" | "getpeername" | "getsockname" + | "getsockopt" | "listen" | "setsockopt" | "shutdown" | "socket" + | "socketpair" => 7, + _ => { + // no group defined for these syscalls + 0 + } + } + } else { + 0 + } + }) + .collect(); + + let bpf = BpfBuilder::new(skel) + .counters("counters", counters) + .distribution("total_latency", &SYSCALL_TOTAL_LATENCY) + .distribution("read_latency", &SYSCALL_READ_LATENCY) + .distribution("write_latency", &SYSCALL_WRITE_LATENCY) + .distribution("poll_latency", &SYSCALL_POLL_LATENCY) + .distribution("lock_latency", &SYSCALL_LOCK_LATENCY) + .distribution("time_latency", &SYSCALL_TIME_LATENCY) + .distribution("sleep_latency", &SYSCALL_SLEEP_LATENCY) + .distribution("socket_latency", &SYSCALL_SOCKET_LATENCY) + .map("syscall_lut", &syscall_lut) + .build(); - for (name, histogram) in distributions.drain(..) 
{ - bpf.add_distribution(name, histogram); - } + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for Syscall { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git 
a/src/samplers/syscall/stats.rs b/src/samplers/syscall/stats.rs index a7599a31..12d9eb3f 100644 --- a/src/samplers/syscall/stats.rs +++ b/src/samplers/syscall/stats.rs @@ -1,7 +1,5 @@ use crate::common::HISTOGRAM_GROUPING_POWER; -use metriken::{ - metric, AtomicHistogram, Counter, Format, LazyCounter, MetricEntry, RwLockHistogram, -}; +use metriken::{metric, AtomicHistogram, Counter, LazyCounter, RwLockHistogram}; #[metric( name = "syscall/total", diff --git a/src/samplers/tcp/linux/connection_state/mod.rs b/src/samplers/tcp/linux/connection_state/mod.rs index 41fa7d2f..52bd2536 100644 --- a/src/samplers/tcp/linux/connection_state/mod.rs +++ b/src/samplers/tcp/linux/connection_state/mod.rs @@ -1,4 +1,4 @@ -use crate::common::Nop; +use crate::common::{Interval, Nop}; use crate::samplers::tcp::stats::*; use crate::samplers::tcp::*; use metriken::Gauge; @@ -18,9 +18,7 @@ fn init(config: &Config) -> Box { const NAME: &str = "tcp_connection_state"; pub struct ConnectionState { - prev: Instant, - next: Instant, - interval: Duration, + interval: Interval, files: Vec, gauges: Vec<(&'static Lazy, i64)>, } @@ -32,8 +30,6 @@ impl ConnectionState { return Err(()); } - let now = Instant::now(); - let gauges: Vec<(&'static Lazy, i64)> = vec![ (&TCP_CONN_STATE_ESTABLISHED, 0), (&TCP_CONN_STATE_SYN_SENT, 0), @@ -69,18 +65,14 @@ impl ConnectionState { Ok(Self { files, gauges, - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), }) } } impl Sampler for ConnectionState { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { + if self.interval.try_wait(Instant::now()).is_err() { return; } @@ -114,20 +106,5 @@ impl Sampler for ConnectionState { for (gauge, value) in self.gauges.iter() { gauge.set(*value); } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - 
self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } } diff --git a/src/samplers/tcp/linux/packet_latency/mod.bpf.c b/src/samplers/tcp/linux/packet_latency/mod.bpf.c index 8f15893f..a7ed9062 100644 --- a/src/samplers/tcp/linux/packet_latency/mod.bpf.c +++ b/src/samplers/tcp/linux/packet_latency/mod.bpf.c @@ -16,6 +16,8 @@ #include #include +#define HISTOGRAM_POWER 7 + #define MAX_ENTRIES 10240 #define AF_INET 2 #define NO_EXIST 1 @@ -32,7 +34,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, 7424); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } latency SEC(".maps"); static __always_inline __u64 get_sock_ident(struct sock *sk) @@ -83,7 +85,7 @@ static int handle_tcp_rcv_space_adjust(void *ctx, struct sock *sk) delta_ns = (now - *tsp); - idx = value_to_index(delta_ns); + idx = value_to_index(delta_ns, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&latency, &idx); if (cnt) { diff --git a/src/samplers/tcp/linux/packet_latency/mod.rs b/src/samplers/tcp/linux/packet_latency/mod.rs index 944a5767..8b004373 100644 --- a/src/samplers/tcp/linux/packet_latency/mod.rs +++ b/src/samplers/tcp/linux/packet_latency/mod.rs @@ -35,18 +35,14 @@ impl GetMap for ModSkel<'_> { /// * `tcp/receive/packet_latency` pub struct PacketLatency { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl PacketLatency { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -57,77 +53,56 @@ impl PacketLatency { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} 
tcp_probe() BPF instruction count: {}", + skel.progs().tcp_probe().insn_cnt() + ); + debug!( + "{NAME} tcp_rcv_space_adjust() BPF instruction count: {}", + skel.progs().tcp_rcv_space_adjust().insn_cnt() + ); + debug!( + "{NAME} tcp_destroy_sock() BPF instruction count: {}", + skel.progs().tcp_destroy_sock().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - - let mut distributions = vec![("latency", &TCP_PACKET_LATENCY)]; + let bpf = BpfBuilder::new(skel) + .distribution("latency", &TCP_PACKET_LATENCY) + .build(); - for (name, histogram) in distributions.drain(..) { - bpf.add_distribution(name, histogram); - } + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + 
self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for PacketLatency { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git a/src/samplers/tcp/linux/receive/mod.bpf.c b/src/samplers/tcp/linux/receive/mod.bpf.c index d4f24f7c..6bc9356b 100644 --- a/src/samplers/tcp/linux/receive/mod.bpf.c +++ b/src/samplers/tcp/linux/receive/mod.bpf.c @@ -16,12 +16,14 @@ #include #include +#define HISTOGRAM_POWER 7 + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, 7424); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } jitter SEC(".maps"); struct { @@ -29,7 +31,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, 7424); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } srtt SEC(".maps"); SEC("kprobe/tcp_rcv_established") @@ -49,7 +51,7 @@ int BPF_KPROBE(tcp_rcv_kprobe, struct sock *sk) // record nanoseconds. srtt_ns = 1000 * (u64) srtt_us >> 3; - idx = value_to_index(srtt_ns); + idx = value_to_index(srtt_ns, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&srtt, &idx); if (cnt) { @@ -60,7 +62,7 @@ int BPF_KPROBE(tcp_rcv_kprobe, struct sock *sk) // record nanoseconds. 
mdev_ns = 1000 * (u64) mdev_us >> 2; - idx = value_to_index(mdev_ns); + idx = value_to_index(mdev_ns, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&jitter, &idx); if (cnt) { diff --git a/src/samplers/tcp/linux/receive/mod.rs b/src/samplers/tcp/linux/receive/mod.rs index 53b2cc7a..7ee11e26 100644 --- a/src/samplers/tcp/linux/receive/mod.rs +++ b/src/samplers/tcp/linux/receive/mod.rs @@ -34,18 +34,14 @@ impl GetMap for ModSkel<'_> { /// * `tcp/receive/srtt` pub struct Receive { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl Receive { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -56,77 +52,49 @@ impl Receive { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} tcp_rcv() BPF instruction count: {}", + skel.progs().tcp_rcv_kprobe().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - - let mut distributions = vec![("srtt", &TCP_SRTT), ("jitter", &TCP_JITTER)]; + let bpf = BpfBuilder::new(skel) + .distribution("srtt", &TCP_SRTT) + .distribution("jitter", &TCP_JITTER) + .build(); - for (name, histogram) in distributions.drain(..) 
{ - bpf.add_distribution(name, histogram); - } + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for Receive { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git 
a/src/samplers/tcp/linux/retransmit/mod.rs b/src/samplers/tcp/linux/retransmit/mod.rs index 39217122..4b6260e7 100644 --- a/src/samplers/tcp/linux/retransmit/mod.rs +++ b/src/samplers/tcp/linux/retransmit/mod.rs @@ -33,18 +33,14 @@ impl GetMap for ModSkel<'_> { /// * `tcp/transmit/retransmit` pub struct Retransmit { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl Retransmit { pub fn new(config: &Config) -> Result { // check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -55,78 +51,51 @@ impl Retransmit { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} tcp_retransmit_skb() BPF instruction count: {}", + skel.progs().tcp_retransmit_skb().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - let counters = vec![Counter::new( &TCP_TX_RETRANSMIT, Some(&TCP_TX_RETRANSMIT_HISTOGRAM), )]; - bpf.add_counters("counters", counters); + let bpf = BpfBuilder::new(skel).counters("counters", counters).build(); + + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = 
self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for Retransmit { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git a/src/samplers/tcp/linux/traffic/bpf.rs b/src/samplers/tcp/linux/traffic/bpf.rs index 526f9b7b..64c9dd77 100644 --- a/src/samplers/tcp/linux/traffic/bpf.rs +++ b/src/samplers/tcp/linux/traffic/bpf.rs @@ -1,3 +1,4 @@ +#[allow(clippy::module_inception)] mod bpf { include!(concat!(env!("OUT_DIR"), "/tcp_traffic.bpf.rs")); } @@ -30,18 +31,14 @@ impl GetMap for ModSkel<'_> { /// * `tcp/transmit/size` pub struct TcpTraffic { bpf: Bpf>, - counter_interval: Duration, - counter_next: Instant, - counter_prev: Instant, - distribution_interval: Duration, - distribution_next: Instant, - distribution_prev: Instant, + counter_interval: Interval, + distribution_interval: Interval, } impl TcpTraffic { pub fn new(config: &Config) -> Result { // 
check if sampler should be enabled - if !config.enabled(NAME) { + if !(config.enabled(NAME) && config.bpf(NAME)) { return Err(()); } @@ -52,11 +49,18 @@ impl TcpTraffic { .load() .map_err(|e| error!("failed to load bpf program: {e}"))?; + debug!( + "{NAME} tcp_sendmsg() BPF instruction count: {}", + skel.progs().tcp_sendmsg().insn_cnt() + ); + debug!( + "{NAME} tcp_cleanup_rbuf() BPF instruction count: {}", + skel.progs().tcp_cleanup_rbuf().insn_cnt() + ); + skel.attach() .map_err(|e| error!("failed to attach bpf program: {e}"))?; - let mut bpf = Bpf::from_skel(skel); - let counters = vec![ Counter::new(&TCP_RX_BYTES, Some(&TCP_RX_BYTES_HISTOGRAM)), Counter::new(&TCP_TX_BYTES, Some(&TCP_TX_BYTES_HISTOGRAM)), @@ -64,74 +68,42 @@ impl TcpTraffic { Counter::new(&TCP_TX_PACKETS, Some(&TCP_TX_PACKETS_HISTOGRAM)), ]; - bpf.add_counters("counters", counters); + let mut bpf = BpfBuilder::new(skel) + .counters("counters", counters) + .distribution("rx_size", &TCP_RX_SIZE) + .distribution("tx_size", &TCP_TX_SIZE) + .build(); - let mut distributions = vec![("rx_size", &TCP_RX_SIZE), ("tx_size", &TCP_TX_SIZE)]; - - for (name, histogram) in distributions.drain(..) 
{ - bpf.add_distribution(name, histogram); - } + let now = Instant::now(); Ok(Self { bpf, - counter_interval: config.interval(NAME), - counter_next: Instant::now(), - counter_prev: Instant::now(), - distribution_interval: config.distribution_interval(NAME), - distribution_next: Instant::now(), - distribution_prev: Instant::now(), + counter_interval: Interval::new(now, config.interval(NAME)), + distribution_interval: Interval::new(now, config.distribution_interval(NAME)), }) } - pub fn refresh_counters(&mut self, now: Instant) { - if now < self.counter_next { - return; - } - - let elapsed = (now - self.counter_prev).as_secs_f64(); + pub fn refresh_counters(&mut self, now: Instant) -> Result<(), ()> { + let elapsed = self.counter_interval.try_wait(now)?; self.bpf.refresh_counters(elapsed); - // determine when to sample next - let next = self.counter_next + self.counter_interval; - - // check that next sample time is in the future - if next > now { - self.counter_next = next; - } else { - self.counter_next = now + self.counter_interval; - } - - // mark when we last sampled - self.counter_prev = now; + Ok(()) } - pub fn refresh_distributions(&mut self, now: Instant) { - if now < self.distribution_next { - return; - } + pub fn refresh_distributions(&mut self, now: Instant) -> Result<(), ()> { + self.distribution_interval.try_wait(now)?; self.bpf.refresh_distributions(); - // determine when to sample next - let next = self.distribution_next + self.distribution_interval; - - // check that next sample time is in the future - if next > now { - self.distribution_next = next; - } else { - self.distribution_next = now + self.distribution_interval; - } - - // mark when we last sampled - self.distribution_prev = now; + Ok(()) } } impl Sampler for TcpTraffic { fn sample(&mut self) { let now = Instant::now(); - self.refresh_counters(now); - self.refresh_distributions(now); + let _ = self.refresh_counters(now); + let _ = self.refresh_distributions(now); } } diff --git 
a/src/samplers/tcp/linux/traffic/mod.bpf.c b/src/samplers/tcp/linux/traffic/mod.bpf.c index db81e312..8bcd1f1c 100644 --- a/src/samplers/tcp/linux/traffic/mod.bpf.c +++ b/src/samplers/tcp/linux/traffic/mod.bpf.c @@ -16,6 +16,8 @@ #include #include +#define HISTOGRAM_POWER 7 + /* Taken from kernel include/linux/socket.h. */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_INET6 10 /* IP version 6 */ @@ -39,7 +41,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, 7424); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } rx_size SEC(".maps"); struct { @@ -47,7 +49,7 @@ struct { __uint(map_flags, BPF_F_MMAPABLE); __type(key, u32); __type(value, u64); - __uint(max_entries, 7424); + __uint(max_entries, HISTOGRAM_BUCKETS_POW_7); } tx_size SEC(".maps"); static int probe_ip(bool receiving, struct sock *sk, size_t size) @@ -72,7 +74,7 @@ static int probe_ip(bool receiving, struct sock *sk, size_t size) __sync_fetch_and_add(cnt, (u64) size); } - idx = value_to_index((u64) size); + idx = value_to_index((u64) size, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&rx_size, &idx); if (cnt) { @@ -93,7 +95,7 @@ static int probe_ip(bool receiving, struct sock *sk, size_t size) __sync_fetch_and_add(cnt, (u64) size); } - idx = value_to_index((u64) size); + idx = value_to_index((u64) size, HISTOGRAM_POWER); cnt = bpf_map_lookup_elem(&tx_size, &idx); if (cnt) { diff --git a/src/samplers/tcp/linux/traffic/proc.rs b/src/samplers/tcp/linux/traffic/proc.rs index 3ce064c4..77ef1124 100644 --- a/src/samplers/tcp/linux/traffic/proc.rs +++ b/src/samplers/tcp/linux/traffic/proc.rs @@ -1,5 +1,5 @@ use crate::common::classic::NestedMap; -use crate::common::Counter; +use crate::common::{Counter, Interval}; use crate::samplers::tcp::stats::*; use crate::samplers::tcp::*; use std::fs::File; @@ -7,9 +7,7 @@ use std::fs::File; use super::NAME; pub struct ProcNetSnmp { - prev: Instant, - next: Instant, - interval: Duration, + interval: 
Interval, file: File, counters: Vec<(Counter, &'static str, &'static str)>, } @@ -21,8 +19,6 @@ impl ProcNetSnmp { return Err(()); } - let now = Instant::now(); - let counters = vec![ ( Counter::new(&TCP_RX_PACKETS, Some(&TCP_RX_PACKETS_HISTOGRAM)), @@ -39,44 +35,21 @@ impl ProcNetSnmp { Ok(Self { file: File::open("/proc/net/snmp").expect("file not found"), counters, - prev: now, - next: now, - interval: config.interval(NAME), + interval: Interval::new(Instant::now(), config.interval(NAME)), }) } } impl Sampler for ProcNetSnmp { fn sample(&mut self) { - let now = Instant::now(); - - if now < self.next { - return; - } - - let elapsed = (now - self.prev).as_secs_f64(); - - if let Ok(nested_map) = NestedMap::try_from_procfs(&mut self.file) { - for (counter, pkey, lkey) in self.counters.iter_mut() { - if let Some(curr) = nested_map.get(pkey, lkey) { - counter.set(elapsed, curr); + if let Ok(elapsed) = self.interval.try_wait(Instant::now()) { + if let Ok(nested_map) = NestedMap::try_from_procfs(&mut self.file) { + for (counter, pkey, lkey) in self.counters.iter_mut() { + if let Some(curr) = nested_map.get(pkey, lkey) { + counter.set(elapsed.as_secs_f64(), curr); + } } } } - - // determine when to sample next - let next = self.next + self.interval; - - // it's possible we fell behind - if next > now { - // if we didn't, sample at the next planned time - self.next = next; - } else { - // if we did, sample after the interval has elapsed - self.next = now + self.interval; - } - - // mark when we last sampled - self.prev = now; } }