From 8f305f9d0e491dd4218e88d489044c40f8fba9d2 Mon Sep 17 00:00:00 2001 From: turuslan Date: Wed, 24 Nov 2021 18:03:28 +0300 Subject: [PATCH] lotus metrics Signed-off-by: turuslan --- cmake/dependencies.cmake | 3 + core/common/CMakeLists.txt | 5 ++ core/common/prometheus/metrics.hpp | 23 +++++++ core/common/prometheus/rpc.hpp | 49 ++++++++++++++ core/common/prometheus/since.hpp | 23 +++++++ core/node/main/main.cpp | 4 ++ core/node/main/metrics.hpp | 38 +++++++++-- core/node/pubsub_gate.cpp | 31 +++++++++ core/vm/interpreter/impl/interpreter_impl.cpp | 65 +++++++++++++++++++ core/vm/runtime/CMakeLists.txt | 1 + core/vm/runtime/impl/env.cpp | 43 ++++++++++++ 11 files changed, 279 insertions(+), 6 deletions(-) create mode 100644 core/common/prometheus/metrics.hpp create mode 100644 core/common/prometheus/rpc.hpp create mode 100644 core/common/prometheus/since.hpp diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index d2b7f54507..11c568c3ea 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -71,3 +71,6 @@ find_package(RapidJSON CONFIG REQUIRED) # https://thalhammer.it/projects/jwt_cpp hunter_add_package(jwt-cpp) find_package(jwt-cpp CONFIG REQUIRED) + +hunter_add_package(prometheus-cpp) +find_package(prometheus-cpp CONFIG REQUIRED) diff --git a/core/common/CMakeLists.txt b/core/common/CMakeLists.txt index 6dfee5cb66..2fabd2784a 100644 --- a/core/common/CMakeLists.txt +++ b/core/common/CMakeLists.txt @@ -33,6 +33,11 @@ target_link_libraries(outcome p2p::p2p ) +add_library(prometheus INTERFACE) +target_link_libraries(prometheus INTERFACE + prometheus-cpp::core + ) + add_library(logger logger.cpp ) diff --git a/core/common/prometheus/metrics.hpp b/core/common/prometheus/metrics.hpp new file mode 100644 index 0000000000..2b60451c99 --- /dev/null +++ b/core/common/prometheus/metrics.hpp @@ -0,0 +1,23 @@ +/** + * Copyright Soramitsu Co., Ltd. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +namespace fc { + inline auto &prometheusRegistry() { + static prometheus::Registry x; + return x; + } + + constexpr std::initializer_list kDefaultPrometheusMsBuckets{ + 0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, + 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, + 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, + 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100000, + }; +} // namespace fc diff --git a/core/common/prometheus/rpc.hpp b/core/common/prometheus/rpc.hpp new file mode 100644 index 0000000000..7beaaddc9b --- /dev/null +++ b/core/common/prometheus/rpc.hpp @@ -0,0 +1,49 @@ +/** + * Copyright Soramitsu Co., Ltd. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "api/rpc/rpc.hpp" +#include "common/prometheus/metrics.hpp" +#include "common/prometheus/since.hpp" + +namespace fc::api::rpc { + inline auto &metricApiTime() { + static auto &x{prometheus::BuildHistogram() + .Name("lotus_api_request_duration_ms") + .Help("Duration of API requests") + .Register(prometheusRegistry())}; + return x; + } + + inline Method metricApiTime(std::string name, Method f) { + return [name{std::move(name)}, f{std::move(f)}]( + const Value &value, + Respond respond, + MakeChan make_chan, + Send send, + const Permissions &permissions) { + f( + value, + [name{std::move(name)}, respond{std::move(respond)}, since{Since{}}]( + auto &&value) { + const auto time{since.ms()}; + metricApiTime() + .Add({{"endpoint", name}}, kDefaultPrometheusMsBuckets) + .Observe(time); + respond(std::move(value)); + }, + std::move(make_chan), + std::move(send), + permissions); + }; + } + + inline void metricApiTime(Rpc &rpc) { + for (auto &[name, value] : rpc.ms) { + value = metricApiTime(name, value); + } + } +} // namespace fc::api::rpc diff --git a/core/common/prometheus/since.hpp b/core/common/prometheus/since.hpp new file mode 100644 index 0000000000..b7977b38e8 --- /dev/null +++ b/core/common/prometheus/since.hpp @@ -0,0 +1,23 @@ +/** + * Copyright Soramitsu Co., Ltd. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace fc { + struct Since { + using Clock = std::chrono::steady_clock; + + Clock::time_point start{Clock::now()}; + + template + T ms() const { + return std::chrono::duration_cast>( + Clock::now() - start) + .count(); + } + }; +} // namespace fc diff --git a/core/node/main/main.cpp b/core/node/main/main.cpp index e0c6273992..46edac3433 100644 --- a/core/node/main/main.cpp +++ b/core/node/main/main.cpp @@ -17,6 +17,7 @@ #include "common/libp2p/peer/peer_info_helper.hpp" #include "common/libp2p/soralog.hpp" #include "common/logger.hpp" +#include "common/prometheus/rpc.hpp" #include "drand/impl/http.hpp" #include "markets/storage/types.hpp" #include "node/blocksync_server.hpp" @@ -209,6 +210,9 @@ namespace fc { *node_objects.api, std::bind(node_objects.api->AuthVerify, std::placeholders::_1))}; + metricApiTime(*rpc_v1); + metricApiTime(*rpc); + std::map> rpcs; rpcs.emplace("/rpc/v0", rpc_v1); rpcs.emplace("/rpc/v1", rpc); diff --git a/core/node/main/metrics.hpp b/core/node/main/metrics.hpp index 080b27b680..defb3e96c1 100644 --- a/core/node/main/metrics.hpp +++ b/core/node/main/metrics.hpp @@ -5,6 +5,7 @@ #pragma once +#include #include #include @@ -12,6 +13,7 @@ #include "clock/utc_clock.hpp" #include "common/fd_usage.hpp" #include "common/memory_usage.hpp" +#include "common/prometheus/metrics.hpp" #include "node/events.hpp" #include "node/main/builder.hpp" #include "node/sync_job.hpp" @@ -27,9 +29,19 @@ namespace fc::node { } std::string prometheus() const { - std::stringstream ss; - auto metric{[&](auto &&name, auto &&value) { - ss << name << ' ' << value << std::endl; + auto families{prometheusRegistry().Collect()}; + using ::prometheus::MetricType; + auto manual{[&](MetricType type, + std::string name, + std::string help) -> ::prometheus::ClientMetric & { + auto &family{families.emplace_back()}; + family.name = std::move(name); + family.help = std::move(help); + family.type = type; + return family.metric.emplace_back(); + }}; + auto metric{[&](std::string name, double value) { + manual(MetricType::Untyped, std::move(name), "").untyped.value = value; }}; metric("uptime", @@ -52,8 +64,9 @@ namespace fc::node { metric("height_attached", std::max(height_head, o.sync_job->metricAttachedHeight())); metric("height_known", std::max(height_head, height_known.load())); - metric("height_expected", - o.chain_epoch_clock->epochAtTime(o.utc_clock->nowUTC()).value()); + const auto height_expected{ + o.chain_epoch_clock->epochAtTime(o.utc_clock->nowUTC()).value()}; + metric("height_expected", height_expected); auto car{[&](auto _size, auto _count, auto _tmp, auto &ipld) { uint64_t size{}; @@ -85,7 +98,20 @@ namespace fc::node { } instances_lock.unlock(); - return ss.str(); + manual(MetricType::Gauge, + "lotus_chain_node_height", + "Current Height of the node") + .gauge.value = height_head; + manual(MetricType::Gauge, + "lotus_chain_node_height_expected", + "Expected Height of the node") + .gauge.value = height_expected; + manual(MetricType::Gauge, + "lotus_chain_node_worker_height", + "Height of workers on the node") + .gauge.value = height_head; + + return ::prometheus::TextSerializer{}.Serialize(families); } const NodeObjects &o; diff --git a/core/node/pubsub_gate.cpp b/core/node/pubsub_gate.cpp index 69a95b8144..c67a7222ba 100644 --- a/core/node/pubsub_gate.cpp +++ b/core/node/pubsub_gate.cpp @@ -7,6 +7,7 @@ #include "codec/cbor/cbor_codec.hpp" #include "common/logger.hpp" +#include "common/prometheus/metrics.hpp" #include "primitives/block/block.hpp" #include "primitives/cid/cid_of_cbor.hpp" @@ -85,6 +86,14 @@ namespace fc::sync { if (!gossip_->publish(blocks_topic_, buffer)) { log()->warn("cannot publish block"); } + + static auto &metric{prometheus::BuildCounter() + .Name("lotus_block_published") + .Help("Counter for total locally published blocks") + .Register(prometheusRegistry()) + .Add({})}; + metric.Increment(); + return outcome::success(); } @@ -93,6 +102,14 @@ namespace fc::sync { Bytes{codec::cbor::encode(msg).value()})) { log()->warn("cannot publish message"); } + + static auto &metric{ + prometheus::BuildCounter() + .Name("lotus_message_published") + .Help("Counter for total locally published messages") + .Register(prometheusRegistry()) + .Add({})}; + metric.Increment(); } bool PubSubGate::onBlock(const PeerId &from, const Bytes &raw) { @@ -102,6 +119,13 @@ namespace fc::sync { // TODO validate + static auto &metric{prometheus::BuildCounter() + .Name("lotus_block_received") + .Help("Counter for total received blocks") + .Register(prometheusRegistry()) + .Add({})}; + metric.Increment(); + events_->signalBlockFromPubSub( events::BlockFromPubSub{from, CbCid::hash(cbor), std::move(bm)}); @@ -129,6 +153,13 @@ namespace fc::sync { } else { auto res = codec::cbor::decode(raw); if (res) { + static auto &metric{prometheus::BuildCounter() + .Name("lotus_message_received") + .Help("Counter for total received messages") + .Register(prometheusRegistry()) + .Add({})}; + metric.Increment(); + events_->signalMessageFromPubSub(events::MessageFromPubSub{ from, std::move(cid_res.value()), diff --git a/core/vm/interpreter/impl/interpreter_impl.cpp b/core/vm/interpreter/impl/interpreter_impl.cpp index 15372aa3c5..f494e4d909 100644 --- a/core/vm/interpreter/impl/interpreter_impl.cpp +++ b/core/vm/interpreter/impl/interpreter_impl.cpp @@ -7,6 +7,8 @@ #include +#include "common/prometheus/metrics.hpp" +#include "common/prometheus/since.hpp" #include "const.hpp" #include "primitives/tipset/load.hpp" #include "vm/actor/builtin/v0/cron/cron_actor.hpp" @@ -70,6 +72,61 @@ namespace fc::vm::interpreter { std::vector *all_receipts) const { const auto &ipld{env_context_.ipld}; + static auto &metricFailure{ + prometheus::BuildCounter() + .Name("lotus_block_failure") + .Help("Counter for block validation failures") + .Register(prometheusRegistry()) + .Add({})}; + static auto &metricSuccess{ + prometheus::BuildCounter() + .Name("lotus_block_success") + .Help("Counter for block validation successes") + .Register(prometheusRegistry()) + .Add({})}; + static auto &metricTotal{prometheus::BuildHistogram() + .Name("lotus_vm_applyblocks_total_ms") + .Help("Time spent applying block state") + .Register(prometheusRegistry()) + .Add({}, kDefaultPrometheusMsBuckets)}; + static auto &metricMessages{prometheus::BuildHistogram() + .Name("lotus_vm_applyblocks_messages") + .Help("Time spent applying block messages") + .Register(prometheusRegistry()) + .Add({}, kDefaultPrometheusMsBuckets)}; + static auto &metricEarly{ + prometheus::BuildHistogram() + .Name("lotus_vm_applyblocks_early") + .Help("Time spent in early apply-blocks (null cron, upgrades)") + .Register(prometheusRegistry()) + .Add({}, kDefaultPrometheusMsBuckets)}; + static auto &metricCron{prometheus::BuildHistogram() + .Name("lotus_vm_applyblocks_cron") + .Help("Time spent in cron") + .Register(prometheusRegistry()) + .Add({}, kDefaultPrometheusMsBuckets)}; + static auto &metricFlush{prometheus::BuildHistogram() + .Name("lotus_vm_applyblocks_flush") + .Help("Time spent flushing vm state") + .Register(prometheusRegistry()) + .Add({}, kDefaultPrometheusMsBuckets)}; + + bool success{false}; + const Since since; + std::pair<::prometheus::Histogram *, Since> last_step; + auto nextStep{[&](auto metric) { + if (last_step.first) { + last_step.first->Observe(last_step.second.ms()); + } + last_step = std::make_pair(metric, Since{}); + }}; + auto BOOST_OUTCOME_TRY_UNIQUE_NAME{gsl::finally([&] { + metricTotal.Observe(since.ms()); + nextStep(nullptr); + (success ? metricSuccess : metricFailure).Increment(); + })}; + nextStep(&metricEarly); + auto on_receipt{[&](auto &receipt) { if (all_receipts) { all_receipts->push_back(receipt); @@ -111,6 +168,8 @@ namespace fc::vm::interpreter { env->setHeight(tipset->height()); } + nextStep(&metricMessages); + adt::Array receipts{ipld}; MessageVisitor message_visitor{ipld, true, true}; for (const auto &block : tipset->blks) { @@ -147,8 +206,12 @@ namespace fc::vm::interpreter { on_receipt(receipt); } + nextStep(&metricCron); + OUTCOME_TRY(cron()); + nextStep(&metricFlush); + OUTCOME_TRY(new_state_root, env->state_tree->flush()); OUTCOME_TRY(env->ipld->flush(new_state_root)); @@ -156,6 +219,8 @@ namespace fc::vm::interpreter { OUTCOME_TRY(weight, getWeight(tipset)); + success = true; + return Result{new_state_root, receipts.amt.cid(), std::move(weight)}; } diff --git a/core/vm/runtime/CMakeLists.txt b/core/vm/runtime/CMakeLists.txt index 6dc10ac61f..b6203a8b32 100644 --- a/core/vm/runtime/CMakeLists.txt +++ b/core/vm/runtime/CMakeLists.txt @@ -20,6 +20,7 @@ target_link_libraries(runtime ipfs_datastore_error keystore message + prometheus proofs tipset signature diff --git a/core/vm/runtime/impl/env.cpp b/core/vm/runtime/impl/env.cpp index 96221f07c2..36831e64f3 100644 --- a/core/vm/runtime/impl/env.cpp +++ b/core/vm/runtime/impl/env.cpp @@ -7,6 +7,8 @@ #include "cbor_blake/cid.hpp" #include "codec/cbor/light_reader/cid.hpp" +#include "common/prometheus/metrics.hpp" +#include "common/prometheus/since.hpp" #include "vm/actor/builtin/v0/miner/miner_actor.hpp" #include "vm/actor/cgo/actors.hpp" #include "vm/exit_code/exit_code.hpp" @@ -26,11 +28,34 @@ namespace fc::vm::runtime { using toolchain::Toolchain; using version::getNetworkVersion; + auto &metricVmApplyCount() { + static auto &x{prometheus::BuildCounter() + .Name("lotus_vm_applied") + .Help("Counter for messages (including internal " + "messages) processed by the VM") + .Register(prometheusRegistry()) + .Add({})}; + return x; + } + IpldBuffered::IpldBuffered(IpldPtr ipld) : ipld{std::move(ipld)} {} outcome::result IpldBuffered::flush(const CID &root) { assert(!flushed); flushed = true; + + static auto &metricTime{prometheus::BuildCounter() + .Name("lotus_vm_flush_copy_ms") + .Help("Time spent in VM Flush Copy") + .Register(prometheusRegistry()) + .Add({})}; + static auto &metricCount{prometheus::BuildCounter() + .Name("lotus_vm_flush_copy_count") + .Help("Number of copied objects") + .Register(prometheusRegistry()) + .Add({})}; + const Since since; + assert(isCbor(root)); auto _root{*asBlake(root)}; assert(write.count(_root)); @@ -56,6 +81,10 @@ namespace fc::vm::runtime { OUTCOME_TRY(ipld->set(CID{key}, std::move(write.at(key)))); } write.clear(); + + metricTime.Increment(since.ms()); + metricCount.Increment(queue.size()); + return outcome::success(); } @@ -103,6 +132,9 @@ namespace fc::vm::runtime { // NOLINTNEXTLINE(readability-function-cognitive-complexity) outcome::result Env::applyMessage(const UnsignedMessage &message, size_t size) { + auto BOOST_OUTCOME_TRY_UNIQUE_NAME{ + gsl::finally([] { metricVmApplyCount().Increment(); })}; + TokenAmount locked; auto add_locked{ [&](auto &address, const TokenAmount &add) -> outcome::result { @@ -229,6 +261,9 @@ namespace fc::vm::runtime { outcome::result Env::applyImplicitMessage( const UnsignedMessage &message) { + auto BOOST_OUTCOME_TRY_UNIQUE_NAME{ + gsl::finally([] { metricVmApplyCount().Increment(); })}; + auto execution = Execution::make(shared_from_this(), message); auto result = execution->send(message); MessageReceipt receipt; @@ -316,6 +351,14 @@ namespace fc::vm::runtime { dvm::onSend(message); DVM_INDENT; + static auto &metric{prometheus::BuildCounter() + .Name("lotus_vm_sends") + .Help("Counter for sends processed by the VM") + .Register(prometheusRegistry()) + .Add({})}; + auto BOOST_OUTCOME_TRY_UNIQUE_NAME{ + gsl::finally([] { metric.Increment(); })}; + const auto network_version = getNetworkVersion(env->epoch); OUTCOME_TRY(catchAbort(chargeGas(charge), network_version)); Actor to_actor;