diff --git a/src/v/config/BUILD b/src/v/config/BUILD index 3021fb7594c93..73c16681e54b4 100644 --- a/src/v/config/BUILD +++ b/src/v/config/BUILD @@ -8,6 +8,7 @@ redpanda_cc_library( "client_group_byte_rate_quota.cc", "configuration.cc", "node_config.cc", + "node_overrides.cc", "rest_authn_endpoint.cc", "rjson_serialization.cc", "throughput_control_group.cc", @@ -29,6 +30,7 @@ redpanda_cc_library( "fwd.h", "mock_property.h", "node_config.h", + "node_overrides.h", "property.h", "rest_authn_endpoint.h", "rjson_serialization.h", diff --git a/src/v/config/CMakeLists.txt b/src/v/config/CMakeLists.txt index ef59eb05c3032..0fd77254a1983 100644 --- a/src/v/config/CMakeLists.txt +++ b/src/v/config/CMakeLists.txt @@ -11,6 +11,7 @@ v_cc_library( validators.cc throughput_control_group.cc tls_config.cc + node_overrides.cc DEPS v::json v::model diff --git a/src/v/config/convert.h b/src/v/config/convert.h index fa69a28851221..231f79af7c6cd 100644 --- a/src/v/config/convert.h +++ b/src/v/config/convert.h @@ -633,4 +633,26 @@ struct convert { } }; +template<> +struct convert { + using type = model::node_uuid; + static Node encode(const type& rhs) { + return Node(ssx::sformat("{}", rhs)); + } + static bool decode(const Node& node, type& rhs) { + auto value = node.as(); + auto out = [&value]() -> std::optional { + try { + return model::node_uuid(uuid_t::from_string(value)); + } catch (const std::runtime_error& e) { + return std::nullopt; + } + }(); + if (out.has_value()) { + rhs = out.value(); + } + return out.has_value(); + } +}; + } // namespace YAML diff --git a/src/v/config/node_config.cc b/src/v/config/node_config.cc index 4fc763f5c3d2a..e9221fbbf45ee 100644 --- a/src/v/config/node_config.cc +++ b/src/v/config/node_config.cc @@ -234,6 +234,15 @@ node_config::node_config() noexcept "Path to the directory that contains the OpenSSL FIPS-compliant module.", {.visibility = visibility::user}, std::nullopt) + , node_id_overrides( + *this, + "node_id_overrides", + "List of node ID and UUID overrides to be applied at broker startup. " + "Each entry includes the current UUID and desired ID and UUID. Each " + "entry applies to a given node if and only if 'current' matches that " + "node's current UUID.", + {.visibility = visibility::user}, + {}) , _advertised_rpc_api( *this, "advertised_rpc_api", diff --git a/src/v/config/node_config.h b/src/v/config/node_config.h index 121d81df340fe..ee2621f03b889 100644 --- a/src/v/config/node_config.h +++ b/src/v/config/node_config.h @@ -14,9 +14,11 @@ #include "config/broker_endpoint.h" #include "config/convert.h" #include "config/data_directory_path.h" +#include "config/node_overrides.h" #include "config/property.h" #include "config/seed_server.h" #include "config_store.h" +#include "model/fundamental.h" #include #include @@ -99,6 +101,8 @@ struct node_config final : public config_store { // Path to the directory that holds the OpenSSL FIPS module property> openssl_module_directory; + property> node_id_overrides; + // build pidfile path: `/pid.lock` std::filesystem::path pidfile_path() const { return data_directory().path / "pid.lock"; diff --git a/src/v/config/node_overrides.cc b/src/v/config/node_overrides.cc new file mode 100644 index 0000000000000..5bfc7faa2f646 --- /dev/null +++ b/src/v/config/node_overrides.cc @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ + +#include "config/node_overrides.h" + +#include "re2/re2.h" + +#include +#include + +namespace config { + +std::ostream& operator<<(std::ostream& os, const config::node_id_override& v) { + return os << ssx::sformat("{}:{}:{}", v.key, v.uuid, v.id); +} + +std::istream& operator>>(std::istream& is, config::node_id_override& v) { + std::string s; + is >> s; + + const static re2::RE2 pattern{R"(^([^:]+):([^:]+):([^:]+)$)"}; + vassert(pattern.ok(), "Regex compilation failed"); + + std::string curr, uuid, id; + + if (!re2::RE2::FullMatch(s, pattern, &curr, &uuid, &id)) { + throw std::runtime_error(fmt::format( + R"(Formatting error: '{}', must be of form '::')", + s)); + } + + v.key = boost::lexical_cast(curr); + v.uuid = boost::lexical_cast(uuid); + v.id = boost::lexical_cast(id); + + return is; +} + +void node_override_store::maybe_set_overrides( + model::node_uuid node_uuid, + const std::vector& overrides) { + vassert(ss::this_shard_id() == 0, "Only set overrides on shard 0"); + for (const auto& o : overrides) { + if (o.key == node_uuid) { + if (_uuid_override.has_value() || _id_override.has_value()) { + throw std::runtime_error( + "Invalid node ID override: Limit one override per broker"); + break; + } + _uuid_override.emplace(o.uuid); + _id_override.emplace(o.id); + } + } +} + +const std::optional& +node_override_store::node_uuid() const noexcept { + vassert(ss::this_shard_id() == 0, "Only get overrides on shard 0"); + return _uuid_override; +} + +const std::optional& +node_override_store::node_id() const noexcept { + vassert(ss::this_shard_id() == 0, "Only get overrides on shard 0"); + return _id_override; +} + +} // namespace config diff --git a/src/v/config/node_overrides.h b/src/v/config/node_overrides.h new file mode 100644 index 0000000000000..393b4e61fb390 --- /dev/null +++ b/src/v/config/node_overrides.h @@ -0,0 +1,111 @@ +/* + * Copyright 2024 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ + +#pragma once + +#include "config/convert.h" +#include "model/fundamental.h" +#include "ssx/sformat.h" + +#include + +#include +#include + +namespace config { + +/** + * In-memory representation of a Node ID/UUID override candidate. + * + * Semantics surrounding when and how to apply an override + * are not represented here and should be carefully considered at the + * point of usage. + */ +struct node_id_override { + node_id_override() = default; + node_id_override( + model::node_uuid key, + model::node_uuid uuid_value, + model::node_id id_value) + : key(key) + , uuid(uuid_value) + , id(id_value) {} + model::node_uuid key{}; + model::node_uuid uuid{}; + model::node_id id{}; + +private: + friend std::ostream& + operator<<(std::ostream& os, const node_id_override& v); + friend std::istream& operator>>(std::istream& is, node_id_override& v); + friend bool operator==(const node_id_override&, const node_id_override&) + = default; +}; + +/** + * Thin data structure to encapsulate 'node_id_override' filtering and + * short-term storage. + * + * Given an array of 'node_id_override's, select the appropriate override as: + * + * For some instance 'node_id_override O', 'O.uuid' and 'O.id' should + * be applied to some node if and only if its _current_ UUID matches + * 'O.key'. + */ +struct node_override_store { + node_override_store() = default; + + /** + * From the provided vector, accept the override (if one exists) whose + * 'key' matches 'node_uuid'. + * + * Throws if multiple overrides match 'node_uuid' + */ + void maybe_set_overrides( + model::node_uuid node_uuid, + const std::vector& overrides); + + const std::optional& node_uuid() const noexcept; + const std::optional& node_id() const noexcept; + +private: + std::optional _uuid_override; + std::optional _id_override; +}; + +} // namespace config + +namespace YAML { + +template<> +struct convert { + using type = config::node_id_override; + static inline Node encode(const type& rhs) { + Node node; + node["current_uuid"] = ssx::sformat("{}", rhs.key); + node["new_uuid"] = ssx::sformat("{}", rhs.uuid); + node["new_id"] = ssx::sformat("{}", rhs.id); + + return node; + } + + static inline bool decode(const Node& node, type& rhs) { + if (!node["current_uuid"] || !node["new_uuid"] || !node["new_id"]) { + return false; + } + rhs.key = node["current_uuid"].as(); + rhs.uuid = node["new_uuid"].as(); + rhs.id = node["new_id"].as(); + return true; + } +}; + +} // namespace YAML diff --git a/src/v/config/property.h b/src/v/config/property.h index 5b494f6f8f087..e0a0f79b00eca 100644 --- a/src/v/config/property.h +++ b/src/v/config/property.h @@ -653,6 +653,10 @@ consteval std::string_view property_type_name() { return "string"; } else if constexpr (std::is_same_v) { return "string"; + } else if constexpr (std::is_same_v) { + return "string"; + } else if constexpr (std::is_same_v) { + return "node_id_override"; } else { static_assert( base::unsupported_type::value, "Type name not defined"); @@ -927,5 +931,4 @@ class retention_duration_property final } } }; - }; // namespace config diff --git a/src/v/config/rjson_serialization.cc b/src/v/config/rjson_serialization.cc index fc1ce9b4c52af..148f42fc90c95 100644 --- a/src/v/config/rjson_serialization.cc +++ b/src/v/config/rjson_serialization.cc @@ -222,4 +222,33 @@ void rjson_serialize( stringize(w, v); } +void rjson_serialize( + json::Writer& w, const model::node_uuid& v) { + stringize(w, v); +} + +void rjson_serialize( + json::Writer& w, const config::node_id_override& v) { + w.StartObject(); + + w.Key("current_uuid"); + stringize(w, v.key); + w.Key("new_uuid"); + stringize(w, v.uuid); + w.Key("new_id"); + stringize(w, v.id); + + w.EndObject(); +} + +void rjson_serialize( + json::Writer& w, + const std::vector& v) { + w.StartArray(); + for (const auto& e : v) { + rjson_serialize(w, e); + } + w.EndArray(); +} + } // namespace json diff --git a/src/v/config/rjson_serialization.h b/src/v/config/rjson_serialization.h index b02e823f3c6ef..40e65dcfb9198 100644 --- a/src/v/config/rjson_serialization.h +++ b/src/v/config/rjson_serialization.h @@ -14,6 +14,7 @@ #include "base/seastarx.h" #include "config/data_directory_path.h" #include "config/endpoint_tls_config.h" +#include "config/node_overrides.h" #include "config/seed_server.h" #include "config/tls_config.h" #include "config/types.h" @@ -122,4 +123,14 @@ void rjson_serialize( void rjson_serialize( json::Writer&, const config::tls_version& v); +void rjson_serialize( + json::Writer&, const model::node_uuid&); + +void rjson_serialize( + json::Writer&, const config::node_id_override&); + +void rjson_serialize( + json::Writer&, + const std::vector&); + } // namespace json diff --git a/src/v/config/tests/CMakeLists.txt b/src/v/config/tests/CMakeLists.txt index 7155be68b0e47..d20e62cb2ba84 100644 --- a/src/v/config/tests/CMakeLists.txt +++ b/src/v/config/tests/CMakeLists.txt @@ -10,7 +10,8 @@ set(srcs seed_server_property_test.cc cloud_credentials_source_test.cc validator_tests.cc - throughput_control_group_test.cc) + throughput_control_group_test.cc + node_override_test.cc) rp_test( UNIT_TEST diff --git a/src/v/config/tests/node_override_test.cc b/src/v/config/tests/node_override_test.cc new file mode 100644 index 0000000000000..d676e23714d42 --- /dev/null +++ b/src/v/config/tests/node_override_test.cc @@ -0,0 +1,187 @@ +/* + * Copyright 2024 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ + +#include "config/configuration.h" +#include "config/node_overrides.h" +#include "model/fundamental.h" +#include "utils/uuid.h" + +#include +#include + +#include +#include + +std::vector +read_from_yaml(const ss::sstring& yaml_string) { + auto node = YAML::Load(yaml_string); + return node["node_id_overrides"] + .as>(); +} + +SEASTAR_THREAD_TEST_CASE(test_overrides_decode_empty) { + auto empty_uuid = "node_id_overrides: []\n"; + auto empty_uuid_cfg = read_from_yaml(empty_uuid); + BOOST_CHECK(empty_uuid_cfg.empty()); +} + +SEASTAR_THREAD_TEST_CASE(test_overrides_decode) { + static model::node_uuid node_uuid{uuid_t::create()}; + static model::node_uuid other_uuid{uuid_t::create()}; + auto node_id = model::node_id{0}; + static auto some_override = ssx::sformat( + "node_id_overrides:\n" + " - current_uuid: {}\n" + " new_uuid: {}\n" + " new_id: {}\n" + " - current_uuid: {}\n" + " new_uuid: {}\n" + " new_id: {}\n", + node_uuid, + other_uuid, + node_id, + node_uuid, + other_uuid, + node_id); + auto some_override_cfg = read_from_yaml(some_override); + BOOST_CHECK_EQUAL(some_override_cfg.size(), 2); + for (const auto& u : some_override_cfg) { + BOOST_CHECK_EQUAL(u.key, node_uuid); + BOOST_CHECK_EQUAL(u.uuid, other_uuid); + BOOST_CHECK_EQUAL(u.id, node_id); + } +} + +SEASTAR_THREAD_TEST_CASE(test_overrides_decode_errors) { + static constexpr std::string_view entry_fmt = "node_id_overrides:\n" + " - current_uuid: {}\n" + " new_uuid: {}\n" + " new_id: {}\n"; + + BOOST_CHECK_THROW( + read_from_yaml(fmt::format( + entry_fmt, + model::node_uuid{uuid_t::create()}, + 23 /* does not parse to uuid */, + model::node_id{0})), + YAML::TypedBadConversion); + + BOOST_CHECK_THROW( + read_from_yaml(fmt::format( + entry_fmt, + model::node_uuid{uuid_t::create()}, + model::node_uuid{uuid_t::create()}, + model::node_uuid{uuid_t::create()} /* does not parse to node ID */)), + YAML::TypedBadConversion); + + BOOST_CHECK_THROW( + read_from_yaml(fmt::format( + "node_id_overrides:\n" + " - current_uuid: {}\n" + " new_uuid: {}\n" /* missing new_id field */, + model::node_uuid{uuid_t::create()}, + model::node_uuid{uuid_t::create()})), + YAML::TypedBadConversion); + + BOOST_CHECK_THROW( + read_from_yaml(fmt::format( + "node_id_overrides:\n" + " - current_uuid: {}\n" + " new_id: {}\n" /* missing new_uuid field */, + model::node_uuid{uuid_t::create()}, + model::node_id{0})), + YAML::TypedBadConversion); +} + +SEASTAR_THREAD_TEST_CASE(test_overrides_lexical_cast) { + model::node_uuid uuid_a{uuid_t::create()}; + model::node_uuid uuid_b{uuid_t::create()}; + model::node_id id{0}; + + auto option = ssx::sformat("{}:{}:{}", uuid_a, uuid_b, id); + + config::node_id_override ovr; + auto convert = [&ovr](std::string_view s) { + ovr = boost::lexical_cast(s); + }; + + BOOST_REQUIRE_NO_THROW(std::invoke(convert, option)); + BOOST_CHECK_EQUAL(ovr.key, uuid_a); + BOOST_CHECK_EQUAL(ovr.uuid, uuid_b); + BOOST_CHECK_EQUAL(ovr.id, id); + std::stringstream ss; + ss << ovr; + BOOST_CHECK_EQUAL(ss.str(), option); + + option = ssx::sformat("{}", uuid_a); + BOOST_CHECK_THROW(std::invoke(convert, option), std::runtime_error); + option = ssx::sformat("{}:", uuid_a); + BOOST_CHECK_THROW(std::invoke(convert, option), std::runtime_error); + option = ssx::sformat("{}:{}", uuid_a, uuid_b); + BOOST_CHECK_THROW(std::invoke(convert, option), std::runtime_error); + option = ssx::sformat("{}:{}:", uuid_a, uuid_b); + BOOST_CHECK_THROW(std::invoke(convert, option), std::runtime_error); + option = ssx::sformat("{}:{}:{}", "not-a-uuid", uuid_b, id); + BOOST_CHECK_THROW(std::invoke(convert, option), boost::bad_lexical_cast); + option = ssx::sformat("{}:{}:{}", uuid_a, uuid_b, "not-an-id"); + BOOST_CHECK_THROW(std::invoke(convert, option), boost::bad_lexical_cast); +} + +SEASTAR_THREAD_TEST_CASE(test_overrides_store) { + static model::node_uuid some_uuid{uuid_t::create()}; + static model::node_uuid other_uuid{uuid_t::create()}; + static constexpr model::node_id some_id{23}; + static constexpr model::node_id other_id{0}; + + std::vector ovr_vec{ + config::node_id_override{some_uuid, some_uuid, some_id}, + config::node_id_override{other_uuid, other_uuid, other_id}, + }; + + // Picks up the specified override based on key + for (const auto& o : ovr_vec) { + config::node_override_store store; + store.maybe_set_overrides(o.key, ovr_vec); + BOOST_CHECK_EQUAL(store.node_uuid(), o.uuid); + BOOST_CHECK_EQUAL(store.node_id(), o.id); + } + + // Ignores anything that doesn't match curr_uuid + { + config::node_override_store store; + store.maybe_set_overrides(model::node_uuid{uuid_t::create()}, ovr_vec); + BOOST_CHECK(!store.node_uuid().has_value()); + BOOST_CHECK(!store.node_id().has_value()); + } + + // It's an error to set multiple overrides for a single target UUID + { + config::node_override_store store; + std::vector ovr_vec{ + config::node_id_override{some_uuid, some_uuid, some_id}, + config::node_id_override{some_uuid, other_uuid, other_id}, + }; + + BOOST_CHECK_THROW( + store.maybe_set_overrides(some_uuid, ovr_vec), std::runtime_error); + } + + // It's also an error to set the same override twice + { + config::node_override_store store; + std::vector ovr_vec{ + config::node_id_override{some_uuid, some_uuid, some_id}}; + + BOOST_CHECK_NO_THROW(store.maybe_set_overrides(some_uuid, ovr_vec)); + BOOST_CHECK_THROW( + store.maybe_set_overrides(some_uuid, ovr_vec), std::runtime_error); + } +} diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index 288ecc4cbc1a1..9c3d63a8b88dc 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -414,6 +414,11 @@ int application::run(int ac, char** av) { "redpanda-cfg", po::value(), ".yaml file config for redpanda"); + app.add_options()( + "node-id-overrides", + po::value>()->multitoken(), + "Override node UUID and ID iff current UUID matches " + "- usage: ::"); // Validate command line args using options registered by the app and // seastar. Keep the resulting variables in a temporary map so they don't @@ -433,6 +438,14 @@ int application::run(int ac, char** av) { std::cout << redpanda_version() << std::endl; return 0; } + + if (!vm["node-id-overrides"].empty()) { + fmt::print( + std::cout, + "Node ID overrides: {}", + vm["node-id-overrides"] + .as>()); + } } // use endl for explicit flushing std::cout << community_msg << std::endl; @@ -852,6 +865,15 @@ void application::hydrate_config(const po::variables_map& cfg) { config::node().rack.set_value(std::nullopt); } + // load ID overrides + if (!cfg["node-id-overrides"].empty()) { + ss::smp::invoke_on_all([&cfg] { + config::node().node_id_overrides.set_value( + cfg["node-id-overrides"] + .as>()); + }).get(); + } + // This includes loading from local bootstrap file or legacy // config file on first-start or upgrade cases. _config_preload = cluster::config_manager::preload(config).get0(); @@ -2567,6 +2589,25 @@ void application::start_bootstrap_services() { serde::to_iobuf(node_uuid)) .get(); } + + _node_overrides.maybe_set_overrides( + node_uuid, config::node().node_id_overrides()); + + // Apply UUID override to node config if present + if (auto u = _node_overrides.node_uuid(); u.has_value()) { + vlog( + _log.warn, + "Overriding UUID for node: {} -> {}", + node_uuid, + u.value()); + node_uuid = u.value(); + kvs + .put( + storage::kvstore::key_space::controller, + node_uuid_key, + serde::to_iobuf(node_uuid)) + .get(); + } storage .invoke_on_all([node_uuid](storage::api& storage) mutable { storage.set_node_uuid(node_uuid); @@ -2622,6 +2663,18 @@ void application::wire_up_and_start(::stop_signal& app_signal, bool test_mode) { "Running with already-established node ID {}", config::node().node_id()); node_id = config::node().node_id().value(); + } else if (auto id = _node_overrides.node_id(); id.has_value()) { + vlog( + _log.warn, + "Overriding node ID: {} -> {}", + config::node().node_id(), + id); + node_id = id.value(); + // null out the config'ed ID indiscriminately; it will be set outside + // the conditional + ss::smp::invoke_on_all([] { + config::node().node_id.set_value(std::nullopt); + }).get(); } else { auto registration_result = cd.register_with_cluster().get(); node_id = registration_result.assigned_node_id; diff --git a/src/v/redpanda/application.h b/src/v/redpanda/application.h index 17b24f7cb953c..73905176d0660 100644 --- a/src/v/redpanda/application.h +++ b/src/v/redpanda/application.h @@ -338,6 +338,8 @@ class application { // instantiated only in recovery mode std::unique_ptr _tx_manager_migrator; + config::node_override_store _node_overrides{}; + ss::sharded _as; }; diff --git a/src/v/utils/uuid.cc b/src/v/utils/uuid.cc index ac652cf6cf13a..d4cb3e46e4dca 100644 --- a/src/v/utils/uuid.cc +++ b/src/v/utils/uuid.cc @@ -17,7 +17,8 @@ #include #include -#include +#include +#include uuid_t::uuid_t(const std::vector& v) { if (v.size() != length) { @@ -42,4 +43,15 @@ std::ostream& operator<<(std::ostream& os, const uuid_t& u) { return os << fmt::format("{}", u._uuid); } +std::istream& operator>>(std::istream& is, uuid_t& u) { + std::string s; + is >> s; + try { + u = uuid_t::from_string(s); + } catch (const std::runtime_error&) { + is.setstate(std::ios::failbit); + } + return is; +} + uuid_t::operator ss::sstring() const { return fmt::format("{}", _uuid); } diff --git a/src/v/utils/uuid.h b/src/v/utils/uuid.h index d5c6e2f199637..5dade5c808e05 100644 --- a/src/v/utils/uuid.h +++ b/src/v/utils/uuid.h @@ -41,6 +41,7 @@ class uuid_t { } friend std::ostream& operator<<(std::ostream& os, const uuid_t& u); + friend std::istream& operator>>(std::istream& is, uuid_t& u); friend bool operator==(const uuid_t& u, const uuid_t& v) = default; operator ss::sstring() const; diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py index ab43ce2155b6c..c2ce672591ed3 100644 --- a/tests/rptest/services/redpanda.py +++ b/tests/rptest/services/redpanda.py @@ -1395,7 +1395,8 @@ def restart_nodes(self, start_timeout=None, stop_timeout=None, auto_assign_node_id=False, - omit_seeds_on_idx_one=True): + omit_seeds_on_idx_one=True, + extra_cli: list[str] = []): nodes = [nodes] if isinstance(nodes, ClusterNode) else nodes with concurrent.futures.ThreadPoolExecutor( @@ -1412,7 +1413,8 @@ def restart_nodes(self, override_cfg_params=override_cfg_params, timeout=start_timeout, auto_assign_node_id=auto_assign_node_id, - omit_seeds_on_idx_one=omit_seeds_on_idx_one), nodes)) + omit_seeds_on_idx_one=omit_seeds_on_idx_one, + extra_cli=extra_cli), nodes)) def set_extra_rp_conf(self, conf): self._extra_rp_conf = conf @@ -2905,7 +2907,7 @@ def write_tls_certs(self): self._audit_log_config.truststore_file = RedpandaService.TLS_CA_CRT_FILE self._audit_log_config.crl_file = RedpandaService.TLS_CA_CRL_FILE - def start_redpanda(self, node): + def start_redpanda(self, node, extra_cli: list[str] = []): preamble, res_args = self._resource_settings.to_cli( dedicated_node=self._dedicated_nodes) @@ -2927,6 +2929,7 @@ def start_redpanda(self, node): " --abort-on-seastar-bad-alloc " " --dump-memory-diagnostics-on-alloc-failure-kind=all " f" {res_args} " + f" {' '.join(extra_cli)}" f" >> {RedpandaService.STDOUT_STDERR_CAPTURE} 2>&1 &") node.account.ssh(cmd) @@ -3063,7 +3066,8 @@ def start_node(self, auto_assign_node_id: bool = False, omit_seeds_on_idx_one: bool = True, skip_readiness_check: bool = False, - node_id_override: int | None = None): + node_id_override: int | None = None, + extra_cli: list[str] = []): """ Start a single instance of redpanda. This function will not return until redpanda appears to have started successfully. If redpanda does not @@ -3099,7 +3103,7 @@ def start_node(self, ) def start_rp(): - self.start_redpanda(node) + self.start_redpanda(node, extra_cli=extra_cli) if expect_fail: wait_until( diff --git a/tests/rptest/tests/admin_uuid_operations_test.py b/tests/rptest/tests/admin_uuid_operations_test.py index 7f2e363877c56..80261467b976e 100644 --- a/tests/rptest/tests/admin_uuid_operations_test.py +++ b/tests/rptest/tests/admin_uuid_operations_test.py @@ -7,6 +7,7 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0 +import json import random import requests from enum import IntEnum @@ -16,14 +17,27 @@ from rptest.tests.redpanda_test import RedpandaTest from rptest.services.admin import Admin from rptest.services.cluster import cluster +from rptest.util import expect_exception from ducktape.cluster.cluster import ClusterNode +from ducktape.errors import TimeoutError +from ducktape.mark import parametrize -from rptest.util import wait_until_result +from rptest.services.utils import LogSearchLocal +from rptest.util import wait_until_result, wait_until + + +class TestMode(IntEnum): + CFG_OVERRIDE = 1 + NO_OVERRIDE = 2 + CLI_OVERRIDE = 3 class AdminUUIDOperationsTest(RedpandaTest): def __init__(self, ctx): super().__init__(test_context=ctx, num_brokers=3) + self.admin = Admin(self.redpanda) + self.log_searcher = LogSearchLocal(ctx, [], self.redpanda.logger, + self.redpanda.STDOUT_STDERR_CAPTURE) def setUp(self): self.redpanda.start(auto_assign_node_id=True, @@ -33,7 +47,7 @@ def setUp(self): @cluster(num_nodes=3) def test_getting_node_id_to_uuid_map(self): admin = Admin(self.redpanda) - uuids = admin.get_broker_uuids() + uuids = self.admin.get_broker_uuids() assert len(uuids) == 3, "UUID map should contain 3 brokers" all_ids = set() for n in uuids: @@ -41,13 +55,19 @@ def test_getting_node_id_to_uuid_map(self): assert 'uuid' in n all_ids.add(n['node_id']) - brokers = admin.get_brokers() + brokers = self.admin.get_brokers() for b in brokers: assert b['node_id'] in all_ids + def _uuids_updated(self, nodes_n=4): + uuids = self.admin.get_broker_uuids() + if len(uuids) != nodes_n: + return False, None + + return True, uuids + @cluster(num_nodes=3) def test_overriding_node_id(self): - admin = Admin(self.redpanda) to_stop = self.redpanda.nodes[0] initial_to_stop_id = self.redpanda.node_id(to_stop) # Stop node and clear its data directory @@ -60,20 +80,13 @@ def test_overriding_node_id(self): auto_assign_node_id=True, omit_seeds_on_idx_one=False) - def _uuids_updated(): - uuids = admin.get_broker_uuids() - if len(uuids) != 4: - return False, None - - return True, uuids - # wait for the node to join with new ID uuids = wait_until_result( - _uuids_updated, + lambda: self._uuids_updated(), timeout_sec=30, err_msg="Node was unable to join the cluster") - uuids = admin.get_broker_uuids() + uuids = self.admin.get_broker_uuids() old_uuid = None for n in uuids: @@ -82,18 +95,292 @@ def _uuids_updated(): old_uuid = n['uuid'] # get current node id and UUID - current = admin.get_broker_uuid(to_stop) + current = self.admin.get_broker_uuid(to_stop) - admin.override_node_id(to_stop, - current_uuid=current['node_uuid'], - new_node_id=initial_to_stop_id, - new_node_uuid=old_uuid) + self.admin.override_node_id(to_stop, + current_uuid=current['node_uuid'], + new_node_id=initial_to_stop_id, + new_node_uuid=old_uuid) self.redpanda.restart_nodes(to_stop, auto_assign_node_id=True, omit_seeds_on_idx_one=False) - after_restart = admin.get_broker_uuid(to_stop) + after_restart = self.admin.get_broker_uuid(to_stop) assert after_restart['node_id'] == initial_to_stop_id assert after_restart['node_uuid'] == old_uuid + + def scrape_uuid(self, node: ClusterNode) -> str | None: + UUID_LOG = "'Generated new UUID for node'" + lines = [ + s.strip() for s in self.log_searcher._capture_log(node, UUID_LOG) + ] + if len(lines) < 1: + return None + self.logger.info(f"UUID Lines: {json.dumps(lines, indent=1)}") + assert len(lines) == 1, f"Too many: {json.dumps(lines, indent=1)}" + return lines[0].split(":")[-1].strip() + + def _restart_node(self, + node: ClusterNode, + overrides: dict | None = None, + extra_cli: list[str] = [], + drop_disk: bool = False): + self.redpanda.stop_node(node) + if drop_disk: + self.redpanda.clean_node(node, + preserve_current_install=True, + preserve_logs=False) + + self.redpanda.start_node( + node, + auto_assign_node_id=True, + omit_seeds_on_idx_one=False, + override_cfg_params=overrides, + extra_cli=extra_cli, + ) + + def _decommission(self, node_id, node=None): + def decommissioned(): + try: + + results = [] + for n in self.redpanda.nodes: + if self.redpanda.node_id(n) == node_id: + continue + + brokers = self.admin.get_brokers(node=n) + for b in brokers: + if b['node_id'] == node_id: + results.append(b['membership_status'] != 'active') + + if all(results): + return True + + self.admin.decommission_broker(node_id, node=node) + return False + except requests.exceptions.RetryError: + return False + except requests.exceptions.ConnectionError: + return False + except requests.exceptions.HTTPError: + return False + + wait_until(decommissioned, 30, 1) + + def wait_until_cluster_healthy(self, timeout_sec=30): + wait_until(lambda: self.redpanda.healthy(), + timeout_sec=timeout_sec, + backoff_sec=1) + # Wait for the cluster to agree on a controller leader. + return self.redpanda.get_node_by_id( + self.admin.await_stable_leader( + topic="controller", + partition=0, + namespace="redpanda", + hosts=[n.account.hostname for n in self.redpanda._started], + timeout_s=timeout_sec, + backoff_s=1)) + + @cluster(num_nodes=3) + @parametrize(mode=TestMode.CFG_OVERRIDE) + @parametrize(mode=TestMode.NO_OVERRIDE) + @parametrize(mode=TestMode.CLI_OVERRIDE) + def test_force_uuid_override(self, mode): + to_stop = self.redpanda.nodes[0] + initial_to_stop_id = self.redpanda.node_id(to_stop) + + self._restart_node(to_stop, drop_disk=True) + + # wait for the node to join with new ID + uuids = wait_until_result( + lambda: self._uuids_updated(), + timeout_sec=30, + backoff_sec=2, + err_msg="Node was unable to join the cluster") + + old_uuid = None + for n in uuids: + id = n['node_id'] + if id == initial_to_stop_id: + old_uuid = n['uuid'] + + assert old_uuid is not None, "Old uuid unexpectedly None" + + ghost_node_id = self.admin.get_broker_uuid(to_stop)['node_id'] + + self.logger.debug( + "When we drop the disk again, node restart should fail (controller will have lost consensus)" + ) + with expect_exception(TimeoutError, lambda _: True): + self._restart_node(to_stop, drop_disk=True) + + self.logger.debug( + "Grab the last generated UUID from logs since the node was not able to join the cluster" + ) + current_uuid = self.scrape_uuid(to_stop) + assert current_uuid is not None, "Didn't find UUID in logs" + + self.logger.debug("Restart the node again (but keep the disk)") + + THE_OVERRIDE = f"{current_uuid} -> ID: '{initial_to_stop_id}' ; UUID: '{old_uuid}'" + if mode == TestMode.CFG_OVERRIDE: + self.logger.debug( + f"Override with known-good uuid/id via node config: {THE_OVERRIDE}" + ) + self._restart_node( + to_stop, + dict(node_id_overrides=[ + dict(current_uuid=current_uuid, + new_uuid=old_uuid, + new_id=initial_to_stop_id) + ], ), + drop_disk=False, + ) + elif mode == TestMode.CLI_OVERRIDE: + self.logger.debug( + f"Override with known-good uuid/id via command line options: {THE_OVERRIDE}" + ) + self._restart_node( + to_stop, + extra_cli=[ + "--node-id-overrides", + f"{current_uuid}:{old_uuid}:{initial_to_stop_id}", + ], + drop_disk=False, + ) + elif mode == TestMode.NO_OVERRIDE: + self.logger.debug( + "Omit the override to confirm that we're still stuck in that case" + ) + with expect_exception(TimeoutError, lambda _: True): + self._restart_node(to_stop, drop_disk=False) + self.logger.debug("And short circuit the test case") + return + else: + assert False, f"Unexpected mode: '{mode}'" + + self.logger.debug( + "Wait until the target node reflects the given overrides") + + wait_until(lambda: self.admin.get_broker_uuid(to_stop)['node_id'] == + initial_to_stop_id, + timeout_sec=30, + backoff_sec=2, + err_msg=f"{to_stop.name} did not take the ID override") + + wait_until(lambda: self.admin.get_broker_uuid(to_stop)['node_uuid'] == + old_uuid, + timeout_sec=30, + backoff_sec=2, + err_msg=f"{to_stop.name} did not take the UUID override") + + self.logger.debug(f"Wait for the cluster to become healthy...") + + self.wait_until_cluster_healthy(timeout_sec=30) + + self.logger.debug( + f".. and decommission ghost node [{ghost_node_id}]...") + self._decommission(ghost_node_id) + + self.logger.debug( + "Check that all this state sticks across a rolling restart") + + self.redpanda.rolling_restart_nodes(self.redpanda.nodes, + auto_assign_node_id=True) + + self.wait_until_cluster_healthy(timeout_sec=30) + + def expect_ids(node: ClusterNode, uuid: str, id: int): + resp = self.admin.get_broker_uuid(node) + try: + assert resp[ + 'node_id'] == id, f"Bad node id after override: '{resp['node_id']}', expected '{id}'" + assert resp[ + 'node_uuid'] == uuid, f"Bad node uuid after override: '{resp['node_uuid']}', expected '{uuid}'" + except AssertionError as e: + self.logger.debug(e) + return False + return True + + wait_until(lambda: expect_ids(to_stop, old_uuid, initial_to_stop_id), + timeout_sec=30, + backoff_sec=1, + retry_on_exc=True) + + @cluster(num_nodes=3) + @parametrize(mode=TestMode.CFG_OVERRIDE) + @parametrize(mode=TestMode.CLI_OVERRIDE) + def test_force_uuid_override_multinode(self, mode): + to_stop = self.redpanda.nodes[1:] + initial_to_stop_ids = [self.redpanda.node_id(n) for n in to_stop] + + self.logger.debug("Kill one node, all is good") + + self._restart_node(to_stop[0], drop_disk=True) + + uuids = wait_until_result( + lambda: self._uuids_updated(), + timeout_sec=30, + backoff_sec=2, + err_msg="Node was unable to join the cluster") + + ghost_node_id = self.admin.get_broker_uuid(to_stop[0])['node_id'] + + old_uuids = {} + for n in uuids: + id = n['node_id'] + if id in initial_to_stop_ids: + old_uuids[id] = n['uuid'] + + assert len( + old_uuids) == 2, f"Unexpected old_uuids: {json.dumps(old_uuids)}" + + self.logger.debug("Drop another node, this time restart should fail") + + for n in to_stop: + with expect_exception(TimeoutError, lambda _: True): + self._restart_node(n, drop_disk=True) + + current_uuids = [self.scrape_uuid(n) for n in to_stop] + assert len(current_uuids + ) == 2, f"Missing some UUIDs: {json.dumps(current_uuids)}" + + self.logger.debug( + "Restart both nodes again, with overrides. Keep both disks") + + if mode == TestMode.CFG_OVERRIDE: + self.redpanda.restart_nodes( + to_stop, + override_cfg_params=dict(node_id_overrides=[ + dict(current_uuid=current_uuids[n], + new_uuid=old_uuids[initial_to_stop_ids[n]], + new_id=initial_to_stop_ids[n]) + for n in range(0, len(to_stop)) + ]), + auto_assign_node_id=True, + ) + elif mode == TestMode.CLI_OVERRIDE: + self.redpanda.restart_nodes( + to_stop, + extra_cli=[ + "--node-id-overrides", + ] + [ + f"{current_uuids[n]}:{old_uuids[initial_to_stop_ids[n]]}:{initial_to_stop_ids[n]}" + for n in range(0, len(to_stop)) + ], + auto_assign_node_id=True, + ) + + self.logger.debug("Wait for the cluster to become healthy...") + + controller_leader = self.wait_until_cluster_healthy(timeout_sec=30) + + assert controller_leader is not None, "Didn't elect a controller leader" + assert controller_leader not in to_stop, f"Unexpected controller leader {controller_leader.account.hostname}" + + self.logger.debug( + f"...and decommission ghost node [{ghost_node_id}]...") + + self._decommission(ghost_node_id)