From 026fa9b14ebedaff9b5ca2d5b77a7c7288617489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Wed, 18 Dec 2024 18:02:26 +0100 Subject: [PATCH 1/3] tests: added validation of offset translator consistency in ducktape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added code that validates if deltas are consistent across different replicas. Signed-off-by: Michał Maślanka --- tests/rptest/tests/ot_consistency_test.py | 50 +++++++++++++++++++++++ tests/rptest/utils/node_operations.py | 47 +++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tests/rptest/tests/ot_consistency_test.py diff --git a/tests/rptest/tests/ot_consistency_test.py b/tests/rptest/tests/ot_consistency_test.py new file mode 100644 index 0000000000000..de9b401a1c7d9 --- /dev/null +++ b/tests/rptest/tests/ot_consistency_test.py @@ -0,0 +1,50 @@ +# Copyright 2020 Redpanda Data, Inc. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.md +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0 +import json +import os +import re +import sys +import time +import traceback +from collections import namedtuple, defaultdict +from typing import DefaultDict, List, Optional + +from ducktape.mark import matrix +from ducktape.utils.util import wait_until + +from rptest.clients.kafka_cat import KafkaCat +from rptest.clients.kafka_cli_tools import KafkaCliTools +from rptest.clients.rpk import RpkTool +from rptest.clients.types import TopicSpec +from rptest.services.cluster import cluster +from rptest.services.redpanda import RedpandaService, SISettings, CloudStorageTypeAndUrlStyle, get_cloud_storage_type, get_cloud_storage_type_and_url_style +from rptest.tests.redpanda_test import RedpandaTest +from rptest.utils.node_operations import verify_offset_translator_state_consistent + + +class OffsetTranslatorConsistencyTest(RedpandaTest): + def __init__(self, test_ctx, *args, **kwargs): + self._ctx = test_ctx + super(OffsetTranslatorConsistencyTest, self).__init__( + test_ctx, + si_settings=SISettings(test_ctx, + log_segment_size=1024 * 1024, + fast_uploads=True), + *args, + **kwargs, + ) + + @cluster(num_nodes=3) + def test_offset_translator_state_consistent(self): + cli = KafkaCliTools(self.redpanda) + topic = TopicSpec(partition_count=3, replication_factor=3) + + cli.create_topic(topic) + cli.produce(topic.name, 1000, 100) + verify_offset_translator_state_consistent(self.redpanda) diff --git a/tests/rptest/utils/node_operations.py b/tests/rptest/utils/node_operations.py index 816d2dbe94725..3b00581d18115 100644 --- a/tests/rptest/utils/node_operations.py +++ b/tests/rptest/utils/node_operations.py @@ -10,6 +10,7 @@ from collections import defaultdict from enum import Enum import random +import re import threading import time import requests @@ -20,6 +21,7 @@ from rptest.services.failure_injector import FailureInjector, FailureSpec from rptest.services.redpanda import RedpandaService from rptest.services.redpanda_installer import VERSION_RE, int_tuple +from rptest.util import wait_until_result class OperationType(Enum): @@ -79,6 +81,51 @@ def add(idx): yield NodeOperation(op, idx, random.choice([True, False])) +def verify_offset_translator_state_consistent(redpanda: RedpandaService): + logger = redpanda.logger + last_delta_pattern = re.compile('^\\{.*, last delta: (?P\\d+)\\}$') + admin = Admin(redpanda) + + for n in redpanda.started_nodes(): + node_id = redpanda.node_id(n) + all_partitions = admin.get_partitions(node=n) + + def _state_consistent(ns, topic, partition): + + state = admin.get_partition_state(ns, topic, partition, node=n) + dirty_offset = state['replicas'][0]['dirty_offset'] + if all(r['dirty_offset'] == dirty_offset + for r in state['replicas']): + return True, state + return False, None + + for p in all_partitions: + namespace = p['ns'] + topic = p['topic'] + partition = p['partition_id'] + partition_name = f"{namespace}/{topic}/{partition}" + state = wait_until_result( + lambda: _state_consistent(namespace, topic, partition), + timeout_sec=180, + backoff_sec=1, + err_msg="Error waiting for offsets to be consistent") + + logger.debug( + f"debug state of {partition_name} replica on node {node_id}: {state}" + ) + last_deltas = set() + for r_state in state['replicas']: + ot_state = r_state['raft_state']['offset_translator_state'] + if "empty" in ot_state: + continue + m = last_delta_pattern.match(ot_state) + assert m, f"offset translator state {ot_state} does not match expected pattern" + last_deltas.add(m['delta']) + assert len( + last_deltas + ) <= 1, f"partition {p} has inconsistent offset translation. Last deltas: {last_deltas}" + + class NodeDecommissionWaiter(): def __init__(self, redpanda, From 59eba45a567add9689a2bf5a4d8bbab56660b3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Wed, 18 Dec 2024 18:03:55 +0100 Subject: [PATCH 2/3] tests: validate offset translator state in random_node_operations_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Maślanka --- tests/rptest/tests/random_node_operations_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/rptest/tests/random_node_operations_test.py b/tests/rptest/tests/random_node_operations_test.py index c152396bc20a3..223672e795c2a 100644 --- a/tests/rptest/tests/random_node_operations_test.py +++ b/tests/rptest/tests/random_node_operations_test.py @@ -26,7 +26,7 @@ from rptest.services.redpanda import CHAOS_LOG_ALLOW_LIST, PREV_VERSION_LOG_ALLOW_LIST, CloudStorageType, LoggingConfig, PandaproxyConfig, SISettings, SchemaRegistryConfig, get_cloud_storage_type from rptest.services.redpanda_installer import RedpandaInstaller from rptest.utils.mode_checks import cleanup_on_early_exit, skip_debug_mode, skip_fips_mode -from rptest.utils.node_operations import FailureInjectorBackgroundThread, NodeOpsExecutor, generate_random_workload +from rptest.utils.node_operations import FailureInjectorBackgroundThread, NodeOpsExecutor, generate_random_workload, verify_offset_translator_state_consistent from rptest.clients.offline_log_viewer import OfflineLogViewer from rptest.tests.datalake.utils import supported_storage_types @@ -593,6 +593,7 @@ def cluster_version_updated(): err_msg="Error waiting for cluster to report consistent version" ) + verify_offset_translator_state_consistent(self.redpanda) # Validate that the controller log written during the test is readable by offline log viewer log_viewer = OfflineLogViewer(self.redpanda) for node in self.redpanda.started_nodes(): From 9d885f5a4e93493559d52adacd65041ea6b2b777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Wed, 18 Dec 2024 18:50:39 +0100 Subject: [PATCH 3/3] tests: made segment size smaller in random_node_operations_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Maślanka --- tests/rptest/tests/random_node_operations_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/rptest/tests/random_node_operations_test.py b/tests/rptest/tests/random_node_operations_test.py index 223672e795c2a..8ff8fc9ad924b 100644 --- a/tests/rptest/tests/random_node_operations_test.py +++ b/tests/rptest/tests/random_node_operations_test.py @@ -58,6 +58,8 @@ def __init__(self, test_context, *args, **kwargs): # set disk timeout to value greater than max suspend time # not to emit spurious errors "raft_io_timeout_ms": 20000, + "compacted_log_segment_size": 1024 * 1024, + "log_segment_size": 2 * 1024 * 1024, }, # 2 nodes for kgo producer/consumer workloads node_prealloc_count=3,