Skip to content

Commit

Permalink
[#10360] DST: PITR - Disable Tablet Splitting during PITR restores
Browse files Browse the repository at this point in the history
Summary:
Tablet Splitting during PITR restorations causes hindrances, leading to race conditions. This diff disables it during restores. When a restore request is received, we disable splitting for 10 mins and also wait for pending splits to complete before beginning restore. When restore completes, we re-enable splitting. 10 mins is an upper bound and we don't expect any type of restoration to take more than 10 mins. If restore finishes earlier then splitting is re-enabled as soon as restore finishes. We don't disable splitting indefinitely because it can happen that restore never completes and in such cases splitting will be disabled forever.

This diff also fixes another issue for PITR+tablet splitting. One simple scenario to explain the issue:
1. Say you have a tablet - t1
2. Split t1 into t11 and t12, mark t1 as HIDDEN, t11 and t12 as RUNNING
3. After the retention period, DELETE t1
4. Restore to a time after step 2. In our current implementation, we also send a restore RPC to t1, which has already been deleted.

Currently, we send `RESTORE_ON_TABLET` RPCs to DELETED/HIDDEN tablets that are a part of RUNNING tables. With this diff, such tablets are skipped when determining the entries to restore.

Test Plan:
ybd --cxx_test yb-admin-snapshot-schedule-test --gtest-filter YbAdminSnapshotScheduleTest.VerifyRestoreWithDeletedTablets
ybd --cxx_test yb-admin-snapshot-schedule-test --gtest-filter YbAdminSnapshotScheduleTest.SplitDisabledDuringRestore

Reviewers: slingam, bogdan, sergei, asrivastava

Reviewed By: asrivastava

Subscribers: jenkins-bot, bogdan, ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D17148
  • Loading branch information
sanketkedia committed Jun 1, 2022
1 parent beae963 commit 13736e1
Show file tree
Hide file tree
Showing 21 changed files with 401 additions and 74 deletions.
2 changes: 2 additions & 0 deletions ent/src/yb/master/catalog_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ class CatalogManager : public yb::master::CatalogManager, SnapshotCoordinatorCon

void PrepareRestore() override;

void EnableTabletSplitting(const std::string& feature) override;

private:
friend class SnapshotLoader;
friend class yb::master::ClusterLoadBalancer;
Expand Down
4 changes: 4 additions & 0 deletions ent/src/yb/master/catalog_manager_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5307,6 +5307,10 @@ void CatalogManager::PrepareRestore() {
is_catalog_loaded_ = false;
}

void CatalogManager::EnableTabletSplitting(const std::string& feature) {
DisableTabletSplittingInternal(MonoDelta::FromMilliseconds(0), feature);
}

} // namespace enterprise
} // namespace master
} // namespace yb
11 changes: 11 additions & 0 deletions ent/src/yb/master/restore_sys_catalog_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ bool TableDeleted(const SysTablesEntryPB& table) {
table.hide_state() == SysTablesEntryPB::HIDDEN;
}

// Returns true when the tablet entry is in the REPLACED or DELETED state, or when it carries a
// non-zero hide_hybrid_time.
bool TabletDeleted(const SysTabletsEntryPB& tablet) {
  const auto tablet_state = tablet.state();
  if (tablet_state == SysTabletsEntryPB::REPLACED) {
    return true;
  }
  if (tablet_state == SysTabletsEntryPB::DELETED) {
    return true;
  }
  return tablet.hide_hybrid_time() != 0;
}

// Returns true when `id` equals kPgSequencesDataNamespaceId. The namespace entry `pb` is not
// consulted.
bool IsSequencesDataObject(const NamespaceId& id, const SysNamespaceEntryPB& pb) {
  const bool is_sequences_namespace = (id == kPgSequencesDataNamespaceId);
  return is_sequences_namespace;
}
Expand Down Expand Up @@ -411,6 +417,11 @@ Status RestoreSysCatalogState::DetermineEntries(
if (it == tables.end()) {
continue;
}
// We could have DELETED/HIDDEN tablets for a RUNNING table,
// for instance in the case of tablet splitting.
if (TabletDeleted(id_and_metadata.second)) {
continue;
}
RETURN_NOT_OK(process_entry(id_and_metadata.first, &id_and_metadata.second));
VLOG(2) << "Tablet to restore: " << id_and_metadata.first << ", "
<< id_and_metadata.second.ShortDebugString();
Expand Down
6 changes: 6 additions & 0 deletions ent/src/yb/tools/yb-admin_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ namespace enterprise {
YB_DEFINE_ENUM(ListSnapshotsFlag, (SHOW_DETAILS)(NOT_SHOW_RESTORED)(SHOW_DELETED)(JSON));
using ListSnapshotsFlags = EnumBitSet<ListSnapshotsFlag>;

// Constants for disabling tablet splitting during PITR restores.
// Length, in seconds, of the window for which tablet splitting is disabled for a restore.
static constexpr double kPitrSplitDisableDurationSecs = 600;
// Sleep interval, in milliseconds, between checks that pending splits have completed.
static constexpr double kPitrSplitDisableCheckFreqMs = 500;

class ClusterAdminClient : public yb::tools::ClusterAdminClient {
typedef yb::tools::ClusterAdminClient super;
public:
Expand Down Expand Up @@ -134,6 +138,8 @@ class ClusterAdminClient : public yb::tools::ClusterAdminClient {
void CleanupEnvironmentOnSetupUniverseReplicationFailure(
const std::string& producer_uuid, const Status& failure_status);

Status DisableTabletSplitsDuringRestore(CoarseTimePoint deadline);

DISALLOW_COPY_AND_ASSIGN(ClusterAdminClient);
};

Expand Down
56 changes: 53 additions & 3 deletions ent/src/yb/tools/yb-admin_client_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
// or implied. See the License for the specific language governing permissions and limitations
// under the License.

#include "yb/tools/yb-admin_cli.h"
#include "yb/tools/yb-admin_client.h"

#include <iostream>

#include <boost/algorithm/string.hpp>
Expand Down Expand Up @@ -41,6 +38,11 @@
#include "yb/master/master_ddl.proxy.h"
#include "yb/master/master_encryption.proxy.h"
#include "yb/master/master_replication.proxy.h"
#include "yb/master/master_admin.pb.h"
#include "yb/master/master_admin.proxy.h"

#include "yb/tools/yb-admin_cli.h"
#include "yb/tools/yb-admin_client.h"

#include "yb/rpc/messenger.h"
#include "yb/rpc/rpc_controller.h"
Expand Down Expand Up @@ -529,10 +531,58 @@ Result<TxnSnapshotId> ClusterAdminClient::SuitableSnapshotId(
}
}

// Disables tablet splitting ahead of a PITR restore and waits for pending splits to finish.
//
// Steps:
//   1. Disable splitting for kPitrSplitDisableDurationSecs under the "PITR" feature name.
//   2. Poll IsTabletSplittingCompleteInternal() every kPitrSplitDisableCheckFreqMs until it
//      reports completion, or until `deadline` or the disable window from step 1 is reached.
//   3. Disable splitting again under the same feature name so that the restore starts with a
//      full disable window.
//
// Returns TimedOut if `deadline` has passed once waiting ends, or if fewer than 3 seconds of the
// initial disable window remain. Errors from the helper calls are returned with a prefix.
Status ClusterAdminClient::DisableTabletSplitsDuringRestore(CoarseTimePoint deadline) {
  // TODO(Sanket): Eventually all of this logic needs to be moved
  // to the master and exposed as APIs for the clients to consume.
  const std::string feature_name = "PITR";
  const auto splitting_disabled_until =
      CoarseMonoClock::now() + MonoDelta::FromSeconds(kPitrSplitDisableDurationSecs);
  // Disable splitting and then wait for all pending splits to complete before
  // starting restoration.
  VERIFY_RESULT_PREPEND(
      DisableTabletSplitsInternal(kPitrSplitDisableDurationSecs * 1000, feature_name),
      "Failed to disable tablet split before restore.");

  while (CoarseMonoClock::now() < std::min(splitting_disabled_until, deadline)) {
    // Wait for existing split operations to complete.
    const auto resp =
        VERIFY_RESULT_PREPEND(IsTabletSplittingCompleteInternal(),
                              "Tablet splitting did not complete. Cannot restore.");
    if (resp.is_tablet_splitting_complete()) {
      break;
    }
    SleepFor(MonoDelta::FromMilliseconds(kPitrSplitDisableCheckFreqMs));
  }

  if (CoarseMonoClock::now() >= deadline) {
    return STATUS(TimedOut, "Timed out waiting for tablet splitting to complete.");
  }

  // Return if we have used almost all of our time in waiting for splitting to complete,
  // since we can't guarantee that another split does not start.
  if (CoarseMonoClock::now() + MonoDelta::FromSeconds(3) >= splitting_disabled_until) {
    // The message is a single literal: passing its tail as a second STATUS argument would make
    // it a separate message component instead of the end of the sentence.
    return STATUS(
        TimedOut, "Not enough time after disabling splitting to disable splitting again.");
  }

  // Disable for kPitrSplitDisableDurationSecs again so the restore has the full amount of time
  // with splitting disabled. This overwrites the previous value since the feature_name is the
  // same, so overall the time is still kPitrSplitDisableDurationSecs.
  VERIFY_RESULT_PREPEND(
      DisableTabletSplitsInternal(kPitrSplitDisableDurationSecs * 1000, feature_name),
      "Failed to disable tablet split before restore.");

  return Status::OK();
}

Result<rapidjson::Document> ClusterAdminClient::RestoreSnapshotSchedule(
const SnapshotScheduleId& schedule_id, HybridTime restore_at) {
auto deadline = CoarseMonoClock::now() + timeout_;

// Disable splitting for the entire run of restore.
RETURN_NOT_OK(DisableTabletSplitsDuringRestore(deadline));

// Get the suitable snapshot to restore from.
auto snapshot_id = VERIFY_RESULT(SuitableSnapshotId(schedule_id, restore_at, deadline));

for (;;) {
Expand Down
4 changes: 2 additions & 2 deletions managed/devops/bin/yb_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@

DISABLE_SPLITTING_MS = 30000
DISABLE_SPLITTING_FREQ_SEC = 10
IS_SPLITTING_DISABLED_MAX_RETRIES = 10
IS_SPLITTING_DISABLED_MAX_RETRIES = 100
TEST_SLEEP_AFTER_FIND_SNAPSHOT_DIRS_SEC = 100

DEFAULT_TS_WEB_PORT = 9000
Expand Down Expand Up @@ -2791,7 +2791,7 @@ def bg_disable_splitting(self):
time.sleep(DISABLE_SPLITTING_FREQ_SEC)

def disable_tablet_splitting(self):
self.run_yb_admin(["disable_tablet_splitting", str(DISABLE_SPLITTING_MS)])
self.run_yb_admin(["disable_tablet_splitting", str(DISABLE_SPLITTING_MS), "yb_backup"])

def backup_table(self):
"""
Expand Down
22 changes: 2 additions & 20 deletions src/yb/client/meta_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@

#include "yb/client/meta_cache.h"

#include <shared_mutex>
#include <stdint.h>

#include <atomic>
#include <list>
#include <memory>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
Expand Down Expand Up @@ -1051,25 +1051,7 @@ Status MetaCache::ProcessTabletLocations(

CHECK(tablets_by_id_.emplace(tablet_id, remote).second);
if (tablets_by_key) {
auto emplace_result = tablets_by_key->emplace(partition.partition_key_start(), remote);
if (!emplace_result.second) {
const auto& old_tablet = emplace_result.first->second;
if (old_tablet->split_depth() < remote->split_depth()) {
// Only replace with tablet of higher split_depth.
emplace_result.first->second = remote;
} else {
// If split_depth is the same - it should be the same tablet.
if (old_tablet->split_depth() == loc.split_depth()
&& old_tablet->tablet_id() != tablet_id) {
const auto error_msg = Format(
"Can't replace tablet $0 with $1 at partition_key_start $2, split_depth $3",
old_tablet->tablet_id(), tablet_id, loc.partition().partition_key_start(),
old_tablet->split_depth());
LOG_WITH_PREFIX(DFATAL) << error_msg;
// Just skip updating this tablet for release build.
}
}
}
(*tablets_by_key)[partition.partition_key_start()] = remote;
}
MaybeUpdateClientRequests(*remote);
}
Expand Down
11 changes: 8 additions & 3 deletions src/yb/integration-tests/tablet-split-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1054,22 +1054,25 @@ TEST_F(AutomaticTabletSplitITest, IsTabletSplittingComplete) {

// Create a split task by pausing when trying to get split key. IsTabletSplittingComplete should
// include this ongoing task.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = 1;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = true;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = true;
std::this_thread::sleep_for(FLAGS_catalog_manager_bg_task_wait_ms * 2ms);
ASSERT_FALSE(ASSERT_RESULT(IsSplittingComplete(master_admin_proxy.get())));

// Now let the split occur on master but not tserver.
// IsTabletSplittingComplete should include splits that are only complete on master.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_fail_tablet_split_probability) = 1;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = 0;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_pause_tserver_get_split_key) = false;
ASSERT_FALSE(ASSERT_RESULT(IsSplittingComplete(master_admin_proxy.get())));

// Verify that the split finishes, and that IsTabletSplittingComplete returns true even though
// compactions are not done.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_fail_tablet_split_probability) = 0;
ASSERT_OK(WaitForTabletSplitCompletion(2));
ASSERT_TRUE(ASSERT_RESULT(IsSplittingComplete(master_admin_proxy.get())));
ASSERT_OK(WaitFor([&]() -> Result<bool> {
return VERIFY_RESULT(IsSplittingComplete(master_admin_proxy.get()));
}, MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms * 2),
"IsTabletSplittingComplete did not return true."));
}

// This test tests both FLAGS_enable_automatic_tablet_splitting and the DisableTabletSplitting API
Expand All @@ -1078,6 +1081,7 @@ TEST_F(AutomaticTabletSplitITest, DisableTabletSplitting) {
// Must disable splitting for at least as long as we wait in WaitForTabletSplitCompletion.
const auto kExtraSleepDuration = 5s * kTimeMultiplier;
const auto kDisableDuration = split_completion_timeout_sec_ + kExtraSleepDuration;
const std::string kSplitDisableFeatureName = "DisableTabletSplittingTest";

ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_shard_count_per_node) = 1;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_tablet_split_low_phase_size_threshold_bytes) = 0;
Expand Down Expand Up @@ -1106,6 +1110,7 @@ TEST_F(AutomaticTabletSplitITest, DisableTabletSplitting) {

master::DisableTabletSplittingRequestPB disable_req;
disable_req.set_disable_duration_ms(kDisableDuration.ToMilliseconds());
disable_req.set_feature_name(kSplitDisableFeatureName);
master::DisableTabletSplittingResponsePB disable_resp;
ASSERT_OK(master_admin_proxy->DisableTabletSplitting(disable_req, &disable_resp, &controller));

Expand Down
29 changes: 29 additions & 0 deletions src/yb/master/catalog_entity_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,35 @@ TabletInfos TableInfo::GetTablets(IncludeInactive include_inactive) const {
return result;
}

bool TableInfo::HasOutstandingSplits() const {
SharedLock<decltype(lock_)> l(lock_);
std::unordered_set<TabletId> partitions_tablets;
for (const auto& p : partitions_) {
auto tablet_lock = p.second->LockForRead();
if (tablet_lock->pb.has_split_parent_tablet_id() && !tablet_lock->is_running()) {
YB_LOG_EVERY_N_SECS(INFO, 10) << "Tablet Splitting: Child tablet " << p.second->tablet_id()
<< " belonging to table " << id() << " is not yet running";
return true;
}
partitions_tablets.insert(p.second->tablet_id());
}
for (const auto& p : tablets_) {
// If any parents have not been deleted yet, the split is not yet complete.
if (!partitions_tablets.contains(p.second->tablet_id())) {
auto tablet_lock = p.second->LockForRead();
if (!tablet_lock->is_deleted() && !tablet_lock->is_hidden()) {
YB_LOG_EVERY_N_SECS(INFO, 10) << "Tablet Splitting: Parent tablet " << p.second->tablet_id()
<< " belonging to table " << id()
<< " is not yet deleted or hidden";
return true;
}
}
}
YB_LOG_EVERY_N_SECS(INFO, 10) << "Tablet Splitting: Table "
<< id() << " does not have any outstanding splits";
return false;
}

TabletInfoPtr TableInfo::GetColocatedTablet() const {
SharedLock<decltype(lock_)> l(lock_);
if (colocated() && !tablets_.empty()) {
Expand Down
2 changes: 2 additions & 0 deletions src/yb/master/catalog_entity_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,8 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>,
// Return whether given partition start keys match partitions_.
bool HasPartitions(const std::vector<PartitionKey> other) const;

bool HasOutstandingSplits() const;

// Get all tablets of the table.
// If include_inactive is true then it also returns inactive tablets along with the active ones.
// See the declaration of partitions_ structure to understand what constitutes inactive tablets.
Expand Down
7 changes: 6 additions & 1 deletion src/yb/master/catalog_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8580,11 +8580,16 @@ Status CatalogManager::ListUDTypes(const ListUDTypesRequestPB* req,
return Status::OK();
}

// Asks the tablet split manager to disable splitting for `duration` on behalf of `feature`.
// Shared by the DisableTabletSplitting RPC handler and in-process callers.
void CatalogManager::DisableTabletSplittingInternal(
    const MonoDelta& duration, const std::string& feature) {
  auto& split_manager = tablet_split_manager_;
  split_manager.DisableSplittingFor(duration, feature);
}

Status CatalogManager::DisableTabletSplitting(
const DisableTabletSplittingRequestPB* req, DisableTabletSplittingResponsePB* resp,
rpc::RpcContext* rpc) {
const MonoDelta disable_duration = MonoDelta::FromMilliseconds(req->disable_duration_ms());
tablet_split_manager_.DisableSplittingFor(disable_duration);
DisableTabletSplittingInternal(disable_duration, req->feature_name());
return Status::OK();
}

Expand Down
2 changes: 2 additions & 0 deletions src/yb/master/catalog_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,8 @@ class CatalogManager :
const DisableTabletSplittingRequestPB* req, DisableTabletSplittingResponsePB* resp,
rpc::RpcContext* rpc);

void DisableTabletSplittingInternal(const MonoDelta& duration, const std::string& feature);

// Returns true if there are no outstanding tablets and the tablet split manager is not currently
// processing tablet splits.
Status IsTabletSplittingComplete(
Expand Down
1 change: 1 addition & 0 deletions src/yb/master/master_admin.proto
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ message DeleteNotServingTabletResponsePB {

message DisableTabletSplittingRequestPB {
optional uint64 disable_duration_ms = 1;
optional string feature_name = 2;
}

message DisableTabletSplittingResponsePB {
Expand Down
3 changes: 3 additions & 0 deletions src/yb/master/master_snapshot_coordinator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,9 @@ class MasterSnapshotCoordinator::Impl {
return;
}
SubmitWrite(std::move(write_batch), leader_term, &context_);

// Enable tablet splitting again.
context_.EnableTabletSplitting("PITR");
}

void UpdateSchedule(const SnapshotState& snapshot) REQUIRES(mutex_) {
Expand Down
2 changes: 2 additions & 0 deletions src/yb/master/snapshot_coordinator_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class SnapshotCoordinatorContext {

virtual Result<size_t> GetNumLiveTServersForActiveCluster() = 0;

virtual void EnableTabletSplitting(const std::string& feature) = 0;

virtual ~SnapshotCoordinatorContext() = default;
};

Expand Down
Loading

0 comments on commit 13736e1

Please sign in to comment.