From 101c2033924ad69e4f4b5cde6796cd88e8072b6a Mon Sep 17 00:00:00 2001 From: Myth Date: Fri, 11 Aug 2023 10:56:10 +0800 Subject: [PATCH 1/3] Try to recover automatically from the background error about sst corrupt --- src/common/string_util.cc | 14 +++++++ src/common/string_util.h | 1 + src/storage/event_listener.cc | 65 +++++++++++++++++++++---------- tests/cppunit/string_util_test.cc | 15 +++++++ 4 files changed, 74 insertions(+), 21 deletions(-) diff --git a/src/common/string_util.cc b/src/common/string_util.cc index 1b3eb39eedd..b3004a3afa1 100644 --- a/src/common/string_util.cc +++ b/src/common/string_util.cc @@ -22,6 +22,7 @@ #include +#include #include #include "parse_util.h" @@ -214,6 +215,19 @@ int StringMatchLen(const char *pattern, size_t pattern_len, const char *string, return 0; } +std::vector RegexMatch(const std::string &str, const std::string ®ex) { + std::regex base_regex(regex); + std::smatch pieces_match; + std::vector out; + + if (std::regex_match(str, pieces_match, base_regex)) { + for (const auto &piece : pieces_match) { + out.emplace_back(piece.str()); + } + } + return out; +} + std::string StringToHex(const std::string &input) { static const char hex_digits[] = "0123456789ABCDEF"; std::string output; diff --git a/src/common/string_util.h b/src/common/string_util.h index dfb00706f3c..2ebd7639673 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -34,6 +34,7 @@ std::vector Split2KV(const std::string &in, const std::string &deli bool HasPrefix(const std::string &str, const std::string &prefix); int StringMatch(const std::string &pattern, const std::string &in, int nocase); int StringMatchLen(const char *p, size_t plen, const char *s, size_t slen, int nocase); +std::vector RegexMatch(const std::string &str, const std::string ®ex); std::string StringToHex(const std::string &input); std::vector TokenizeRedisProtocol(const std::string &value); std::string EscapeString(const std::string &s); diff --git a/src/storage/event_listener.cc b/src/storage/event_listener.cc index 46f399fc029..f4db9a73933 100644 --- a/src/storage/event_listener.cc +++ b/src/storage/event_listener.cc @@ -20,10 +20,20 @@ #include "event_listener.h" -#include #include #include +#include "fmt/format.h" + +std::string BackgroundErrorReason2String(const rocksdb::BackgroundErrorReason reason) { + std::vector background_error_reason = { + "flush", "compaction", "write_callback", "memtable", "manifest_write", "flush_no_wal", "manifest_write_no_wal"}; + if (static_cast(reason) < background_error_reason.size()) { + return background_error_reason[static_cast(reason)]; + } + return "unknown"; +} + std::string FileCreatedReason2String(const rocksdb::TableFileCreationReason reason) { std::vector file_created_reason = {"flush", "compaction", "recovery", "misc"}; if (static_cast(reason) < file_created_reason.size()) { @@ -49,6 +59,14 @@ std::string CompressType2String(const rocksdb::CompressionType type) { return "unknown"; } +std::string ExtractSSTFileNameFromError(const std::string &error) { + auto match_results = util::RegexMatch(error, "(.*)(/\\w*\\.sst)(.*)"); + if (match_results.size() > 2) { + return match_results[2]; + } + return {}; +} + bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) { // EDQUOT: Disk quota exceeded (POSIX.1-2001) std::string exceeded_quota_str = "Disk quota exceeded"; @@ -58,7 +76,7 @@ bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) { } void EventListener::OnCompactionCompleted(rocksdb::DB *db, const rocksdb::CompactionJobInfo &ci) { - LOG(INFO) << "[event_listener/compaction_completed] column family: " << ci.cf_name + LOG(INFO) << "[event_listener/compaction_completed] column family: " << ci.cf_name << ", job_id: " << ci.job_id << ", compaction reason: " << static_cast(ci.compaction_reason) << ", output compression type: " << CompressType2String(ci.compression) << ", base input level(files): " << ci.base_input_level << "(" << ci.input_files.size() << ")" @@ -87,30 +105,35 @@ void EventListener::OnFlushCompleted(rocksdb::DB *db, const rocksdb::FlushJobInf } void EventListener::OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status *bg_error) { - std::string reason_str; - switch (reason) { - case rocksdb::BackgroundErrorReason::kCompaction: - reason_str = "compact"; - break; - case rocksdb::BackgroundErrorReason::kFlush: - reason_str = "flush"; - break; - case rocksdb::BackgroundErrorReason::kMemTable: - reason_str = "memtable"; - break; - case rocksdb::BackgroundErrorReason::kWriteCallback: - reason_str = "writecallback"; - break; - default: - // Should not arrive here - break; + auto reason_str = BackgroundErrorReason2String(reason); + auto error_str = bg_error->ToString(); + if (bg_error->IsCorruption() || bg_error->IsIOError()) { + // Background error may occur when SST are generated during flush/compaction. If those files are not applied + // to Version, we consider them non-fatal background error. We can override bg_error to recover from + // background error. + // Note that we cannot call Resume() manually because the error severity is unrecoverable. + auto corrupt_sst = ExtractSSTFileNameFromError(error_str); + if (!corrupt_sst.empty()) { + std::vector live_files; + uint64_t manifest_size = 0; + auto s = storage_->GetDB()->GetLiveFiles(live_files, &manifest_size, false /* flush_memtable */); + if (s.ok() && std::find(live_files.begin(), live_files.end(), corrupt_sst) == live_files.end()) { + *bg_error = rocksdb::Status::OK(); + LOG(WARNING) << fmt::format( + "[event_listener/background_error] ignore no-fatal background error about sst file, reason: {}, bg_error: " + "{}", + reason_str, error_str); + return; + } + } } + if ((bg_error->IsNoSpace() || IsDiskQuotaExceeded(*bg_error)) && bg_error->severity() < rocksdb::Status::kFatalError) { storage_->SetDBInRetryableIOError(true); } - LOG(ERROR) << "[event_listener/background_error] reason: " << reason_str << ", bg_error: " << bg_error->ToString(); + LOG(WARNING) << fmt::format("[event_listener/background_error] reason: {}, bg_error: {}", reason_str, error_str); } void EventListener::OnTableFileDeleted(const rocksdb::TableFileDeletionInfo &info) { @@ -126,6 +149,6 @@ void EventListener::OnStallConditionsChanged(const rocksdb::WriteStallInfo &info void EventListener::OnTableFileCreated(const rocksdb::TableFileCreationInfo &info) { LOG(INFO) << "[event_listener/table_file_created] column family: " << info.cf_name - << ", file path: " << info.file_path << ", file size: " << info.file_size << ", job id: " << info.job_id + << ", file path: " << info.file_path << ", file size: " << info.file_size << ", job_id: " << info.job_id << ", reason: " << FileCreatedReason2String(info.reason) << ", status: " << info.status.ToString(); } diff --git a/tests/cppunit/string_util_test.cc b/tests/cppunit/string_util_test.cc index a3092fed495..5484fdb5847 100644 --- a/tests/cppunit/string_util_test.cc +++ b/tests/cppunit/string_util_test.cc @@ -97,3 +97,18 @@ TEST(StringUtil, EscapeString) { ASSERT_TRUE(util::EscapeString(origin) == escaped); } } + +TEST(StringUtil, RegexMatchExtractSSTFile) { + // Test for ExtractSSTFileNameFromError() in event_listener.cc + auto bg_error_str = {"Corruption: Corrupt or unsupported format_version: 1005 in /tmp/kvrocks/data/db/000038.sst", + "Corruption: Bad table magic number: expected 9863518390377041911, found 9863518390377041912 in " + "/tmp/kvrocks_db/data/db/000038.sst", + "Corruption: block checksum mismatch: stored = 3308200672, computed = 51173877, type = 4 in " + "/tmp/kvrocks_db/data/db/000038.sst offset 0 size 15715"}; + + for (const auto &str : bg_error_str) { + auto match_results = util::RegexMatch(str, "(.*)(/\\w*\\.sst)(.*)"); + ASSERT_TRUE(match_results.size() > 2); + ASSERT_TRUE(match_results[2] == "/000038.sst"); + } +} From 6802ed3c44142c3f3bdb054bcf5788bbfb20a082 Mon Sep 17 00:00:00 2001 From: Myth Date: Fri, 11 Aug 2023 16:19:12 +0800 Subject: [PATCH 2/3] Minor changes --- src/storage/event_listener.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/event_listener.cc b/src/storage/event_listener.cc index f4db9a73933..d78b8eda575 100644 --- a/src/storage/event_listener.cc +++ b/src/storage/event_listener.cc @@ -133,7 +133,7 @@ void EventListener::OnBackgroundError(rocksdb::BackgroundErrorReason reason, roc storage_->SetDBInRetryableIOError(true); } - LOG(WARNING) << fmt::format("[event_listener/background_error] reason: {}, bg_error: {}", reason_str, error_str); + LOG(ERROR) << fmt::format("[event_listener/background_error] reason: {}, bg_error: {}", reason_str, error_str); } void EventListener::OnTableFileDeleted(const rocksdb::TableFileDeletionInfo &info) { From a007caf9c179c719fca82445d502af23961b2c5e Mon Sep 17 00:00:00 2001 From: Myth Date: Mon, 14 Aug 2023 17:48:32 +0800 Subject: [PATCH 3/3] Change regex match --- src/storage/event_listener.cc | 6 +++--- tests/cppunit/string_util_test.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/storage/event_listener.cc b/src/storage/event_listener.cc index d78b8eda575..6e054945669 100644 --- a/src/storage/event_listener.cc +++ b/src/storage/event_listener.cc @@ -60,9 +60,9 @@ std::string CompressType2String(const rocksdb::CompressionType type) { } std::string ExtractSSTFileNameFromError(const std::string &error) { - auto match_results = util::RegexMatch(error, "(.*)(/\\w*\\.sst)(.*)"); - if (match_results.size() > 2) { - return match_results[2]; + auto match_results = util::RegexMatch(error, ".*(/\\w*\\.sst).*"); + if (match_results.size() == 2) { + return match_results[1]; } return {}; } diff --git a/tests/cppunit/string_util_test.cc b/tests/cppunit/string_util_test.cc index 5484fdb5847..f95ccbff3dd 100644 --- a/tests/cppunit/string_util_test.cc +++ b/tests/cppunit/string_util_test.cc @@ -107,8 +107,8 @@ TEST(StringUtil, RegexMatchExtractSSTFile) { "/tmp/kvrocks_db/data/db/000038.sst offset 0 size 15715"}; for (const auto &str : bg_error_str) { - auto match_results = util::RegexMatch(str, "(.*)(/\\w*\\.sst)(.*)"); - ASSERT_TRUE(match_results.size() > 2); - ASSERT_TRUE(match_results[2] == "/000038.sst"); + auto match_results = util::RegexMatch(str, ".*(/\\w*\\.sst).*"); + ASSERT_TRUE(match_results.size() == 2); + ASSERT_TRUE(match_results[1] == "/000038.sst"); } }