Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try to recover automatically from background errors about corrupt SST files #1667

Merged
merged 5 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/common/string_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <fmt/format.h>

#include <regex>
#include <string>

#include "parse_util.h"
Expand Down Expand Up @@ -214,6 +215,19 @@ int StringMatchLen(const char *pattern, size_t pattern_len, const char *string,
return 0;
}

// Matches the whole of `str` against the regular expression `regex`.
//
// On success the result holds the full match at index 0 followed by one entry
// per capture group, in pattern order. When `str` does not match in its
// entirety, the result is empty.
std::vector<std::string> RegexMatch(const std::string &str, const std::string &regex) {
  const std::regex pattern(regex);
  std::smatch groups;
  if (!std::regex_match(str, groups, pattern)) {
    return {};
  }

  std::vector<std::string> captured;
  for (std::size_t i = 0; i < groups.size(); ++i) {
    captured.push_back(groups[i].str());
  }
  return captured;
}

std::string StringToHex(const std::string &input) {
static const char hex_digits[] = "0123456789ABCDEF";
std::string output;
Expand Down
1 change: 1 addition & 0 deletions src/common/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ std::vector<std::string> Split2KV(const std::string &in, const std::string &deli
bool HasPrefix(const std::string &str, const std::string &prefix);
int StringMatch(const std::string &pattern, const std::string &in, int nocase);
int StringMatchLen(const char *p, size_t plen, const char *s, size_t slen, int nocase);
// Matches the whole of `str` against `regex`. On success returns the full match followed by each capture group;
// returns an empty vector when `str` does not match.
std::vector<std::string> RegexMatch(const std::string &str, const std::string &regex);
std::string StringToHex(const std::string &input);
std::vector<std::string> TokenizeRedisProtocol(const std::string &value);
std::string EscapeString(const std::string &s);
Expand Down
65 changes: 44 additions & 21 deletions src/storage/event_listener.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,20 @@

#include "event_listener.h"

#include <map>
#include <string>
#include <vector>

#include "fmt/format.h"

// Converts a RocksDB background error reason into a short label for logging.
//
// The numeric value of `reason` is used as an index into the label table
// below; any value beyond the end of the table is reported as "unknown".
std::string BackgroundErrorReason2String(const rocksdb::BackgroundErrorReason reason) {
  static const std::vector<std::string> reason_labels = {
      "flush", "compaction", "write_callback", "memtable", "manifest_write", "flush_no_wal", "manifest_write_no_wal"};

  const auto index = static_cast<size_t>(reason);
  if (index >= reason_labels.size()) {
    return "unknown";
  }
  return reason_labels[index];
}

std::string FileCreatedReason2String(const rocksdb::TableFileCreationReason reason) {
std::vector<std::string> file_created_reason = {"flush", "compaction", "recovery", "misc"};
if (static_cast<size_t>(reason) < file_created_reason.size()) {
Expand All @@ -49,6 +59,14 @@ std::string CompressType2String(const rocksdb::CompressionType type) {
return "unknown";
}

std::string ExtractSSTFileNameFromError(const std::string &error) {
auto match_results = util::RegexMatch(error, ".*(/\\w*\\.sst).*");
if (match_results.size() == 2) {
return match_results[1];
}
return {};
}

bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) {
// EDQUOT: Disk quota exceeded (POSIX.1-2001)
std::string exceeded_quota_str = "Disk quota exceeded";
Expand All @@ -58,7 +76,7 @@ bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) {
}

void EventListener::OnCompactionCompleted(rocksdb::DB *db, const rocksdb::CompactionJobInfo &ci) {
LOG(INFO) << "[event_listener/compaction_completed] column family: " << ci.cf_name
LOG(INFO) << "[event_listener/compaction_completed] column family: " << ci.cf_name << ", job_id: " << ci.job_id
<< ", compaction reason: " << static_cast<int>(ci.compaction_reason)
<< ", output compression type: " << CompressType2String(ci.compression)
<< ", base input level(files): " << ci.base_input_level << "(" << ci.input_files.size() << ")"
Expand Down Expand Up @@ -87,30 +105,35 @@ void EventListener::OnFlushCompleted(rocksdb::DB *db, const rocksdb::FlushJobInf
}

// Handles a background error reported by RocksDB.
//
// reason:   the background activity that reported the error; only used for logging.
// bg_error: the reported status. It is overwritten with OK when the error is judged
//           to be about an SST file that is not part of the live file set.
//
// Behaviour, in order:
//   1. Corruption / IO errors that name an SST file absent from the live files are
//      cleared, logged as a warning, and the handler returns.
//   2. No-space / disk-quota errors below fatal severity put the storage into the
//      retryable-IO-error state.
//   3. Every error that was not cleared in step 1 is logged at ERROR level.
void EventListener::OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status *bg_error) {
  auto reason_str = BackgroundErrorReason2String(reason);
  // Capture the message up front: bg_error may be reset to OK below, but the logs should show the original error.
  auto error_str = bg_error->ToString();
  if (bg_error->IsCorruption() || bg_error->IsIOError()) {
    // Background error may occur when SST files are generated during flush/compaction. If those files are not
    // applied to the Version, we consider the error non-fatal and override bg_error to recover from it.
    // Note that we cannot call Resume() manually because the error severity is unrecoverable.
    auto corrupt_sst = ExtractSSTFileNameFromError(error_str);
    if (!corrupt_sst.empty()) {
      std::vector<std::string> live_files;
      uint64_t manifest_size = 0;
      auto s = storage_->GetDB()->GetLiveFiles(live_files, &manifest_size, false /* flush_memtable */);
      // Only ignore the error when the live file list was fetched successfully and does not contain the file.
      if (s.ok() && std::find(live_files.begin(), live_files.end(), corrupt_sst) == live_files.end()) {
        *bg_error = rocksdb::Status::OK();
        LOG(WARNING) << fmt::format(
            "[event_listener/background_error] ignore no-fatal background error about sst file, reason: {}, bg_error: "
            "{}",
            reason_str, error_str);
        return;
      }
    }
  }

  if ((bg_error->IsNoSpace() || IsDiskQuotaExceeded(*bg_error)) &&
      bg_error->severity() < rocksdb::Status::kFatalError) {
    storage_->SetDBInRetryableIOError(true);
  }

  LOG(ERROR) << fmt::format("[event_listener/background_error] reason: {}, bg_error: {}", reason_str, error_str);
}

void EventListener::OnTableFileDeleted(const rocksdb::TableFileDeletionInfo &info) {
Expand All @@ -126,6 +149,6 @@ void EventListener::OnStallConditionsChanged(const rocksdb::WriteStallInfo &info

// Logs the details of a table file creation event: column family, file path,
// file size, job id, creation reason and resulting status.
void EventListener::OnTableFileCreated(const rocksdb::TableFileCreationInfo &info) {
  // The job id is labelled "job_id" to match the compaction-completed log line.
  LOG(INFO) << "[event_listener/table_file_created] column family: " << info.cf_name
            << ", file path: " << info.file_path << ", file size: " << info.file_size << ", job_id: " << info.job_id
            << ", reason: " << FileCreatedReason2String(info.reason) << ", status: " << info.status.ToString();
}
15 changes: 15 additions & 0 deletions tests/cppunit/string_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,18 @@ TEST(StringUtil, EscapeString) {
ASSERT_TRUE(util::EscapeString(origin) == escaped);
}
}

// Checks that the pattern used by ExtractSSTFileNameFromError() in event_listener.cc
// captures the SST file name from several corruption messages.
TEST(StringUtil, RegexMatchExtractSSTFile) {
  const std::vector<std::string> corruption_messages = {
      "Corruption: Corrupt or unsupported format_version: 1005 in /tmp/kvrocks/data/db/000038.sst",
      "Corruption: Bad table magic number: expected 9863518390377041911, found 9863518390377041912 in "
      "/tmp/kvrocks_db/data/db/000038.sst",
      "Corruption: block checksum mismatch: stored = 3308200672, computed = 51173877, type = 4 in "
      "/tmp/kvrocks_db/data/db/000038.sst offset 0 size 15715"};
  const std::string sst_pattern = ".*(/\\w*\\.sst).*";

  for (const auto &message : corruption_messages) {
    // Index 0 is the whole message, index 1 the captured file name.
    const auto captures = util::RegexMatch(message, sst_pattern);
    ASSERT_TRUE(captures.size() == 2);
    ASSERT_TRUE(captures[1] == "/000038.sst");
  }
}