-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 42ea226
Showing
4 changed files
with
226 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
/dist | ||
*.bin | ||
*.bak |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
### ClickHouse Bit Flip | ||
A tool that fixes single-bit-flip errors in ClickHouse binary data files. | ||
|
||
The tool makes a copy of the original file with a .bak extension before repairing the data. | ||
**However, don't forget to make a backup yourself.** | ||
|
||
Checked on ClickHouse 19.16.2.2. | ||
|
||
### Usage | ||
|
||
```bash | ||
./clickhouse-bitflip filename | ||
``` | ||
|
||
### For example | ||
> Code: 40, e.displayText() = DB::Exception: Checksum doesn't match: corrupted data. Reference: 32eaca6117ab50a10b91e82b65de78e1. Actual: c44af50ae79ae7639ae96b59d8032bbc. Size of compressed block: 36031. The mismatch is caused by single bit flip in data block at byte 8732, bit 2. This is most likely due to hardware failure. If you receive broken data over network and the error does not repeat every time, this can be caused by bad RAM on network interface controller or bad controller itself or bad RAM on network switches or bad CPU on network switches (look at the logs on related network switches; note that TCP checksums don't help) or bad RAM on host (look at dmesg or kern.log for enormous amount of EDAC errors, ECC-related reports, Machine Check Exceptions, mcelog; note that ECC memory can fail if the number of errors is huge) or bad CPU on host. If you read data from disk, this can be caused by disk bit rott. This exception protects ClickHouse from data corruption due to hardware failures.: (while reading column Name): (while reading from part /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/ from mark 0 with max_rows_to_read = 8192) (version 19.16.2.2 (official build)) (from 172.18.0.1:51644) (in query: SELECT Name FROM tbl) | ||
|
||
Corrupted partition: /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/ | ||
Corrupted column: Name | ||
|
||
```bash | ||
./clickhouse-bitflip /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/Name.bin | ||
``` | ||
|
||
To fix many files at once: | ||
```bash | ||
find /var/lib/clickhouse/data/default/tbl/202004* -maxdepth 2 -type f -name '*.bin' -exec ./clickhouse-bitflip {} \; > result.log | ||
``` | ||
|
||
Afterwards, you can remove the .bak files. | ||
|
||
### Links | ||
- https://github.com/ClickHouse/ClickHouse/blob/2b569cf26063bc13f6443199cff8c955f16d2edc/src/Compression/CompressedReadBufferBase.cpp#L42 | ||
- https://github.com/ClickHouse/ClickHouse/blob/ea6f90b4f2c3cc2f7d8b846c769b7f3e84907e47/src/Compression/CompressionInfo.h#L10 | ||
- https://habr.com/ru/company/oleg-bunin/blog/497334/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
rem Build for the host platform first, then cross-compile for Linux.
rem The trailing slash makes "go build" treat dist as an output directory.
rem Without it, a missing dist becomes a single file and the second build
rem overwrites the first.
go build -o ./dist/
set GOOS=linux
go build -o ./dist/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
package main | ||
|
||
import ( | ||
"encoding/binary" | ||
"flag" | ||
"fmt" | ||
"github.com/ClickHouse/clickhouse-go/lib/cityhash102" | ||
"io" | ||
"os" | ||
) | ||
|
||
// ClickHouseChecksum is the 128-bit checksum that precedes each block in the
// data file, held as two 64-bit halves that are decoded little-endian.
type ClickHouseChecksum struct {
	First, Second uint64
}

// String renders the checksum as exactly 32 lowercase hex digits: First
// followed by Second, each zero-padded to 16 digits.
//
// The padding matters: without it a half whose top nibble is zero is printed
// short, so the output no longer lines up with the reference value shown in
// ClickHouse's own "Checksum doesn't match" message and two different
// checksums can render to the same text.
func (c ClickHouseChecksum) String() string {
	return fmt.Sprintf("%016x%016x", c.First, c.Second)
}
|
||
// ClickHouseHeader is the fixed-size record decoded from the start of every
// checksummed block. It is read with encoding/binary, which walks the fields
// in declaration order without padding, so field order and widths define the
// on-disk layout and must not be changed.
type ClickHouseHeader struct {
	// Method is the first byte of the block; this tool never interprets it.
	Method uint8
	// CompressedSize is the number of bytes the checksum covers, counted from
	// the first header byte (i.e. the header itself is included).
	CompressedSize uint32
	// UncompressedSize is decoded but not used by this tool.
	UncompressedSize uint32
}
|
||
// Run-wide tallies used for the summary printed when the end of the file is
// reached.
var (
	// uncorrectedErrors counts blocks whose checksum mismatch could not be
	// explained by a single flipped bit.
	uncorrectedErrors int32
	// correctedErrors counts blocks that were repaired in the file.
	correctedErrors int32
)
|
||
func main() { | ||
flag.Parse() | ||
srcFile := flag.Arg(0) | ||
|
||
f, err := os.OpenFile(srcFile, os.O_RDWR, 0) | ||
if err != nil { | ||
panic(err) | ||
} | ||
defer f.Close() | ||
|
||
for { | ||
expected := readChecksum(f) | ||
//fmt.Println("Reference:", expected) | ||
|
||
startPosOfBlock, err := f.Seek(0, io.SeekCurrent) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
data, _ := readData(f) | ||
//fmt.Println("Size of compressed block:", header.CompressedSize) | ||
|
||
bitFlipPos, fixed := fixData(data, expected) | ||
if fixed { | ||
backup(srcFile) | ||
fixFile(f, data, int64(bitFlipPos), startPosOfBlock) | ||
} | ||
} | ||
} | ||
|
||
func fixFile(f *os.File, data []byte, bitPos int64, startPosOfBlock int64) { | ||
pos := bitPos / 8 | ||
filePos := startPosOfBlock + pos | ||
|
||
fmt.Printf(" Fixing in file at byte %v. Old value: 0x%x. New value: 0x%x\n", filePos, data[pos]^1<<(bitPos%8), data[pos]) | ||
|
||
_, err := f.WriteAt(data[pos:pos+1], filePos) | ||
if err != nil { | ||
panic(err) | ||
} | ||
correctedErrors++ | ||
} | ||
|
||
func fixData(data []byte, expected ClickHouseChecksum) (int, bool) { | ||
isEqual, actual := compareChecksum(data, expected) | ||
if isEqual { | ||
//fmt.Printf(" Checksums are equal. Reference: %v\n", expected) | ||
return 0, false | ||
} | ||
fmt.Printf("Checksum doesn't match: corrupted data. Reference: %v. Actual: %v. Size of compressed block: %v\n", expected, actual, len(data)) | ||
|
||
for bitPos := 0; bitPos < len(data)*8; bitPos++ { | ||
flipBit(data, bitPos) | ||
isEqual, actual = compareChecksum(data, expected) | ||
if isEqual { | ||
fmt.Println(" The mismatch is caused by single bit flip in data block at byte", bitPos/8, ", bit", bitPos%8) | ||
return bitPos, true | ||
} | ||
flipBit(data, bitPos) | ||
} | ||
|
||
uncorrectedErrors++ | ||
fmt.Println(" Error: the mismatch is not caused by single bit flip. It can't be fixed!") | ||
return 0, false | ||
} | ||
|
||
// flipBit inverts, in place, the bit at index pos of data, where bit 0 is the
// least significant bit of data[0]. Applying it twice restores the input.
func flipBit(data []byte, pos int) {
	byteIdx, bitIdx := pos/8, pos%8
	data[byteIdx] ^= 1 << bitIdx
}
|
||
func readChecksum(f *os.File) ClickHouseChecksum { | ||
var checksum ClickHouseChecksum | ||
err := binary.Read(f, binary.LittleEndian, &checksum) | ||
if err == io.EOF { | ||
if uncorrectedErrors == 0 && correctedErrors == 0 { | ||
fmt.Println("No errors") | ||
} else { | ||
fmt.Println("Completed. Corrected errors:", correctedErrors, ". Uncorrected errors", uncorrectedErrors) | ||
} | ||
os.Exit(0) | ||
} | ||
if err != nil { | ||
fmt.Println("binary.Read failed:", err) | ||
os.Exit(1) | ||
} | ||
|
||
return checksum | ||
} | ||
|
||
func readData(f *os.File) (data []byte, header ClickHouseHeader) { | ||
err := binary.Read(f, binary.LittleEndian, &header) | ||
if err != nil { | ||
fmt.Println("binary.Read failed:", err) | ||
os.Exit(1) | ||
} | ||
var headerSize = binary.Size(header) | ||
_, err = f.Seek(int64(headerSize*(-1)), io.SeekCurrent) // go back, checksum contains header | ||
if err != nil { | ||
fmt.Println("f.Seek failed:", err) | ||
os.Exit(1) | ||
} | ||
data = make([]byte, header.CompressedSize) | ||
n, err := f.Read(data) | ||
if err != nil { | ||
fmt.Println("f.Read failed:", err) | ||
os.Exit(1) | ||
} | ||
if n != int(header.CompressedSize) { | ||
fmt.Printf("error: file is clipped, expected compressed size: %v, actual: %v\n", header.CompressedSize, n) | ||
os.Exit(1) | ||
} | ||
|
||
return | ||
} | ||
|
||
func backup(src string) { | ||
dst := src + ".bak" | ||
if _, err := os.Stat(dst); os.IsNotExist(err) { | ||
fmt.Println(" backup file to", dst) | ||
_, err = copyFile(src, dst) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
} | ||
|
||
// copyFile copies the regular file src to dst, creating or truncating dst,
// and returns the number of bytes copied.
//
// It returns an error if src cannot be inspected or opened, is not a regular
// file, dst cannot be created, or the copy or the final close of dst fails.
// When the copy or close fails, dst is removed so that a truncated file is
// never mistaken for a complete copy.
func copyFile(src string, dst string) (int64, error) {
	info, err := os.Stat(src)
	if err != nil {
		return 0, err
	}
	if !info.Mode().IsRegular() {
		return 0, fmt.Errorf("%s is not a regular file", src)
	}

	source, err := os.Open(src)
	if err != nil {
		return 0, err
	}
	defer source.Close()

	destination, err := os.Create(dst)
	if err != nil {
		return 0, err
	}

	nBytes, err := io.Copy(destination, source)
	// Close explicitly: a write error can surface only at close time, and
	// ignoring it would report a successful copy of incomplete data.
	if closeErr := destination.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best effort: the copy error is what the caller needs to see, so a
		// failure to clean up is deliberately not reported.
		_ = os.Remove(dst)
		return nBytes, err
	}
	return nBytes, nil
}
|
||
func compareChecksum(data []byte, expected ClickHouseChecksum) (result bool, actual ClickHouseChecksum) { | ||
actualUint128 := cityhash102.CityHash128(data, uint32(len(data))) | ||
actual = ClickHouseChecksum{actualUint128[0], actualUint128[1]} | ||
result = expected == actual | ||
|
||
return | ||
} |