Skip to content

Commit

Permalink
first version
Browse files Browse the repository at this point in the history
  • Loading branch information
marliotto committed May 1, 2020
0 parents commit 42ea226
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/dist
*.bin
*.bak
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
### ClickHouse Bit Flip
A tool that fixes single-bit-flip errors in ClickHouse binary data files.

The tool makes a copy of the original file with a .bak extension before repairing the data.
**However, don't forget to make a backup yourself.**

Checked on ClickHouse 19.16.2.2.

### Usage

```bash
./clickhouse-bitflip filename
```

### For example
> Code: 40, e.displayText() = DB::Exception: Checksum doesn't match: corrupted data. Reference: 32eaca6117ab50a10b91e82b65de78e1. Actual: c44af50ae79ae7639ae96b59d8032bbc. Size of compressed block: 36031. The mismatch is caused by single bit flip in data block at byte 8732, bit 2. This is most likely due to hardware failure. If you receive broken data over network and the error does not repeat every time, this can be caused by bad RAM on network interface controller or bad controller itself or bad RAM on network switches or bad CPU on network switches (look at the logs on related network switches; note that TCP checksums don't help) or bad RAM on host (look at dmesg or kern.log for enormous amount of EDAC errors, ECC-related reports, Machine Check Exceptions, mcelog; note that ECC memory can fail if the number of errors is huge) or bad CPU on host. If you read data from disk, this can be caused by disk bit rott. This exception protects ClickHouse from data corruption due to hardware failures.: (while reading column Name): (while reading from part /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/ from mark 0 with max_rows_to_read = 8192) (version 19.16.2.2 (official build)) (from 172.18.0.1:51644) (in query: SELECT Name FROM tbl)

Corrupted partition: /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/
Corrupted column: Name

```bash
./clickhouse-bitflip /var/lib/clickhouse/data/default/tbl/20200420_946605_946651_20/Name.bin
```

To fix many files:
```bash
find /var/lib/clickhouse/data/default/tbl/202004* -maxdepth 2 -type f -name '*.bin' -exec ./clickhouse-bitflip {} \; > result.log
```

Afterwards, you can remove the .bak files.

### Links
- https://github.com/ClickHouse/ClickHouse/blob/2b569cf26063bc13f6443199cff8c955f16d2edc/src/Compression/CompressedReadBufferBase.cpp#L42
- https://github.com/ClickHouse/ClickHouse/blob/ea6f90b4f2c3cc2f7d8b846c769b7f3e84907e47/src/Compression/CompressionInfo.h#L10
- https://habr.com/ru/company/oleg-bunin/blog/497334/
3 changes: 3 additions & 0 deletions build.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
rem Build the tool for the host platform, then cross-compile it for Linux.
rem The trailing slash makes "go build" treat dist as an output directory.
rem Without it, a missing dist directory turns into a single file named
rem "dist" that the second build overwrites.
go build -o ./dist/
set GOOS=linux
go build -o ./dist/
185 changes: 185 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
package main

import (
"encoding/binary"
"flag"
"fmt"
"github.com/ClickHouse/clickhouse-go/lib/cityhash102"
"io"
"os"
)

// ClickHouseChecksum is the 128-bit checksum that precedes each compressed
// block in the data file, kept as two 64-bit halves in the order they are
// decoded from the file.
type ClickHouseChecksum struct {
	First, Second uint64
}

// String renders the checksum as exactly 32 lowercase hexadecimal digits:
// First followed by Second.
//
// Each half is zero-padded to 16 digits. Without the padding a half with
// leading zero nibbles is printed shorter, which makes the boundary between
// the halves ambiguous and stops the output from matching the "Reference"
// value shown in ClickHouse's own error message.
func (c ClickHouseChecksum) String() string {
	return fmt.Sprintf("%016x%016x", c.First, c.Second)
}

// ClickHouseHeader is the fixed-size header that follows the checksum of a
// compressed block. It is decoded field by field with encoding/binary, so
// the field order and widths below define the on-disk layout.
type ClickHouseHeader struct {
	// Method is the one-byte compression method marker.
	Method uint8
	// CompressedSize is the length of the block in bytes; readData treats it
	// as including this header.
	CompressedSize uint32
	// UncompressedSize is decoded from the header but not used by this tool.
	UncompressedSize uint32
}

// Run-wide counters reported when the end of the file is reached.
var (
	// uncorrectedErrors counts blocks whose checksum mismatch could not be
	// explained by a single flipped bit.
	uncorrectedErrors int32
	// correctedErrors counts blocks that were repaired and written back.
	correctedErrors int32
)

// main scans the file named by the first command-line argument block by
// block. For every block it reads the stored checksum, reads the block data,
// and, when the checksum mismatch can be explained by a single flipped bit,
// backs the file up and writes the repaired byte in place.
//
// The loop has no exit condition of its own: readChecksum prints the summary
// and terminates the process when it reaches the end of the file.
func main() {
	flag.Parse()
	srcFile := flag.Arg(0)
	if srcFile == "" {
		// Without this check the empty name reaches os.OpenFile and the user
		// gets a stack trace instead of a hint.
		fmt.Fprintln(os.Stderr, "usage: clickhouse-bitflip filename")
		os.Exit(2)
	}

	f, err := os.OpenFile(srcFile, os.O_RDWR, 0)
	if err != nil {
		// Report the failure the same way the read helpers do.
		fmt.Println("os.OpenFile failed:", err)
		os.Exit(1)
	}
	defer f.Close()

	for {
		expected := readChecksum(f)

		// Remember where the block starts so a repaired byte can be written
		// back at the right absolute offset.
		startPosOfBlock, err := f.Seek(0, io.SeekCurrent)
		if err != nil {
			panic(err)
		}

		// The decoded header is not needed here; only the raw block bytes
		// take part in the checksum.
		data, _ := readData(f)

		bitFlipPos, fixed := fixData(data, expected)
		if fixed {
			// Back up before the first modification; backup skips the copy
			// when a .bak file already exists.
			backup(srcFile)
			fixFile(f, data, int64(bitFlipPos), startPosOfBlock)
		}
	}
}

// fixFile writes the single repaired byte of a block back to the file and
// bumps the corrected-errors counter.
//
// data must already contain the corrected bit. bitPos is the position of
// that bit within data, and startPosOfBlock is the file offset at which data
// begins. A write failure panics.
func fixFile(f *os.File, data []byte, bitPos int64, startPosOfBlock int64) {
	byteIdx := bitPos / 8
	offset := startPosOfBlock + byteIdx

	repaired := data[byteIdx]
	// Flipping the corrected bit once more reconstructs the value that is
	// currently on disk, purely for the log line.
	damaged := repaired ^ 1<<(bitPos%8)

	fmt.Printf(" Fixing in file at byte %v. Old value: 0x%x. New value: 0x%x\n", offset, damaged, repaired)

	if _, err := f.WriteAt(data[byteIdx:byteIdx+1], offset); err != nil {
		panic(err)
	}
	correctedErrors++
}

// fixData checks data against the expected checksum and, on mismatch, tries
// every single-bit flip until one makes the checksum match.
//
// It returns the position of the flipped bit and true when such a flip is
// found; data is then left in its repaired state. It returns (0, false) both
// when the checksum already matches and when no single flip helps; in the
// latter case data is restored to its original contents and the
// uncorrected-errors counter is incremented.
func fixData(data []byte, expected ClickHouseChecksum) (int, bool) {
	matches, actual := compareChecksum(data, expected)
	if matches {
		return 0, false
	}
	fmt.Printf("Checksum doesn't match: corrupted data. Reference: %v. Actual: %v. Size of compressed block: %v\n", expected, actual, len(data))

	totalBits := len(data) * 8
	for candidate := 0; candidate < totalBits; candidate++ {
		flipBit(data, candidate)
		if ok, _ := compareChecksum(data, expected); ok {
			fmt.Println(" The mismatch is caused by single bit flip in data block at byte", candidate/8, ", bit", candidate%8)
			return candidate, true
		}
		// Not the culprit: undo the flip before trying the next bit.
		flipBit(data, candidate)
	}

	uncorrectedErrors++
	fmt.Println(" Error: the mismatch is not caused by single bit flip. It can't be fixed!")
	return 0, false
}

// flipBit inverts the bit at bit position pos in data, counting bits from
// the least significant bit of data[0].
func flipBit(data []byte, pos int) {
	byteIdx, mask := pos/8, byte(1)<<(pos%8)
	data[byteIdx] ^= mask
}

// readChecksum decodes the next little-endian checksum from f.
//
// Reaching the end of the file here is the normal way for the program to
// finish: the function prints the run summary and exits with status 0. Any
// other read error is reported and the process exits with status 1.
func readChecksum(f *os.File) ClickHouseChecksum {
	var sum ClickHouseChecksum

	switch err := binary.Read(f, binary.LittleEndian, &sum); {
	case err == io.EOF:
		if uncorrectedErrors != 0 || correctedErrors != 0 {
			fmt.Println("Completed. Corrected errors:", correctedErrors, ". Uncorrected errors", uncorrectedErrors)
		} else {
			fmt.Println("No errors")
		}
		os.Exit(0)
	case err != nil:
		fmt.Println("binary.Read failed:", err)
		os.Exit(1)
	}

	return sum
}

// readData reads one compressed block from f, starting at the block header.
//
// It decodes the header to learn the block length, rewinds to the start of
// the header (the checksum covers the header too), and reads CompressedSize
// bytes. It returns the raw block bytes, header included, together with the
// decoded header.
//
// Any failure is reported on standard output and terminates the process with
// status 1: an unreadable header, a CompressedSize smaller than the header
// itself, a failed seek, or a file that ends before the block does.
func readData(f *os.File) (data []byte, header ClickHouseHeader) {
	err := binary.Read(f, binary.LittleEndian, &header)
	if err != nil {
		fmt.Println("binary.Read failed:", err)
		os.Exit(1)
	}

	headerSize := binary.Size(header)
	// The block length includes the header, so a smaller value can only come
	// from a damaged header. Continuing would desynchronise the scan and
	// misread every following block.
	if int64(header.CompressedSize) < int64(headerSize) {
		fmt.Printf("error: invalid compressed size in block header: %v, expected at least %v\n", header.CompressedSize, headerSize)
		os.Exit(1)
	}

	// Go back to the start of the header: the checksum contains the header.
	if _, err = f.Seek(-int64(headerSize), io.SeekCurrent); err != nil {
		fmt.Println("f.Seek failed:", err)
		os.Exit(1)
	}

	data = make([]byte, header.CompressedSize)
	// io.ReadFull keeps reading until the buffer is full, so a short read is
	// never mistaken for a complete block.
	n, err := io.ReadFull(f, data)
	if err == io.EOF || err == io.ErrUnexpectedEOF {
		fmt.Printf("error: file is clipped, expected compressed size: %v, actual: %v\n", header.CompressedSize, n)
		os.Exit(1)
	}
	if err != nil {
		fmt.Println("f.Read failed:", err)
		os.Exit(1)
	}

	return data, header
}

// backup copies src to src+".bak" unless that backup already exists, so the
// copy made before the first repair is never overwritten by later ones.
//
// It panics when the backup cannot be verified or created. The caller
// modifies src right afterwards, so continuing without a backup would risk
// the only copy of the data.
func backup(src string) {
	dst := src + ".bak"

	_, err := os.Stat(dst)
	if err == nil {
		// A backup is already in place.
		return
	}
	if !os.IsNotExist(err) {
		// The state of the backup is unknown (for example permission
		// denied); do not silently proceed without one.
		panic(err)
	}

	fmt.Println(" backup file to", dst)
	if _, err := copyFile(src, dst); err != nil {
		panic(err)
	}
}

// copyFile copies the regular file src to dst, creating or truncating dst,
// and returns the number of bytes copied.
//
// It returns an error when src cannot be inspected or opened, is not a
// regular file, dst cannot be created, or copying or closing dst fails. When
// the copy itself fails, the partially written dst is removed so that it
// cannot be mistaken for a complete copy later.
func copyFile(src string, dst string) (int64, error) {
	info, err := os.Stat(src)
	if err != nil {
		return 0, err
	}
	if !info.Mode().IsRegular() {
		return 0, fmt.Errorf("%s is not a regular file", src)
	}

	source, err := os.Open(src)
	if err != nil {
		return 0, err
	}
	defer source.Close()

	destination, err := os.Create(dst)
	if err != nil {
		return 0, err
	}

	nBytes, err := io.Copy(destination, source)
	// Close explicitly rather than via defer: a failed close can mean the
	// data never reached the disk, so its error must not be dropped.
	if closeErr := destination.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		os.Remove(dst)
		return nBytes, err
	}
	return nBytes, nil
}

// compareChecksum hashes data with CityHash128 from the cityhash102 package
// and compares the result with expected.
//
// result reports whether the two checksums are equal; actual is the freshly
// computed checksum, returned so callers can print it.
func compareChecksum(data []byte, expected ClickHouseChecksum) (result bool, actual ClickHouseChecksum) {
	hash := cityhash102.CityHash128(data, uint32(len(data)))
	actual = ClickHouseChecksum{First: hash[0], Second: hash[1]}
	return actual == expected, actual
}

0 comments on commit 42ea226

Please sign in to comment.