Skip to content

Commit

Permalink
new tricks to get back performance
Browse files Browse the repository at this point in the history
  • Loading branch information
yihuang committed Feb 25, 2023
1 parent 3f726af commit 2acd4c9
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 178 deletions.
49 changes: 28 additions & 21 deletions memiavl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ It also integrates well with versiondb, because versiondb can also be derived fr
### Change Set File

```
version: int64
size: int64 // size of whole payload
version: 8
size: 8 // size of whole payload
payload:
delete: int8
delete: 1
keyLen: varint-uint64
key
[ // if delete is false
Expand All @@ -67,45 +67,52 @@ IAVL snapshot is composed by four files:
- `metadata`, 16bytes:

```
magic: uint32
format: uint32
version: uint32
root node index: uint32
magic: 4
format: 4
version: 4
root node index: 4
```

- `nodes`, array of fixed size(16+32bytes) nodes, the node format is like this:

```
height : uint32 // padded to 4bytes
version : uint32
size : uint32
key_node : uint32
# branch
height : 1
_padding : 3
version : 4
size : 4
key node : 4
hash : [32]byte
# leaf
height : 1
key len : 3
key offset : 8
value index : 4
```
The node has fixed length, can be indexed directly. The nodes reference each other with the node index, nodes are written in post-order, so the root node is always placed at the end.

For branch node, the `key_node` field reference the smallest leaf node in the right branch, for leaf node, it's the leaf index, which can be used to find key and value in `keys` and `values` file.
For a branch node, the `key node` field references the smallest leaf node in the right branch, and the key slice is fetched from there indirectly. Leaf nodes store the key slice and value index information, but their version field is stored in the `keys` file instead.

The branch node don't need to reference left/right children explicitly, they can be derived from existing information and properties of post-order traversal:
The branch node's left/right child node indexes are inferred from existing information and the properties of post-order traversal:

```
right child index = self index - 1
left child index = key_node - 1
left child index = key node - 1
```

The version/size/node indexes are encoded with `uint32`, should be enough in foreseeable future, but could be changed to `uint64` in the future.
The version/size/node indexes are encoded with 4 bytes, should be enough in foreseeable future, but could be changed to more bytes in the future.

The implementation will read the mmap-ed content in a zero-copy way, won't use extra node cache, it will only rely on the OS page cache.

- `keys`, sequence of leaf node keys, ordered and no duplication, the offsets are encoded with custom format and appended to the end of the file, support query by leaf node index.
- `keys`, sequence of leaf node keys, ordered and without duplication. The offsets are encoded with elias-fano coding and appended to the end of the file, which supports queries by leaf node index. The offsets table is not used currently because the leaf nodes in the tree also reference the offsets.

```
payload
*repeat*
offset restart: uint64
delta offsets: [65535]uint32
version: 4 // the version field of corresponding leaf node
key
*repeat*
offset: uint64 // beginning offset of the above table
offsets table encoded with elias-fano coding
offset: uint64 // beginning offset of the offsets table
```

- `values`, sequence of leaf node values, the offsets are encoded with elias-fano coding and appended to the end of the file, support query by leaf node index.
Expand Down
40 changes: 10 additions & 30 deletions memiavl/layout_little_endian.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ func (node *NodeLayout) Size() uint32 {
return binary.LittleEndian.Uint32(node.data[OffsetSize : OffsetSize+4])
}

// KeyOffset decodes the key offset field: the 8 little-endian bytes that
// start at OffsetKeyOffset in the node's raw data.
func (node *NodeLayout) KeyOffset() uint64 {
	field := node.data[OffsetKeyOffset : OffsetKeyOffset+8]
	return binary.LittleEndian.Uint64(field)
}

// KeySlice returns the key offset (see KeyOffset) and the key length.
// The length is assembled from the three little-endian bytes that start
// at OffsetKeyLen.
func (node *NodeLayout) KeySlice() (uint64, uint32) {
	// Index the last byte of the node once up front, before any field is read.
	_ = node.data[SizeNode-1]

	low := uint32(node.data[OffsetKeyLen])
	mid := uint32(node.data[OffsetKeyLen+1])
	high := uint32(node.data[OffsetKeyLen+2])
	keyLen := low | mid<<8 | high<<16

	return node.KeyOffset(), keyLen
}

// KeyNode decodes the key node field: the 4 little-endian bytes that start
// at OffsetKeyNode in the node's raw data.
func (node *NodeLayout) KeyNode() uint32 {
	field := node.data[OffsetKeyNode : OffsetKeyNode+4]
	return binary.LittleEndian.Uint32(field)
}
Expand All @@ -48,33 +58,3 @@ func (node *NodeLayout) LeafIndex() uint32 {
// Hash returns the SizeHash bytes that start at OffsetHash. The result is a
// sub-slice of the node's data, not a copy.
func (node *NodeLayout) Hash() []byte {
	end := OffsetHash + SizeHash
	return node.data[OffsetHash:end]
}

// PlainOffsetTable wraps the raw bytes of an offsets table and decodes
// entries from them on demand (see Get2).
type PlainOffsetTable struct {
// data is the undecoded table content; integers in it are read as little endian.
data []byte
}

// Get2 returns the offset of entry i and the offset of the entry after it.
//
// The table is read in chunks of (OffsetRestartInteval+1) 4-byte words. Each
// chunk opens with an 8-byte little-endian restart offset, followed by 4-byte
// deltas that are added to that restart offset.
func (t PlainOffsetTable) Get2(i uint64) (uint64, uint64) {
	chunk := i / OffsetRestartInteval
	pos := i % OffsetRestartInteval
	buf := t.data[chunk*(OffsetRestartInteval+1)*4:]

	// Index the 12th byte first: the restart offset and the first delta are both present.
	_ = buf[3*4-1]
	base := binary.LittleEndian.Uint64(buf[:8])

	switch {
	case pos == 0:
		// The first entry of a chunk starts exactly at the restart offset.
		firstDelta := binary.LittleEndian.Uint32(buf[8:12])
		return base, base + uint64(firstDelta)
	case pos == OffsetRestartInteval-1:
		// The following entry is the restart offset of the next chunk.
		lastDelta := binary.LittleEndian.Uint32(buf[OffsetRestartInteval*4:])
		nextBase := binary.LittleEndian.Uint64(buf[(OffsetRestartInteval+1)*4:])
		return base + uint64(lastDelta), nextBase
	default:
		// Both entries are deltas inside the current chunk.
		startDelta := binary.LittleEndian.Uint32(buf[(pos+1)*4:])
		endDelta := binary.LittleEndian.Uint32(buf[(pos+2)*4:])
		return base + uint64(startDelta), base + uint64(endDelta)
	}
}

// NewPlainOffsetTable wraps data in a PlainOffsetTable without copying or
// validating it. The returned error is always nil.
func NewPlainOffsetTable(data []byte) (PlainOffsetTable, error) {
	table := PlainOffsetTable{data: data}
	return table, nil
}
45 changes: 8 additions & 37 deletions memiavl/layout_native.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ func (node *NodeLayout) Size() uint32 {
return node.data[2]
}

// KeyOffset combines two adjacent words of the node into a 64-bit key offset:
// data[1] supplies the low 32 bits and data[2] the high 32 bits.
func (node *NodeLayout) KeyOffset() uint64 {
	low := uint64(node.data[1])
	high := uint64(node.data[2])
	return low | high<<32
}

// KeySlice returns the key offset (see KeyOffset) and the key length. The
// length is the first word of the node with its lowest 8 bits shifted out.
func (node *NodeLayout) KeySlice() (uint64, uint32) {
	offset := node.KeyOffset()
	keyLen := node.data[0] >> 8
	return offset, keyLen
}

// KeyNode returns the key node field, stored in the fourth word of the node.
func (node *NodeLayout) KeyNode() uint32 {
	keyNode := node.data[3]
	return keyNode
}
Expand All @@ -60,40 +68,3 @@ func (node *NodeLayout) LeafIndex() uint32 {
// Hash returns the node's hash as a slice over the stored hash field; the
// bytes are not copied.
func (node *NodeLayout) Hash() []byte {
	digest := node.hash[:]
	return digest
}

// PlainOffsetTable is an offsets table viewed as a sequence of 32-bit words
// (see Get2 for how entries are looked up).
type PlainOffsetTable struct {
// data holds the table content as uint32 words.
data []uint32
}

// Get2 returns the offset of entry i and the offset of the entry after it.
//
// The table is read in chunks of (OffsetRestartInteval+1) words. The first two
// words of a chunk form a 64-bit restart offset (low word first); the
// remaining words are deltas that are added to that restart offset.
func (t PlainOffsetTable) Get2(i uint64) (uint64, uint64) {
	chunk, pos := i/OffsetRestartInteval, i%OffsetRestartInteval
	words := t.data[chunk*(OffsetRestartInteval+1):]

	// Index the third word first: the restart offset and the first delta are both present.
	_ = words[2]
	base := uint64(words[0]) | uint64(words[1])<<32

	switch {
	case pos == 0:
		// The first entry of a chunk starts exactly at the restart offset.
		return base, base + uint64(words[2])
	case pos == OffsetRestartInteval-1:
		// The following entry is the restart offset of the next chunk.
		next := words[OffsetRestartInteval+1:]
		_ = next[1]
		return base + uint64(words[OffsetRestartInteval]), uint64(next[0]) | uint64(next[1])<<32
	default:
		// Both entries are deltas inside the current chunk.
		return base + uint64(words[pos+1]), base + uint64(words[pos+2])
	}
}

// NewPlainOffsetTable reinterprets buf as a table of uint32 words without
// copying it. It returns an error when the start of buf is not 4-byte aligned
// or when the length of buf is not a multiple of 4.
func NewPlainOffsetTable(buf []byte) (PlainOffsetTable, error) {
	const wordSize = 4

	base := unsafe.Pointer(unsafe.SliceData(buf))
	switch {
	case uintptr(base)%wordSize != 0:
		return PlainOffsetTable{}, errors.New("input buffer is not aligned")
	case len(buf)%wordSize != 0:
		return PlainOffsetTable{}, errors.New("input buffer length is not correct")
	}

	// The returned table shares memory with buf.
	words := unsafe.Slice((*uint32)(base), len(buf)/wordSize)
	return PlainOffsetTable{data: words}, nil
}
72 changes: 42 additions & 30 deletions memiavl/persisted_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@ package memiavl
import (
"bytes"
"crypto/sha256"
"encoding/binary"
)

const (
OffsetHeight = 0
OffsetVersion = OffsetHeight + 4
OffsetSize = OffsetVersion + 4
OffsetKeyNode = OffsetSize + 4
OffsetLeafIndex = OffsetSize + 4
OffsetHeight = 0
OffsetVersion = OffsetHeight + 4
OffsetSize = OffsetVersion + 4
OffsetKeyNode = OffsetSize + 4

OffsetKeyLen = OffsetHeight + 1
OffsetKeyOffset = OffsetKeyLen + 3
OffsetLeafIndex = OffsetKeyOffset + 8

OffsetHash = OffsetKeyNode + 4
SizeHash = sha256.Size
Expand All @@ -22,17 +26,18 @@ const (
// Encoding format (all integers are encoded in little endian):
//
// Branch node:
// - height : uint32
// - version : uint32
// - size : uint32
// - key node : uint32 // node index of the smallest leaf in right branch
// - hash : [32]byte
// - height : 1
// - _padding : 3
// - version : 4
// - size : 4
// - key node : 4 // node index of the smallest leaf in right branch
// - hash : 32
// Leaf node:
// - height : uint32
// - version : uint32
// - size : uint32
// - leaf index : uint32 // can index both key and value
// - hash : [32]byte
// - height : 1
// - key len : 3
// - key offset : 8
// - value index : 4
// - hash : 32
type PersistedNode struct {
snapshot *Snapshot
index uint32
Expand All @@ -49,22 +54,30 @@ func (node PersistedNode) Height() uint8 {
}

func (node PersistedNode) Version() uint32 {
return node.data().Version()
data := node.data()
if data.Height() != 0 {
return data.Version()
}

offset := data.KeyOffset()
return binary.LittleEndian.Uint32(node.snapshot.keys[offset-4 : offset])
}

func (node PersistedNode) Size() int64 {
return int64(node.data().Size())
data := node.data()
if node.Height() == 0 {
return 1
}
return int64(data.Size())
}

func (node PersistedNode) Key() []byte {
var leafIndex uint32
data := node.data()
if data.Height() == 0 {
leafIndex = data.LeafIndex()
} else {
leafIndex = node.snapshot.nodesLayout.Node(data.KeyNode()).LeafIndex()
if data.Height() != 0 {
data = node.snapshot.nodesLayout.Node(data.KeyNode())
}
return node.snapshot.Key(uint64(leafIndex))
offset, l := data.KeySlice()
return node.snapshot.keys[offset : offset+uint64(l)]
}

// Value result is not defined for non-leaf node.
Expand Down Expand Up @@ -112,23 +125,22 @@ func (node PersistedNode) Get(key []byte) []byte {
func getPersistedNode(snapshot *Snapshot, index uint32, key []byte) []byte {
nodes := snapshot.nodesLayout
keys := snapshot.keys
keysOffsets := snapshot.keysOffsets

for {
node := nodes.Node(index)
if node.Height() == 0 {
leafKey := node.LeafIndex()
start, end := keysOffsets.Get2(uint64(leafKey))
nodeKey := keys[start:end]
offset, l := node.KeySlice()
nodeKey := keys[offset : offset+uint64(l)]
if bytes.Equal(key, nodeKey) {
return snapshot.Value(uint64(leafKey))
leafIndex := node.LeafIndex()
return snapshot.Value(uint64(leafIndex))
}
return nil
}

keyNode := node.KeyNode()
start, end := keysOffsets.Get2(uint64(nodes.Node(keyNode).LeafIndex()))
nodeKey := keys[start:end]
offset, l := nodes.Node(keyNode).KeySlice()
nodeKey := keys[offset : offset+uint64(l)]
if bytes.Compare(key, nodeKey) == -1 {
// left child
index = keyNode - 1
Expand Down
Loading

0 comments on commit 2acd4c9

Please sign in to comment.