Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: auto-shard based on node size #171

Merged
merged 6 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/ipfs-unixfs-importer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ The input's file paths and directory structure will be preserved in the [`dag-pb
`options` is an JavaScript option that might include the following keys:

- `wrapWithDirectory` (boolean, defaults to false): if true, a wrapping node will be created
- `shardSplitThreshold` (positive integer, defaults to 1000): the number of directory entries above which we decide to use a sharding directory builder (instead of the default flat one)
- `shardSplitThreshold` (positive integer, defaults to 256KiB): if the serialized node is larger than this it will be converted to a HAMT sharded directory
achingbrain marked this conversation as resolved.
Show resolved Hide resolved
- `chunker` (string, defaults to `"fixed"`): the chunking strategy. Supports:
- `fixed`
- `rabin`
Expand Down
40 changes: 39 additions & 1 deletion packages/ipfs-unixfs-importer/src/dir-flat.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { encode, prepare } from '@ipld/dag-pb'
import { UnixFS } from 'ipfs-unixfs'
import Dir from './dir.js'
import { Dir, CID_V0, CID_V1 } from './dir.js'
import persist from './utils/persist.js'

/**
Expand Down Expand Up @@ -32,6 +32,7 @@ class DirFlat extends Dir {
async put (name, value) {
this.cid = undefined
this.size = undefined
this.nodeSize = undefined

this._children[name] = value
}
Expand Down Expand Up @@ -68,6 +69,43 @@ class DirFlat extends Dir {
}
}

calculateNodeSize () {
if (this.nodeSize !== undefined) {
return this.nodeSize
}

const links = []

for (const name of Object.keys(this._children)) {
achingbrain marked this conversation as resolved.
Show resolved Hide resolved
const child = this._children[name]
let size

if (child instanceof Dir) {
size = child.calculateNodeSize()
} else {
size = child.size
}

if (child.size != null && child.cid) {
links.push({
Name: name,
Tsize: size,
Hash: this.options.cidVersion === 0 ? CID_V0 : CID_V1
})
}
}

const unixfs = new UnixFS({
type: 'directory',
mtime: this.mtime,
mode: this.mode
})

this.nodeSize = encode(prepare({ Data: unixfs.marshal(), Links: links })).length

return this.nodeSize
}

/**
* @param {Blockstore} block
* @returns {AsyncIterable<ImportResult>}
Expand Down
91 changes: 89 additions & 2 deletions packages/ipfs-unixfs-importer/src/dir-sharded.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { encode, prepare } from '@ipld/dag-pb'
import { UnixFS } from 'ipfs-unixfs'
import Dir from './dir.js'
import { Dir, CID_V0, CID_V1 } from './dir.js'
import persist from './utils/persist.js'
import { createHAMT, Bucket } from 'hamt-sharding'

Expand Down Expand Up @@ -35,6 +35,10 @@ class DirSharded extends Dir {
* @param {InProgressImportResult | Dir} value
*/
async put (name, value) {
this.cid = undefined
this.size = undefined
this.nodeSize = undefined

await this._bucket.put(name, value)
}

Expand Down Expand Up @@ -66,6 +70,16 @@ class DirSharded extends Dir {
}
}

calculateNodeSize () {
if (this.nodeSize !== undefined) {
return this.nodeSize
}

this.nodeSize = calculateSize(this._bucket, this, this.options)

return this.nodeSize
}

/**
* @param {Blockstore} blockstore
* @returns {AsyncIterable<ImportResult>}
Expand All @@ -85,7 +99,7 @@ export default DirSharded
/**
* @param {Bucket<?>} bucket
* @param {Blockstore} blockstore
* @param {*} shardRoot
* @param {DirSharded | null} shardRoot
* @param {ImporterOptions} options
* @returns {AsyncIterable<ImportResult>}
*/
Expand Down Expand Up @@ -183,3 +197,76 @@ async function * flush (bucket, blockstore, shardRoot, options) {
size
}
}

/**
* @param {Bucket<?>} bucket
* @param {DirSharded | null} shardRoot
* @param {ImporterOptions} options
*/
function calculateSize (bucket, shardRoot, options) {
const children = bucket._children
const links = []

for (let i = 0; i < children.length; i++) {
const child = children.get(i)

if (!child) {
continue
}

const labelPrefix = i.toString(16).toUpperCase().padStart(2, '0')

if (child instanceof Bucket) {
const size = calculateSize(child, null, options)

links.push({
Name: labelPrefix,
Tsize: size,
Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
})
} else if (typeof child.value.flush === 'function') {
const dir = child.value
const size = dir.nodeSize()

links.push({
Name: labelPrefix + child.key,
Tsize: size,
Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
})
} else {
const value = child.value

if (!value.cid) {
continue
}

const label = labelPrefix + child.key
const size = value.size

links.push({
Name: label,
Tsize: size,
Hash: value.cid
})
}
}

// go-ipfs uses little endian, that's why we have to
// reverse the bit field before storing it
const data = Uint8Array.from(children.bitField().reverse())
const dir = new UnixFS({
type: 'hamt-sharded-directory',
data,
fanout: bucket.tableSize(),
hashType: options.hamtHashCode,
mtime: shardRoot && shardRoot.mtime,
mode: shardRoot && shardRoot.mode
})

const buffer = encode(prepare({
Data: dir.marshal(),
Links: links
}))

return buffer.length
}
21 changes: 18 additions & 3 deletions packages/ipfs-unixfs-importer/src/dir.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { CID } from 'multiformats/cid'

/**
* @typedef {import('./types').ImporterOptions} ImporterOptions
* @typedef {import('./types').ImportResult} ImportResult
* @typedef {import('./types').InProgressImportResult} InProgressImportResult
* @typedef {import('interface-blockstore').Blockstore} Blockstore
* @typedef {import('multiformats/cid').CID} CID
*
* @typedef {object} DirProps
* @property {boolean} root
Expand All @@ -17,7 +18,7 @@
* @property {number} [mode]
* @property {import('ipfs-unixfs').Mtime} [mtime]
*/
class Dir {
export class Dir {
/**
* @param {DirProps} props
* @param {ImporterOptions} options
Expand All @@ -40,6 +41,8 @@ class Dir {
this.cid = undefined
/** @type {number | undefined} */
this.size = undefined
/** @type {number | undefined} */
this.nodeSize = undefined
}

/**
Expand All @@ -66,6 +69,18 @@ class Dir {
* @returns {AsyncIterable<ImportResult>}
*/
async * flush (blockstore) { }

/**
* @returns {number}
*/
calculateNodeSize () {
return 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the abstract superclass of dir-flat and dir-sharded - this returns 0 to appease TypeScript, the implementation in the concrete subclasses actually do the work.

This will be refactored in a future PR to be an interface so it'll look less weird.

}
}

export default Dir
// we use these to calculate the node size to use as a check for whether a directory
// should be sharded or not. Since CIDs have a constant length and We're only
// interested in the data length and not the actual content identifier we can use
// any old CID instead of having to hash the data which is expensive.
export const CID_V0 = CID.parse('QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn')
export const CID_V1 = CID.parse('zdj7WbTaiJT1fgatdet9Ei9iDB5hdCxkbVyhyh8YTUnXMiwYi')
4 changes: 2 additions & 2 deletions packages/ipfs-unixfs-importer/src/flat-to-shard.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import DirSharded from './dir-sharded.js'
import DirFlat from './dir-flat.js'

/**
* @typedef {import('./dir').default} Dir
* @typedef {import('./dir').Dir} Dir
* @typedef {import('./types').ImporterOptions} ImporterOptions
*/

Expand All @@ -16,7 +16,7 @@ import DirFlat from './dir-flat.js'
async function flatToShard (child, dir, threshold, options) {
let newDir = dir

if (dir instanceof DirFlat && dir.directChildrenCount() >= threshold) {
if (dir instanceof DirFlat && dir.calculateNodeSize() > threshold) {
newDir = await convertToShard(dir, options)
}

Expand Down
3 changes: 2 additions & 1 deletion packages/ipfs-unixfs-importer/src/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ const defaultOptions = {
leafType: 'file', // 'raw'
cidVersion: 0,
progress: () => () => {},
shardSplitThreshold: 1000,
// https://github.com/ipfs/go-ipfs/pull/8114/files#diff-eec963b47a6e1080d9d8023b4e438e6e3591b4154f7379a7e728401d2055374aR319
shardSplitThreshold: 262144,
fileImportConcurrency: 50,
blockWriteConcurrency: 10,
minChunkSize: 262144,
Expand Down
2 changes: 1 addition & 1 deletion packages/ipfs-unixfs-importer/src/tree-builder.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import DirFlat from './dir-flat.js'
import flatToShard from './flat-to-shard.js'
import Dir from './dir.js'
import { Dir } from './dir.js'
import toPathComponents from './utils/to-path-components.js'

/**
Expand Down
4 changes: 2 additions & 2 deletions packages/ipfs-unixfs/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const DEFAULT_FILE_MODE = parseInt('0644', 8)
const DEFAULT_DIRECTORY_MODE = parseInt('0755', 8)

/**
* @param {string | number | undefined} [mode]
* @param {string | number | null | undefined} [mode]
*/
export function parseMode (mode) {
if (mode == null) {
Expand Down Expand Up @@ -161,7 +161,7 @@ class UnixFS {
* @param {number} [options.hashType]
* @param {number} [options.fanout]
* @param {MtimeLike | null} [options.mtime]
* @param {number | string} [options.mode]
* @param {number | string | null} [options.mode]
*/
constructor (options = {
type: 'file'
Expand Down