This repository has been archived by the owner on Aug 12, 2020. It is now read-only.

feat: support --raw-leaves #219

Merged · 1 commit · Jul 19, 2018

3 changes: 2 additions & 1 deletion README.md
@@ -149,7 +149,8 @@ The input's file paths and directory structure will be preserved in the [`dag-pb`
- `onlyHash` (boolean, defaults to false): Only chunk and hash - do not write to disk
- `hashAlg` (string): multihash hashing algorithm to use
- `cidVersion` (integer, default 0): the CID version to use when storing the data (storage keys are based on the CID, _including_ its version)
- `rawLeafNodes` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will be marked as `raw` `unixfs` nodes
- `rawLeaves` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will not be wrapped in `UnixFS` protobufs and will instead contain the raw file bytes
Contributor: Document leafType here?

Collaborator (Author): Done!

- `leafType` (string, defaults to `'file'`) what type of UnixFS node leaves should be - can be `'file'` or `'raw'` (ignored when `rawLeaves` is `true`)
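
A minimal usage sketch of these two options, modelled on the test added in this PR (assumes `ipld` is an initialized IPLD instance and `importer` is this module's importer; the file name and content are illustrative):

```js
const pull = require('pull-stream')

pull(
  pull.values([{
    path: 'hello.txt',
    content: pull.values([Buffer.from('hello world')])
  }]),
  importer(ipld, {
    rawLeaves: true, // leaves become CIDv1 'raw' blocks with no UnixFS wrapper
    leafType: 'file' // ignored here because rawLeaves is true
  }),
  pull.collect((err, files) => {
    if (err) throw err
    console.log(files[0].multihash, files[0].size)
  })
)
```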

### Exporter

77 changes: 52 additions & 25 deletions src/builder/builder.js
Expand Up @@ -8,6 +8,7 @@ const parallel = require('async/parallel')
const waterfall = require('async/waterfall')
const dagPB = require('ipld-dag-pb')
const CID = require('cids')
const multihash = require('multihashing-async')

const reduce = require('./reduce')

@@ -17,10 +18,13 @@ const defaultOptions = {
chunkerOptions: {
maxChunkSize: 262144
},
rawLeafNodes: false
rawLeaves: false,
hashAlg: 'sha2-256',
leafType: 'file',
cidVersion: 0
}

module.exports = function (createChunker, ipld, createReducer, _options) {
module.exports = function builder (createChunker, ipld, createReducer, _options) {
const options = extend({}, defaultOptions, _options)

return function (source) {
@@ -62,15 +66,13 @@ module.exports = function (createChunker, ipld, createReducer, _options) {
waterfall([
(cb) => DAGNode.create(d.marshal(), [], options.hashAlg, cb),
(node, cb) => {
if (options.onlyHash) return cb(null, node)

let cid = new CID(node.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return cb(null, node)
}

ipld.put(node, { cid }, (err) => cb(err, node))
ipld.put(node, {
cid: new CID(options.cidVersion, 'dag-pb', node.multihash)
}, (err) => cb(err, node))
}
], (err, node) => {
if (err) {
@@ -97,7 +99,6 @@

let previous
let count = 0
const leafType = options.rawLeafNodes ? 'raw' : 'file'

pull(
file.content,
@@ -108,30 +109,56 @@
}
return Buffer.from(chunk)
}),
pull.map(buffer => new UnixFS(leafType, buffer)),
pull.asyncMap((fileNode, callback) => {
DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => {
callback(err, { DAGNode: node, fileNode: fileNode })
pull.asyncMap((buffer, callback) => {
if (options.rawLeaves) {
return multihash(buffer, options.hashAlg, (error, hash) => {
if (error) {
return callback(error)
}

return callback(null, {
multihash: hash,
size: buffer.length,
leafSize: buffer.length,
cid: new CID(1, 'raw', hash),
data: buffer
})
})
}

const file = new UnixFS(options.leafType, buffer)

DAGNode.create(file.marshal(), [], options.hashAlg, (err, node) => {
if (err) {
return callback(err)
}

callback(null, {
multihash: node.multihash,
size: node.size,
leafSize: file.fileSize(),
cid: new CID(options.cidVersion, 'dag-pb', node.multihash),
data: node
})
})
}),
pull.asyncMap((leaf, callback) => {
if (options.onlyHash) return callback(null, leaf)

let cid = new CID(leaf.DAGNode.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return callback(null, leaf)
}

ipld.put(leaf.DAGNode, { cid }, (err) => callback(err, leaf))
ipld.put(leaf.data, {
cid: leaf.cid
}, (error) => callback(error, leaf))
}),
pull.map((leaf) => {
return {
path: file.path,
multihash: leaf.DAGNode.multihash,
size: leaf.DAGNode.size,
leafSize: leaf.fileNode.fileSize(),
name: ''
multihash: leaf.multihash,
size: leaf.size,
leafSize: leaf.leafSize,
name: '',
cid: leaf.cid
}
}),
through( // mark as single node if only one single node
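
The heart of the change above: when `rawLeaves` is set, each chunk is hashed directly and addressed as a CIDv1 `raw` block instead of being marshalled into a UnixFS protobuf. The same path in standalone form (a sketch using a hypothetical `rawLeaf` helper; `multihashing-async` and `cids` are the dependencies required at the top of the file):

```js
const multihash = require('multihashing-async')
const CID = require('cids')

function rawLeaf (buffer, hashAlg, callback) {
  multihash(buffer, hashAlg, (error, hash) => {
    if (error) {
      return callback(error)
    }

    // a raw leaf is always CIDv1 with the 'raw' codec; the buffer itself is
    // the block, so both size and leafSize are just the byte length
    callback(null, {
      multihash: hash,
      size: buffer.length,
      leafSize: buffer.length,
      cid: new CID(1, 'raw', hash),
      data: buffer
    })
  })
}
```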
68 changes: 40 additions & 28 deletions src/builder/reduce.js
@@ -8,12 +8,12 @@ const CID = require('cids')
const DAGLink = dagPB.DAGLink
const DAGNode = dagPB.DAGNode

module.exports = function (file, ipld, options) {
module.exports = function reduce (file, ipld, options) {
return function (leaves, callback) {
if (leaves.length === 1 && leaves[0].single && options.reduceSingleLeafToSelf) {
const leaf = leaves[0]

if (!options.rawLeafNodes) {
if (options.leafType === 'file' && !options.rawLeaves) {
return callback(null, {
path: file.path,
multihash: leaf.multihash,
@@ -23,33 +23,37 @@ module.exports = function (file, ipld, options) {
})
}

// we are using raw leaf nodes, this file only has one node but it'll be marked raw
// so convert it back to a file node
// we're using raw leaf nodes so we convert the node into a UnixFS `file` node.
return waterfall([
(cb) => ipld.get(new CID(leaf.multihash), cb),
(cb) => ipld.get(leaf.cid, cb),
(result, cb) => {
const meta = UnixFS.unmarshal(result.value.data)
const fileNode = new UnixFS('file', meta.data)
const data = result.value.data
const fileNode = new UnixFS('file', data)

DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => {
cb(err, { DAGNode: node, fileNode: fileNode })
DAGNode.create(fileNode.marshal(), [], options.hashAlg, (error, node) => {
cb(error, { DAGNode: node, fileNode: fileNode })
Contributor: Nitpick, why s/err/error?

Collaborator (Author): Personal preference really - I think short variable names make the code less expressive.
})
},
(result, cb) => {
if (options.onlyHash) {
return cb(null, result)
}

let cid = new CID(result.DAGNode.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
}

ipld.put(result.DAGNode, { cid }, (err) => cb(err, result))
ipld.put(result.DAGNode, { cid }, (error) => cb(error, result))
},
(result, cb) => {
cb(null, {
path: file.path,
multihash: result.DAGNode.multihash,
size: result.DAGNode.size,
leafSize: result.fileNode.fileSize(),
name: ''
name: leaf.name
})
}
], callback)
@@ -61,37 +65,45 @@ module.exports = function (file, ipld, options) {
const links = leaves.map((leaf) => {
f.addBlockSize(leaf.leafSize)

return new DAGLink(leaf.name, leaf.size, leaf.multihash)
let cid = leaf.cid

if (!cid) {
// we are an intermediate node
cid = new CID(options.cidVersion, 'dag-pb', leaf.multihash)
}

return new DAGLink(leaf.name, leaf.size, cid.buffer)
})

waterfall([
(cb) => DAGNode.create(f.marshal(), links, options.hashAlg, cb),
(node, cb) => {
if (options.onlyHash) return cb(null, node)
const cid = new CID(options.cidVersion, 'dag-pb', node.multihash)

let cid = new CID(node.multihash)

if (options.cidVersion === 1) {
cid = cid.toV1()
if (options.onlyHash) {
return cb(null, {
node, cid
})
}

ipld.put(node, { cid }, (err) => cb(err, node))
ipld.put(node, {
cid
}, (error) => cb(error, {
node, cid
}))
}
], (err, node) => {
if (err) {
callback(err)
return // early
], (error, result) => {
if (error) {
return callback(error)
}

const root = {
callback(null, {
name: '',
path: file.path,
multihash: node.multihash,
size: node.size,
multihash: result.cid.buffer,
size: result.node.size,
leafSize: f.fileSize()
}

callback(null, root)
})
})
}
}
10 changes: 10 additions & 0 deletions src/exporter/file.js
@@ -63,6 +63,11 @@ function streamBytes (dag, node, fileSize, offset, length) {

function getData ({ node, start }) {
try {
if (Buffer.isBuffer(node)) {
// this is a raw node
return extractDataFromBlock(node, start, offset, end)
}

const file = UnixFS.unmarshal(node.data)

if (!file.data) {
@@ -80,6 +85,11 @@ function streamBytes (dag, node, fileSize, offset, length) {
let streamPosition = 0

function visitor ({ node }) {
if (Buffer.isBuffer(node)) {
// this is a raw node
return pull.empty()
}

const file = UnixFS.unmarshal(node.data)
const nodeHasData = Boolean(file.data && file.data.length)

20 changes: 19 additions & 1 deletion src/importer/index.js
@@ -15,11 +15,29 @@ const chunkers = {

const defaultOptions = {
chunker: 'fixed',
rawLeafNodes: false
rawLeaves: false,
hashOnly: false,
cidVersion: 0,
hash: null,
leafType: 'file',
hashAlg: 'sha2-256'
}

module.exports = function (ipld, _options) {
const options = Object.assign({}, defaultOptions, _options)

if (options.cidVersion > 0 && _options.rawLeaves === undefined) {
// if the cid version is 1 or above, use raw leaves as this is
// what go does.
options.rawLeaves = true
}

if (_options && _options.hash !== undefined && _options.rawLeaves === undefined) {
// if a non-default hash alg has been specified, use raw leaves as this is
// what go does.
options.rawLeaves = true
}

const Chunker = chunkers[options.chunker]
assert(Chunker, 'Unknown chunker named ' + options.chunker)

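
To make the defaulting above concrete (a sketch; `'sha2-512'` is just an illustrative non-default value for the `hash` option):

```js
// an explicit option always wins
importer(ipld, { cidVersion: 1, rawLeaves: false }) // rawLeaves stays false

// cidVersion > 0 with rawLeaves unset implies raw leaves, as go does
importer(ipld, { cidVersion: 1 })                   // rawLeaves becomes true

// likewise when a non-default hash is specified
importer(ipld, { hash: 'sha2-512' })                // rawLeaves becomes true
```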
31 changes: 31 additions & 0 deletions test/exporter.js
@@ -422,6 +422,37 @@ module.exports = (repo) => {
)
})

it('exports a large file > 5mb imported with raw leaves', function (done) {
this.timeout(30 * 1000)

pull(
pull.values([{
path: '200Bytes.txt',
content: pull.values([bigFile])
}]),
importer(ipld, {
rawLeaves: true
}),
pull.collect(collected)
)

function collected (err, files) {
expect(err).to.not.exist()
expect(files.length).to.equal(1)

pull(
exporter(files[0].multihash, ipld),
pull.collect((err, files) => {
expect(err).to.not.exist()

expect(bs58.encode(files[0].hash)).to.equal('QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY')

fileEql(files[0], bigFile, done)
})
)
}
})

it('returns an empty stream for dir', (done) => {
const hash = 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn'

32 changes: 32 additions & 0 deletions test/helpers/collect-leaf-cids.js
@@ -0,0 +1,32 @@
'use strict'

const pull = require('pull-stream')
const traverse = require('pull-traverse')
const CID = require('cids')

module.exports = (ipld, multihash, callback) => {
pull(
traverse.depthFirst(new CID(multihash), (cid) => {
return pull(
pull.values([cid]),
pull.asyncMap((cid, callback) => {
ipld.get(cid, (error, result) => {
callback(error, !error && result.value)
})
}),
pull.asyncMap((node, callback) => {
if (!node.links) {
return callback()
}

return callback(
null, node.links.map(link => new CID(link.multihash))
)
}),
pull.filter(Boolean),
pull.flatten()
)
}),
pull.collect(callback)
)
}
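
A hypothetical way this helper might be used in a test (a sketch; `ipld` and `files` as in the exporter tests above, and the expected codec follows from the builder change, where raw leaves get CIDv1 `raw` CIDs):

```js
const collectLeafCids = require('./helpers/collect-leaf-cids')

collectLeafCids(ipld, files[0].multihash, (error, cids) => {
  // inspect the version and codec of every node in the DAG; with
  // rawLeaves: true the leaves should be CIDv1 'raw' blocks
  cids.forEach(cid => console.log(cid.version, cid.codec))
})
```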