From 7a29d833373d9b6d39ae9957074b0b1c75638a52 Mon Sep 17 00:00:00 2001 From: achingbrain Date: Tue, 17 Jul 2018 10:31:51 +0100 Subject: [PATCH] feat: support --raw-leaves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Goes some way towards fixing ipfs/js-ipfs#1432 - will need follow-up PRs for js-ipfs-mfs and js-ipfs itself (🔜). There are three ways of importing a file we need to support and each will end up with a slightly different DAG structure. ipfs add will result in a balanced DAG with leaf nodes that are unixfs nodes of type file ipfs files write results in a trickle DAG with leaf nodes that are unixfs nodes of type raw ipfs add --raw-leaves and ipfs files write --raw-leaves have the balanced/trickle DAG described above, but the leaf nodes are chunks of file data not wrapped in protobufs. In all cases above the root node is a unixfs file node with a v0 CID, unless you specify --cid-version=1. This PR: Changes meaning of existing rawLeaves argument. It now means the leaf node is just data - a chunk of the file; previously it meant a unixfs node with type raw. So far the only code using this is js-ipfs-mfs so changing it shouldn't be too disruptive. Adds a leafType option which can be file or raw - when --raw-leaves is false, this is what the unixfs leaf type will be. 
Uses CIDv1 for raw leaves with the codec raw --- README.md | 3 +- src/builder/builder.js | 77 +++++++++++++++++++++---------- src/builder/reduce.js | 68 ++++++++++++++++----------- src/exporter/file.js | 10 ++++ src/importer/index.js | 20 +++++++- test/exporter.js | 31 +++++++++++++ test/helpers/collect-leaf-cids.js | 32 +++++++++++++ test/importer.js | 45 ++++++++++++++++-- 8 files changed, 227 insertions(+), 59 deletions(-) create mode 100644 test/helpers/collect-leaf-cids.js diff --git a/README.md b/README.md index 9cc0aada..997a6419 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,8 @@ The input's file paths and directory structure will be preserved in the [`dag-pb - `onlyHash` (boolean, defaults to false): Only chunk and hash - do not write to disk - `hashAlg` (string): multihash hashing algorithm to use - `cidVersion` (integer, default 0): the CID version to use when storing the data (storage keys are based on the CID, _including_ it's version) -- `rawLeafNodes` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will be marked as `raw` `unixfs` nodes +- `rawLeaves` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will not be wrapped in `UnixFS` protobufs and will instead contain the raw file bytes +- `leafType` (string, defaults to `'file'`) what type of UnixFS node leaves should be - can be `'file'` or `'raw'` (ignored when `rawLeaves` is `true`) ### Exporter diff --git a/src/builder/builder.js b/src/builder/builder.js index c2e0bc09..2e7cd0a9 100644 --- a/src/builder/builder.js +++ b/src/builder/builder.js @@ -8,6 +8,7 @@ const parallel = require('async/parallel') const waterfall = require('async/waterfall') const dagPB = require('ipld-dag-pb') const CID = require('cids') +const multihash = require('multihashing-async') const reduce = require('./reduce') @@ -17,10 +18,13 @@ const defaultOptions = { chunkerOptions: { maxChunkSize: 262144 }, - 
rawLeafNodes: false + rawLeaves: false, + hashAlg: 'sha2-256', + leafType: 'file', + cidVersion: 0 } -module.exports = function (createChunker, ipld, createReducer, _options) { +module.exports = function builder (createChunker, ipld, createReducer, _options) { const options = extend({}, defaultOptions, _options) return function (source) { @@ -62,15 +66,13 @@ module.exports = function (createChunker, ipld, createReducer, _options) { waterfall([ (cb) => DAGNode.create(d.marshal(), [], options.hashAlg, cb), (node, cb) => { - if (options.onlyHash) return cb(null, node) - - let cid = new CID(node.multihash) - - if (options.cidVersion === 1) { - cid = cid.toV1() + if (options.onlyHash) { + return cb(null, node) } - ipld.put(node, { cid }, (err) => cb(err, node)) + ipld.put(node, { + cid: new CID(options.cidVersion, 'dag-pb', node.multihash) + }, (err) => cb(err, node)) } ], (err, node) => { if (err) { @@ -97,7 +99,6 @@ module.exports = function (createChunker, ipld, createReducer, _options) { let previous let count = 0 - const leafType = options.rawLeafNodes ? 
'raw' : 'file' pull( file.content, @@ -108,30 +109,56 @@ module.exports = function (createChunker, ipld, createReducer, _options) { } return Buffer.from(chunk) }), - pull.map(buffer => new UnixFS(leafType, buffer)), - pull.asyncMap((fileNode, callback) => { - DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => { - callback(err, { DAGNode: node, fileNode: fileNode }) + pull.asyncMap((buffer, callback) => { + if (options.rawLeaves) { + return multihash(buffer, options.hashAlg, (error, hash) => { + if (error) { + return callback(error) + } + + return callback(null, { + multihash: hash, + size: buffer.length, + leafSize: buffer.length, + cid: new CID(1, 'raw', hash), + data: buffer + }) + }) + } + + const file = new UnixFS(options.leafType, buffer) + + DAGNode.create(file.marshal(), [], options.hashAlg, (err, node) => { + if (err) { + return callback(err) + } + + callback(null, { + multihash: node.multihash, + size: node.size, + leafSize: file.fileSize(), + cid: new CID(options.cidVersion, 'dag-pb', node.multihash), + data: node + }) }) }), pull.asyncMap((leaf, callback) => { - if (options.onlyHash) return callback(null, leaf) - - let cid = new CID(leaf.DAGNode.multihash) - - if (options.cidVersion === 1) { - cid = cid.toV1() + if (options.onlyHash) { + return callback(null, leaf) } - ipld.put(leaf.DAGNode, { cid }, (err) => callback(err, leaf)) + ipld.put(leaf.data, { + cid: leaf.cid + }, (error) => callback(error, leaf)) }), pull.map((leaf) => { return { path: file.path, - multihash: leaf.DAGNode.multihash, - size: leaf.DAGNode.size, - leafSize: leaf.fileNode.fileSize(), - name: '' + multihash: leaf.multihash, + size: leaf.size, + leafSize: leaf.leafSize, + name: '', + cid: leaf.cid } }), through( // mark as single node if only one single node diff --git a/src/builder/reduce.js b/src/builder/reduce.js index de2584e6..45d612e5 100644 --- a/src/builder/reduce.js +++ b/src/builder/reduce.js @@ -8,12 +8,12 @@ const CID = require('cids') const DAGLink = 
dagPB.DAGLink const DAGNode = dagPB.DAGNode -module.exports = function (file, ipld, options) { +module.exports = function reduce (file, ipld, options) { return function (leaves, callback) { if (leaves.length === 1 && leaves[0].single && options.reduceSingleLeafToSelf) { const leaf = leaves[0] - if (!options.rawLeafNodes) { + if (options.leafType === 'file' && !options.rawLeaves) { return callback(null, { path: file.path, multihash: leaf.multihash, @@ -23,33 +23,37 @@ module.exports = function (file, ipld, options) { }) } - // we are using raw leaf nodes, this file only has one node but it'll be marked raw - // so convert it back to a file node + // we're using raw leaf nodes so we convert the node into a UnixFS `file` node. return waterfall([ - (cb) => ipld.get(new CID(leaf.multihash), cb), + (cb) => ipld.get(leaf.cid, cb), (result, cb) => { - const meta = UnixFS.unmarshal(result.value.data) - const fileNode = new UnixFS('file', meta.data) + const data = result.value.data + const fileNode = new UnixFS('file', data) - DAGNode.create(fileNode.marshal(), [], options.hashAlg, (err, node) => { - cb(err, { DAGNode: node, fileNode: fileNode }) + DAGNode.create(fileNode.marshal(), [], options.hashAlg, (error, node) => { + cb(error, { DAGNode: node, fileNode: fileNode }) }) }, (result, cb) => { + if (options.onlyHash) { + return cb(null, result) + } + let cid = new CID(result.DAGNode.multihash) if (options.cidVersion === 1) { cid = cid.toV1() } - ipld.put(result.DAGNode, { cid }, (err) => cb(err, result)) + ipld.put(result.DAGNode, { cid }, (error) => cb(error, result)) }, (result, cb) => { cb(null, { + path: file.path, multihash: result.DAGNode.multihash, size: result.DAGNode.size, leafSize: result.fileNode.fileSize(), - name: '' + name: leaf.name }) } ], callback) @@ -61,37 +65,45 @@ module.exports = function (file, ipld, options) { const links = leaves.map((leaf) => { f.addBlockSize(leaf.leafSize) - return new DAGLink(leaf.name, leaf.size, leaf.multihash) + let cid = 
leaf.cid + + if (!cid) { + // we are an intermediate node + cid = new CID(options.cidVersion, 'dag-pb', leaf.multihash) + } + + return new DAGLink(leaf.name, leaf.size, cid.buffer) }) waterfall([ (cb) => DAGNode.create(f.marshal(), links, options.hashAlg, cb), (node, cb) => { - if (options.onlyHash) return cb(null, node) + const cid = new CID(options.cidVersion, 'dag-pb', node.multihash) - let cid = new CID(node.multihash) - - if (options.cidVersion === 1) { - cid = cid.toV1() + if (options.onlyHash) { + return cb(null, { + node, cid + }) } - ipld.put(node, { cid }, (err) => cb(err, node)) + ipld.put(node, { + cid + }, (error) => cb(error, { + node, cid + })) } - ], (err, node) => { - if (err) { - callback(err) - return // early + ], (error, result) => { + if (error) { + return callback(error) } - const root = { + callback(null, { name: '', path: file.path, - multihash: node.multihash, - size: node.size, + multihash: result.cid.buffer, + size: result.node.size, leafSize: f.fileSize() - } - - callback(null, root) + }) }) } } diff --git a/src/exporter/file.js b/src/exporter/file.js index 0da35e8c..5491f545 100644 --- a/src/exporter/file.js +++ b/src/exporter/file.js @@ -63,6 +63,11 @@ function streamBytes (dag, node, fileSize, offset, length) { function getData ({ node, start }) { try { + if (Buffer.isBuffer(node)) { + // this is a raw node + return extractDataFromBlock(node, start, offset, end) + } + const file = UnixFS.unmarshal(node.data) if (!file.data) { @@ -80,6 +85,11 @@ function streamBytes (dag, node, fileSize, offset, length) { let streamPosition = 0 function visitor ({ node }) { + if (Buffer.isBuffer(node)) { + // this is a raw node + return pull.empty() + } + const file = UnixFS.unmarshal(node.data) const nodeHasData = Boolean(file.data && file.data.length) diff --git a/src/importer/index.js b/src/importer/index.js index d9433b2b..892c823c 100644 --- a/src/importer/index.js +++ b/src/importer/index.js @@ -15,11 +15,29 @@ const chunkers = { const 
defaultOptions = { chunker: 'fixed', - rawLeafNodes: false + rawLeaves: false, + hashOnly: false, + cidVersion: 0, + hash: null, + leafType: 'file', + hashAlg: 'sha2-256' } module.exports = function (ipld, _options) { const options = Object.assign({}, defaultOptions, _options) + + if (options.cidVersion > 0 && _options.rawLeaves === undefined) { + // if the cid version is 1 or above, use raw leaves as this is + // what go does. + options.rawLeaves = true + } + + if (_options && _options.hash !== undefined && _options.rawLeaves === undefined) { + // if a non-default hash alg has been specified, use raw leaves as this is + // what go does. + options.rawLeaves = true + } + const Chunker = chunkers[options.chunker] assert(Chunker, 'Unknkown chunker named ' + options.chunker) diff --git a/test/exporter.js b/test/exporter.js index 1bca514c..e587f188 100644 --- a/test/exporter.js +++ b/test/exporter.js @@ -422,6 +422,37 @@ module.exports = (repo) => { ) }) + it('exports a large file > 5mb imported with raw leaves', function (done) { + this.timeout(30 * 1000) + + pull( + pull.values([{ + path: '200Bytes.txt', + content: pull.values([bigFile]) + }]), + importer(ipld, { + rawLeaves: true + }), + pull.collect(collected) + ) + + function collected (err, files) { + expect(err).to.not.exist() + expect(files.length).to.equal(1) + + pull( + exporter(files[0].multihash, ipld), + pull.collect((err, files) => { + expect(err).to.not.exist() + + expect(bs58.encode(files[0].hash)).to.equal('QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY') + + fileEql(files[0], bigFile, done) + }) + ) + } + }) + it('returns an empty stream for dir', (done) => { const hash = 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn' diff --git a/test/helpers/collect-leaf-cids.js b/test/helpers/collect-leaf-cids.js new file mode 100644 index 00000000..dd34d170 --- /dev/null +++ b/test/helpers/collect-leaf-cids.js @@ -0,0 +1,32 @@ +'use strict' + +const pull = require('pull-stream') +const traverse = 
require('pull-traverse') +const CID = require('cids') + +module.exports = (ipld, multihash, callback) => { + pull( + traverse.depthFirst(new CID(multihash), (cid) => { + return pull( + pull.values([cid]), + pull.asyncMap((cid, callback) => { + ipld.get(cid, (error, result) => { + callback(error, !error && result.value) + }) + }), + pull.asyncMap((node, callback) => { + if (!node.links) { + return callback() + } + + return callback( + null, node.links.map(link => new CID(link.multihash)) + ) + }), + pull.filter(Boolean), + pull.flatten() + ) + }), + pull.collect(callback) + ) +} diff --git a/test/importer.js b/test/importer.js index 9ed78925..8f26d7f7 100644 --- a/test/importer.js +++ b/test/importer.js @@ -18,6 +18,7 @@ const each = require('async/each') const waterfall = require('async/waterfall') const parallel = require('async/parallel') const UnixFs = require('ipfs-unixfs') +const collectLeafCids = require('./helpers/collect-leaf-cids') function stringifyMh (files) { return files.map((file) => { @@ -276,6 +277,7 @@ module.exports = (repo) => { pull.collect((err, nodes) => { expect(err).to.not.exist() expect(nodes.length).to.be.eql(1) + // always yield empty node expect(mh.toB58String(nodes[0].multihash)).to.be.eql('QmbFMke1KXqnYyBBWxB74N4c5SBnJMVAiMNRcGu6x1AwQH') done() @@ -291,7 +293,7 @@ module.exports = (repo) => { }, { path: '/boop/200Bytes.txt', - content: pull.values([smallFile]) + content: pull.values([bigFile]) } ]), importer(ipld, options), @@ -505,7 +507,7 @@ module.exports = (repo) => { const file = files[0] expect(file).to.exist() - ipld.get(new CID(file.multihash), (err, res) => { + ipld.get(new CID(file.multihash), (err) => { expect(err).to.exist() done() }) @@ -583,13 +585,13 @@ module.exports = (repo) => { it('imports file with raw leaf nodes when specified', (done) => { checkLeafNodeTypes(ipld, { - rawLeafNodes: true + leafType: 'raw' }, 'raw', done) }) it('imports file with file leaf nodes when specified', (done) => { checkLeafNodeTypes(ipld, 
{ - rawLeafNodes: false + leafType: 'file' }, 'file', done) }) @@ -604,6 +606,41 @@ module.exports = (repo) => { reduceSingleLeafToSelf: false }, 1, done) }) + + it('uses raw leaf nodes when requested', (done) => { + this.timeout(60 * 1000) + + options.rawLeaves = true + + pull( + pull.values([{ + path: '1.2MiB.txt', + content: pull.values([bigFile]) + }]), + importer(ipld, options), + pull.collect((error, files) => { + expect(error).to.not.exist() + + const node = files[0] + + collectLeafCids(ipld, node.multihash, (error, cids) => { + expect(error).to.be.not.ok() + + const rawNodes = cids + .filter(cid => cid.codec === 'raw') + + expect(rawNodes).to.not.be.empty() + + rawNodes + .forEach(cid => { + expect(cid.version).to.equal(1) + }) + + done() + }) + }) + ) + }) }) }) }