feat: buffered writer #70

Merged (19 commits, Mar 31, 2022)
Changes from 8 commits
api.ts (25 changes: 21 additions & 4 deletions)
@@ -1,17 +1,21 @@
 import { CID } from 'multiformats/cid'

 export type { CID }
+/* Generic types for interfacing with block storage */

-export type Block = { cid: CID, bytes: Uint8Array }
+export type Block = {
+  cid: CID
+  bytes: Uint8Array
+}

 export type BlockHeader = {
-  cid: CID,
-  length: number,
+  cid: CID
+  length: number
   blockLength: number
 }

 export type BlockIndex = BlockHeader & {
-  offset: number,
+  offset: number
   blockOffset: number
 }

@@ -36,6 +40,19 @@ export interface BlockWriter {
   close(): Promise<void>
 }

+export interface CarBufferWriter {
+  write(block: Block): void
+  close(options?:{ align: boolean }): Uint8Array
+}
+
+export interface CarBufferWriterOptions {
+  roots?: CID[] // defaults to []
+  byteOffset?: number // defaults to 0
+  byteLength?: number // defaults to buffer.byteLength
+
+  headerCapacity?: number // defaults to size needed for provided roots
+}
+
 export interface WriterChannel {
   writer: BlockWriter
   out: AsyncIterable<Uint8Array>
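To make the shape of the new synchronous interface concrete, here is a usage sketch (not taken from the PR itself) based on the `CarBufferWriter` interface above and the `createWriter` implementation below; the multiformats helpers, buffer size, and payload are illustrative assumptions, and the snippet assumes an ES module so top-level `await` is available.

```js
import * as CarBufferWriter from '@ipld/car/buffer-writer'
import { CID } from 'multiformats/cid'
import { sha256 } from 'multiformats/hashes/sha2'
import * as raw from 'multiformats/codecs/raw'

// Build a block: some raw bytes plus the CID that addresses them.
const bytes = new TextEncoder().encode('hello world')
const cid = CID.createV1(raw.code, await sha256.digest(bytes))

// Allocate a fixed-size buffer and write into it synchronously.
const buffer = new ArrayBuffer(1024)
const writer = CarBufferWriter.createWriter(buffer, { roots: [cid] })
writer.write({ cid, bytes })

// close() returns the used slice of the buffer; `align: true` lets the writer
// shift blocks if the reserved header space did not match the encoded header.
const car = writer.close({ align: true })
```

Unlike the `BlockWriter`/`WriterChannel` pair above, which hands encoded blocks to an async iterable, this writer does a single synchronous pass over a preallocated buffer.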
lib/buffer-writer.js (208 changes: 208 additions & 0 deletions)
@@ -0,0 +1,208 @@
import varint from 'varint'
import * as CBOR from '@ipld/dag-cbor'

// Number of bytes required without any roots.
const EMPTY_HEADER_SIZE = 16
// Number of bytes used for CIDv1 with sha256 digest
const DEFAULT_CID_SIZE = 36
// Number of bytes added per root
const ROOT_EXTRA_SIZE = 5
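// (EMPTY_HEADER_SIZE covers the dag-cbor map head plus the `version` and `roots`
// keys and the version value; the roots array head itself is counted separately
// by arrayLengthEncodeSize() below. ROOT_EXTRA_SIZE covers the CBOR tag 42 head,
// the byte-string head and the 0x00 multibase prefix wrapped around each root CID.)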

/**
 * @typedef {import('../api').CID} CID
 * @typedef {import('../api').Block} Block
 * @typedef {import('../api').CarBufferWriter} Writer
 * @typedef {import('../api').CarBufferWriterOptions} Options
 * @typedef {import('./coding').CarEncoder} CarEncoder
 */

/**
 * @implements {Writer}
 */
class CarBufferWriter {
  /**
   * @param {Uint8Array} bytes
   * @param {number} byteOffset
   * @param {CID[]} roots
   */
  constructor (bytes, byteOffset, roots = []) {
    /** @readonly */
    this.bytes = bytes
    /** @readonly */
    this.roots = roots
    this.byteOffset = byteOffset
    this.headerCapacity = byteOffset
  }

  /**
   * @param {CID} root
   */
  addRoot (root) {
    const byteLength = root.bytes.byteLength + ROOT_EXTRA_SIZE
    if (byteLength > this.headerCapacity - EMPTY_HEADER_SIZE) {
      throw new RangeError('Root will not fit')
    }
    this.roots.push(root)
    this.headerCapacity -= byteLength
  }

  /**
   * Write a `Block` (a `{ cid:CID, bytes:Uint8Array }` pair) to the archive.
   * Throws if there is not enough capacity.
   *
   * @param {Block} block A `{ cid:CID, bytes:Uint8Array }` pair.
   */
  write ({ cid, bytes }) {
    // Each block is framed as varint(|CID| + |data|) followed by the CID bytes
    // and then the block data.
    const size = varint.encode(cid.bytes.length + bytes.length)
    const byteLength = size.length + cid.bytes.byteLength + bytes.byteLength
    if (this.byteOffset + byteLength > this.bytes.byteLength) {
      throw new RangeError('Buffer overflow')
    } else {
      this.bytes.set(size, this.byteOffset)
      this.byteOffset += size.length

      this.bytes.set(cid.bytes, this.byteOffset)
      this.byteOffset += cid.bytes.byteLength

      this.bytes.set(bytes, this.byteOffset)
      this.byteOffset += bytes.byteLength
    }
  }

  /**
   * @param {object} [options]
   * @param {boolean} [options.align]
   */
  close ({ align = false } = {}) {
    const { roots, bytes, byteOffset, headerCapacity } = this
    const headerBytes = CBOR.encode({ version: 1, roots })
    const varintBytes = varint.encode(headerBytes.length)

    const headerByteLength = varintBytes.length + headerBytes.byteLength
    const offset = headerCapacity - headerByteLength

    if (offset === 0) {
      writeHeader(varintBytes, headerBytes, bytes)
      return bytes.subarray(0, byteOffset)
      // If the header size was misestimated but the buffer still has capacity
      // to fit the header and the written blocks
    } else if (byteOffset + offset < bytes.byteLength) {
      // If `align: true` is passed, align the header & blocks as needed.
      if (align) {
        this.byteOffset += offset
        // Move encoded blocks to a new offset. Note that we may have
        // underestimated the header, so this needs to happen before we write
        // the header.
        bytes.set(bytes.subarray(headerCapacity, byteOffset), headerByteLength)

        writeHeader(varintBytes, headerBytes, bytes)
        return bytes.subarray(0, this.byteOffset)
      } else {
        throw new RangeError(
          `Header size was ${offset > 0 ? `underestimated by ${offset}` : `overestimated by ${-1 * offset}`}
You can use close({ align: true }) to align header and blocks as needed`)
      }
    } else {
      throw new RangeError(`Header size was underestimated by ${-1 * offset} bytes and there is not enough space in the buffer now`)
    }
  }
}

/**
 * @param {number[]} varint
 * @param {Uint8Array} header
 * @param {Uint8Array} destination
 * @param {number} offset
 */
const writeHeader = (varint, header, destination, offset = 0) => {
  destination.set(varint, offset)
  destination.set(header, offset + varint.length)
}

/**
 * Estimates the header size for the given number of roots, assuming they are
 * CIDv1 with a sha256 digest. Optionally takes the total number of bytes to be
 * allocated for all the roots, in case a different hash function or CID
 * version is used.
 *
 * Note: the returned value is just an estimate, which can be inaccurate when a
 * large number of CIDs is passed or when they are of various sizes.
 *
 * @param {number} count - Number of roots
 * @param {number} [rootsByteLength] - Total byteLength of all roots
 */
export const estimateHeaderSize = (
  count,
  rootsByteLength = count * DEFAULT_CID_SIZE
) => {
  const lengthSize = arrayLengthEncodeSize(count)
  const rootsSize = rootsByteLength + count * ROOT_EXTRA_SIZE
  const headerSize = EMPTY_HEADER_SIZE + lengthSize + rootsSize
  const varintSize = varint.encodingLength(headerSize)
  return varintSize + headerSize
}
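// For example, a single CIDv1/sha2-256 root works out as: lengthSize = 1,
// rootsSize = 36 + 5 = 41, headerSize = 16 + 1 + 41 = 58, and the varint for 58
// takes 1 byte, giving an estimate of 59 bytes.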

/**
 * Number of bytes the CBOR head for an array of the given length takes
 * (1 byte below 24, 2 bytes up to 255, 3 bytes up to 65535, otherwise the size
 * of the CBOR-encoded length itself).
 *
 * @param {number} length
 * @returns {number}
 */
const arrayLengthEncodeSize = length =>
  length < 24
    ? 1
    : length < 256
      ? 2
      : length < 65536
        ? 3
        : CBOR.encode(length).length

/**
 * @param {CID[]} cids
 */
const totalByteLength = (cids) => {
  let total = 0
  for (const cid of cids) {
    total += cid.bytes.byteLength

Member commented: We could fix this too now that you have the nice arrayLengthEncodeSize(), which will work for CIDs too. I think if you subtract 2 from ROOT_EXTRA_SIZE (the byte array is >= 24 and < 256 long, so it requires a two-byte CBOR prelude, <bytes><length>; the extra bits in ROOT_EXTRA_SIZE are mainly to do with tags) and then add it back in by adding + arrayLengthEncodeSize(cid.length) here, then you get a more accurate CID length. The problem with doing that is that your API allows for an optional rootsByteLength, but that could also probably be fixed by adding those 2 to DEFAULT_CID_SIZE.

If you add bafkqaaia and bafkqbbacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa to your tests then you should see the impact of the bad sizes here.

Identity CIDs are the most likely way we're going to end up here--it would be silly to use one as a root but it's allowed and it's not out of the question that someone finds a use-case for it.

  }
  return total
}

/**
 * Creates a synchronous CAR writer that can be used to encode blocks into a
 * given buffer. Optionally you can pass `byteOffset` and `byteLength` to
 * specify a range inside the buffer to write into. If the CAR file is going to
 * have `roots`, you need to either pass them under `options.roots` or provide
 * `options.headerCapacity` to allocate the required space for the header (you
 * can use the `estimateHeaderSize` function to get an estimate). It is also
 * possible to provide both `roots` and `headerCapacity` to allocate space for
 * roots that may not be known ahead of time.
 *
 * Note: an incorrect header estimate may lead to copying bytes inside the
 * buffer, which will have a negative impact on performance.
 *
 * @param {ArrayBuffer} buffer
 * @param {Options} [options]
 * @returns {Writer}
 */
export const createWriter = (
  buffer,
  {
    roots = [],
    byteOffset = 0,
    byteLength = buffer.byteLength,
    headerCapacity = estimateHeaderSize(
      roots.length,
      totalByteLength(roots)
    )
  } = {}
) => {
  const bytes = new Uint8Array(buffer, byteOffset, byteLength)

  const writer = new CarBufferWriter(bytes, headerCapacity)
  for (const root of roots) {
    writer.addRoot(root)
  }

  return writer
}
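The `createWriter` docs above also describe a second mode, where roots are not known up front and only header space is reserved. A sketch of that flow (again not part of the diff; the buffer size and root count are arbitrary):

```js
import * as CarBufferWriter from '@ipld/car/buffer-writer'

// Reserve header space for up to four CIDv1/sha2-256 roots to be added later.
const buffer = new ArrayBuffer(4096)
const writer = CarBufferWriter.createWriter(buffer, {
  headerCapacity: CarBufferWriter.estimateHeaderSize(4)
})

// ... write blocks and discover the actual roots along the way ...
// writer.write({ cid, bytes })
// writer.addRoot(root)

// If fewer (or differently sized) roots were added than estimated, the header
// no longer fills the reserved space exactly, so let close() shift the blocks.
const car = writer.close({ align: true })
```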
package.json (3 changes: 3 additions & 0 deletions)
@@ -54,6 +54,9 @@
"./writer": {
"browser": "./lib/writer-browser.js",
"import": "./lib/writer.js"
},
"./buffer-writer": {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're going to need to add an equivalent block in typesVersions so consumers can find the types for it (i.e. avoiding the lib indirection)

"import": "./lib/buffer-writer.js"
}
},
"dependencies": {
test/test-buffer-writer.js (18 changes: 18 additions & 0 deletions)
@@ -0,0 +1,18 @@
/* eslint-env mocha */

import * as CarBufferWriter from '@ipld/car/buffer-writer'
import { createHeader } from '../lib/encoder.js'
import { assert } from './common.js'
import { CID } from 'multiformats'

describe('CarBufferWriter', () => {
  const cid = CID.parse('bafkreifuosuzujyf4i6psbneqtwg2fhplc2wxptc5euspa2gn3bwhnihfu')
  describe('estimateHeaderSize', () => {
    for (const count of [0, 1, 10, 18, 24, 48, 124, 255, 258, 65536 - 1, 65536]) {
      it(`estimateHeaderSize(${count})`, () => {
        const roots = new Array(count).fill(cid)
        assert.deepEqual(CarBufferWriter.estimateHeaderSize(count), createHeader(roots).byteLength)
      })
    }
  })
})
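The review comment on `totalByteLength()` above suggests exercising identity CIDs, where the fixed `DEFAULT_CID_SIZE`/`ROOT_EXTRA_SIZE` assumptions stop holding. A sketch of such a case, to be dropped into the describe block above (not part of this diff; the tolerance is a guess rather than a spec):

```js
it('estimateHeaderSize vs createHeader for an identity CID root', () => {
  const identity = CID.parse('bafkqaaia')
  const estimated = CarBufferWriter.estimateHeaderSize(1, identity.bytes.byteLength)
  const actual = createHeader([identity]).byteLength
  // The estimate is expected to drift here: the CBOR byte-string head for a tiny
  // CID is shorter than ROOT_EXTRA_SIZE assumes, but it should stay within a few bytes.
  assert.ok(Math.abs(estimated - actual) <= 5)
})
```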
tsconfig.json (1 change: 1 addition & 0 deletions)
@@ -31,6 +31,7 @@
"paths": {
"@ipld/car": [ "car.js", "car-browser.js", "lib/" ],
"@ipld/car/writer": [ "./lib/writer.js" ],
"@ipld/car/buffer-writer": ["./lib/buffer-writer.js"],
"@ipld/car/reader": [ "./lib/reader.js" ],
"@ipld/car/indexed-reader": [ "./lib/indexed-reader.js" ],
"@ipld/car/iterator": [ "./lib/iterator.js" ],