Index files in Dat archives with map-reduce to create queryable data views.
Not yet stable.
// in beaker
import DatArchiveMapReduce from 'dat://map-reduce.beakerbrowser.com/v/1.0.0/index.js'
// in node
const DatArchiveMapReduce = require('@beaker/dat-archive-map-reduce')
// create instance
const damr = new DatArchiveMapReduce()
// define your view
damr.define('site-posts-by-date', {
path: '/.data/unwalled.garden/posts/*.json',
map (value, meta, emit) {
let obj = JSON.parse(value)
if (isPost(obj)) {
let timestamp = Number(new Date(obj.createdAt))
emit([meta.origin, timestamp], meta.url)
}
}
})
function isPost (obj) {
if (obj.type !== 'unwalled.garden/post') return false
if (!obj.content || typeof obj.content !== 'string') return false
if (!obj.createdAt || typeof obj.createdAt !== 'string') return false
return true
}
// index sites
damr.index('dat://pfrazee.com', {watch: true})
damr.index('dat://mafintosh.com', {watch: true})
damr.index('dat://andrewosh.com', {watch: true})
// list the most recent 30 posts by pfrazee.com
await damr.list('site-posts-by-date', {
gt: ['dat://pfrazee.com', 0],
lt: ['dat://pfrazee.com', Infinity],
limit: 30,
reverse: true
})
// list the posts in the last 5 days by mafintosh.com
await damr.list('site-posts-by-date', {
gte: ['dat://mafintosh.com', Date.now() - ms('5d')],
lte: ['dat://mafintosh.com', Date.now()],
reverse: true
})
// reduce example
damr.define('site-posts-count', {
path: '/.data/unwalled.garden/posts/*.json',
map (value, meta, emit) {
let obj = JSON.parse(value)
if (isPost(obj)) {
emit(meta.origin, meta.pathname)
}
},
reduce (acc, value, key) {
return (acc||0) + 1
}
})
await damr.get('site-posts-count', 'dat://pfrazee.com')
Table of Contents generated with DocToc
- Class: DatArchiveMapReduce
- Instance: DatArchiveMapReduce
- damr.open()
- damr.close()
- damr.destroy()
- damr.define(name, definition)
- damr.reset(view)
- damr.get(view, key)
- damr.list(view, opts)
- damr.index(url[, opts])
- damr.unindex(url)
- damr.indexFile(archive, filepath)
- damr.indexFile(url)
- damr.unindexFile(archive, filepath)
- damr.unindexFile(url)
- damr.listIndexed()
- damr.isIndexed(url)
- Event: 'open'
- Event: 'open-failed'
- Event: 'view-reset'
- Event: 'archive-indexing'
- Event: 'archive-index-progress'
- Event: 'archive-indexed'
- Event: 'archive-missing'
- Event: 'archive-found'
- Event: 'archive-error'
var damr = new DatArchiveMapReduce('views')
name
String. Defaults to 'views'. If run in the browser, this will be the name of the IndexedDB instance. If run in NodeJS, this will be the path of the LevelDB folder.
opts
Object.DatArchive
Constructor. The class constructor for dat archive instances. If in node, you should specify node-dat-archive.
Create a new DatArchiveMapReduce instance.
The given name will control where the indexes are saved.
You can specify different names to run multiple DatArchiveMapReduce instances at once.
await damr.open()
- Returns Promise<Void>.
Opens the internal databases. Will be called automatically by other methods, so you usually don't need to call this method.
await damr.close()
- Returns Promise<Void>.
Closes the DatArchiveMapReduce instance.
await damr.destroy()
- Returns Promise<Void>.
Closes and deletes all indexes in the DatArchiveMapReduce instance.
You can .destroy() and then .open() a DatArchiveMapReduce to recreate its indexes.
await damr.destroy()
await damr.open()
name
String. The name of the view.definition
Object.path
String or Array<String>. An anymatch list of files to index.map
Function(value, meta, emit). A method to accept a new or changed file and emit new stored entries in the view.value
String.meta
Object.url
String. The URL of the file (eg 'dat://foo.com/bar.json').origin
String. The origin of the file's site (eg 'dat://foo.com').pathname
String. The path of the file in the site (eg '/bar.json').
emit
Function(key, value). Call this to emit new mapped values.key
String or Array<String>. The key to store the new entry at.value
Any. The value to store for the entry.
reduce
Function(agg, value, key). A method to aggregate mapped entries into a single value.agg
Any. The current value of the reduce method's output.value
Any. The next mapped value to process.key
Any. The key of the entry being processed.
- Must return the current value of the reduced entry.
- Returns Promise<Void>.
Creates a new view on the damr
object.
Example:
// create a view that counts the number of posts by each user
damr.define('site-posts-count', {
path: '/.data/unwalled.garden/posts/*.json',
map (value, meta, emit) {
let obj = JSON.parse(value)
if (isPost(obj)) {
emit(meta.origin, meta.pathname)
}
},
reduce (acc, value, key) {
return (acc||0) + 1
}
})
// get the number of posts by dat://pfrazee.com
await damr.index('dat://pfrazee.com')
await damr.get('site-posts-count', 'dat://pfrazee.com')
await damr.reset('site-posts-by-date')
view
String. The name of the view to reset.
Clears all data indexed in the view. This should be used when the view-definition has changed and needs to be rebuilt.
// get the post by pfrazee.com that was created at "Tue, 23 Jul 2019 18:23:57 GMT"
var post = await damr.get('site-posts-by-date', ['dat://pfrazee.com', Number(new Date('Tue, 23 Jul 2019 18:23:57 GMT'))])
view
String. The name of the view to query.key
Any. The key of the entry to fetch.- Returns Promise<Object>.
key
Any. The key of the entry.value
Any. The value of the entry.
Get the entry at the given key.
// list the most recent 30 posts by pfrazee.com
await damr.list('site-posts-by-date', {
gte: ['dat://pfrazee.com', 0],
lte: ['dat://pfrazee.com', Infinity],
limit: 30,
reverse: true
})
// list the posts in the last 5 days by mafintosh.com
await damr.list('site-posts-by-date', {
gte: ['dat://mafintosh.com', Date.now() - ms('5d')],
lte: ['dat://mafintosh.com', Date.now()],
reverse: true
})
view
String. The name of the view to query.opts
Object.gt
Any. The start key in the range to query (exclusive).gte
Any. The start key in the range to query (inclusive).lt
Any. The end key in the range to query (exclusive).lte
Any. The end key in the range to query (inclusive).reverse
Boolean. Reverse the order of the output? Defaults to false.limit
Number. Limit the number of entries returned. Defaults to no limit.
- Returns Promise<Array<Object>>.
key
Any. The key of the entry.value
Any. The value of the entry.
List a range of entries from a view.
await damr.index('dat://foo.com')
url
String or DatArchive. The site to index.opts
Object.watch
Boolean. Should DatArchiveMapReduce watch the archive for changes, and index them immediately? Defaults to false.
- Returns Promise<Void>.
Add a dat:// site to be indexed. The method will return when the site has been fully indexed.
await damr.unindex('dat://foo.com')
url
String or DatArchive. The site to deindex.- Returns Promise<Void>.
Remove a dat:// site from the dataset. The method will return when the site has been fully de-indexed.
await damr.indexFile(fooArchive, '/bar.json')
archive
DatArchive. The site containing the file to index.filepath
String. The path of the file to index.- Returns Promise<Void>.
Add a single file to the dataset. The method will return when the file has been indexed.
This will not add the file or its archive to the list returned by listIndexed().
DatArchiveMapReduce will not watch the file after this call.
await damr.indexFile('dat://foo.com/bar.json')
url
String. The url of the file to index.- Returns Promise<Void>.
Add a single file to the dataset. The method will return when the file has been indexed.
This will not add the file or its archive to the list returned by listIndexed().
DatArchiveMapReduce will not watch the file after this call.
await damr.unindexFile(fooArchive, '/bar.json')
archive
DatArchive. The site containing the file to deindex.filepath
String. The path of the file to deindex.- Returns Promise<Void>.
Remove a single file from the dataset. The method will return when the file has been de-indexed.
await damr.unindexFile('dat://foo.com/bar.json')
url
String. The url of the file to deindex.- Returns Promise<Void>.
Remove a single file from the dataset. The method will return when the file has been de-indexed.
var urls = await damr.listIndexed()
- Returns Promise<Array<String>>.
Lists the URLs of the dat:// sites which are included in the dataset.
var yesno = await damr.isIndexed('dat://foo.com')
- Returns Promise<Boolean>.
Is the given dat:// URL included in the dataset?
damr.on('open', () => {
console.log('DatArchiveMapReduce is ready for use')
})
Emitted when the DatArchiveMapReduce instance has been opened using open().
damr.on('open-failed', (err) => {
console.log('DatArchiveMapReduce failed to open', err)
})
error
Error.
Emitted when the DatArchiveMapReduce instance fails to open during open().
damr.on('view-reset', ({view}) => {
console.log('DatArchiveMapReduce has reset the indexes for', view)
})
view
String. The name of the view that was reset.
Emitted when reset()
has been called on a view. All map/reduced entries are cleared for the view.
damr.on('archive-indexing', ({view, origin, start, end}) => {
console.log(view, 'is updating for', origin, 'from version', start, 'to', end)
})
view
String. The view that is indexing.origin
String. The archive that was updated.start
Number. The version which is being indexed from.end
Number. The version which is being indexed to.
Emitted when the DatArchiveMapReduce instance has started to index the given archive.
damr.on('archive-index-progress', ({view, origin, current, total}) => {
console.log(view, 'update for', origin, 'is', Math.round(current / total * 100), '% complete')
})
view
String. The view that is indexing.origin
String. The archive that was updated.current
Number. The current update being applied.total
Number. The total number of updates being applied.
Emitted when an update has been applied during an indexing process.
damr.on('archive-indexed', ({view, origin, version}) => {
console.log(view, 'was updated for', origin, 'at version', version)
})
view
String. The view that is indexing.origin
String. The archive that was updated.version
Number. The version which was updated to.
Emitted when the DatArchiveMapReduce instance has indexed the given archive.
This is similar to 'view-updated'
, but it fires every time an archive is indexed, whether or not it results in updates to the indexes.
damr.on('archive-missing', ({origin}) => {
console.log('DatArchiveMapReduce couldnt find', origin, '- now searching')
})
origin
String. The archive that is missing.
Emitted when an archive's data was not locally available or found on the network.
When this occurs, DatArchiveMapReduce will continue searching for the data, and emit 'archive-found'
on success.
damr.on('archive-found', ({origin}) => {
console.log('DatArchiveMapReduce has found and indexed', origin)
})
origin
String. The archive that was found.
Emitted when an archive's data was found after originally not being found during indexing.
This event will only be emitted after 'archive-missing'
is emitted.
damr.on('archive-error', ({origin, error}) => {
console.log('DatArchiveMapReduce failed to index', origin, error)
})
origin
String. The archive that failed.error
Error. The error emitted.
Emitted when an archive fails to load.