Skip to content

Commit

Permalink
ENH: Prebuild Lunr.js search index
Browse files Browse the repository at this point in the history
  • Loading branch information
kernc committed Jun 22, 2024
1 parent 79cfcda commit ccc3658
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 8 deletions.
94 changes: 94 additions & 0 deletions pdoc/build-index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import vm from 'vm';

// Address of the minified Lunr.js build that is fetched and evaluated at startup.
const LUNR_SCRIPT = 'https://cdnjs.cloudflare.com/ajax/libs/lunr.js/2.3.9/lunr.min.js';
// Standard streams: documents are read from stdin, the index is written to stdout.
const { stdin, stdout } = process;
// Accumulates the chunks received on stdin until the stream ends.
const buffer = [];

/**
 * Download the text of a remote script.
 *
 * @param {string} url - Address of the script to fetch.
 * @returns {Promise<string>} The response body decoded as text.
 * @throws {Error} If the server answers with a non-success HTTP status, so
 *     that an error page is never handed on as if it were the script source.
 */
async function loadScript(url) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`Failed to fetch ${url}: HTTP ${response.status} ${response.statusText}`);
    }
    return await response.text();
}
/**
 * Evaluate script source inside a fresh VM context.
 *
 * The context starts out with empty `window` and `self` objects as its only
 * globals (presumably so browser-oriented scripts find the names they expect).
 *
 * @param {string} script - JavaScript source code to run.
 * @returns {Promise<Object>} The context object; globals defined by the
 *     script are available as its properties.
 */
async function executeScript(script) {
    const context = vm.createContext({ window: {}, self: {} });
    vm.runInContext(script, context);
    return context;
}

/**
 * Shrink a serialized Lunr index in place.
 *
 * Two rewrites are applied, following
 * https://john-millikin.com/compacting-lunr-search-indices :
 *
 *  - `invertedIndex` entries change from `[token, {_index, <field>: {docRef: meta}}]`
 *    to `[token, matches, matches, ...]` with one flat `matches` array per field
 *    (in `fields` order). Each `matches` array alternates the position of the
 *    corresponding `"<field>/<docRef>"` entry in `fieldVectors` with the
 *    metadata object. Entries are ordered by their original `_index`.
 *  - In every field vector, an even-positioned value that is exactly one more
 *    than the previous even-positioned value is replaced with `null`.
 *
 * @param {Object} index - Object with `fields`, `fieldVectors` and
 *     `invertedIndex` properties (as produced by `idx.toJSON()` in this
 *     script). Its `invertedIndex` and `fieldVectors` are replaced.
 * @returns {undefined} Nothing; `index` is modified.
 * @throws {Error} If an inverted-index entry refers to a field/document pair
 *     that has no entry in `fieldVectors`.
 */
function compact(index) {
    /* https://john-millikin.com/compacting-lunr-search-indices */
    function compactInvIndex(index) {
        const fields = index["fields"];
        // "<field>/<docRef>" -> position of that vector in `fieldVectors`.
        const fieldVectorIdxs = new Map(index["fieldVectors"].map((v, idx) => [v[0], idx]));
        const items = new Map(index["invertedIndex"].map(([token, props]) => {
            const newItem = [token];
            for (const field of fields) {
                const matches = [];
                for (const [docRef, metadata] of Object.entries(props[field])) {
                    const vectorId = `${field}/${docRef}`;
                    const fieldVectorIdx = fieldVectorIdxs.get(vectorId);
                    if (fieldVectorIdx === undefined) {
                        throw new Error(
                            `Cannot compact index: token "${token}" references ` +
                            `"${vectorId}", which has no field vector`);
                    }
                    matches.push(fieldVectorIdx, metadata);
                }
                newItem.push(matches);
            }
            return [props["_index"], newItem];
        }));
        // Emit entries in `_index` order so the array position can stand in
        // for the dropped `_index` property.
        const indexes = Array.from(items.keys()).sort((a, b) => a - b);
        return indexes.map(k => items.get(k));
    }

    function compactVectors(index) {
        return index["fieldVectors"].map(([id, vectors]) => {
            let prev = null;
            const compacted = vectors.map((v, ii) => {
                // Only even positions take part in the run-length style
                // compaction; odd positions are copied through unchanged.
                if (ii % 2 === 0) {
                    if (prev !== null && v === prev + 1) {
                        prev += 1;
                        return null;
                    }
                    prev = v;
                }
                return v;
            });
            return [id, compacted];
        });
    }

    // The inverted index must be compacted first: it only needs the vector
    // ids, and is computed from the still-unmodified `index`.
    index.invertedIndex = compactInvIndex(index);
    index.fieldVectors = compactVectors(index);
}

// Fetch the Lunr.js source and evaluate it in a sandbox; the `lunr` global the
// script defines is then read off the sandbox object.
const lunr = (await executeScript(await loadScript(LUNR_SCRIPT))).lunr;

stdin.resume();
stdin.setEncoding('utf8');

// Collect everything arriving on stdin; it is parsed once the stream ends.
stdin.on('data', (chunk) => {
    buffer.push(chunk);
});

// With all input received: parse the JSON document list, build the Lunr index,
// compact it, and write the resulting JSON to stdout.
stdin.on('end', () => {
    const documents = JSON.parse(buffer.join(''));

    const idx = lunr(function () {
        // Lunr invokes this callback with the index builder bound to `this`.
        const builder = this;
        builder.ref('i');
        builder.field('name', {boost: 10});
        builder.field('ref', {boost: 5});
        builder.field('doc');
        builder.metadataWhitelist = ['position'];

        documents.forEach((doc, i) => {
            // `name` is the last dotted component of `ref`; `i` (the position
            // in the input list) serves as the document reference.
            const parts = doc.ref.split('.');
            doc['name'] = parts[parts.length - 1];
            doc['i'] = i;
            builder.add(doc);
        });
    });

    const out = idx.toJSON();
    compact(out);
    stdout.write(JSON.stringify(out));
});
28 changes: 23 additions & 5 deletions pdoc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
import os.path as path
import json
import re
import subprocess
import sys
import warnings
from contextlib import contextmanager
from functools import lru_cache
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Dict, List, Sequence
from warnings import warn

Expand Down Expand Up @@ -397,6 +399,7 @@ def recursive_add_to_index(dobj):
info['doc'] = trim_docstring(dobj.docstring)
if isinstance(dobj, pdoc.Function):
info['func'] = 1
nonlocal index
index.append(info)
for member_dobj in getattr(dobj, 'doc', {}).values():
recursive_add_to_index(member_dobj)
Expand All @@ -414,12 +417,27 @@ def to_url_id(module):
recursive_add_to_index(top_module)
urls = sorted(url_cache.keys(), key=url_cache.__getitem__)

json_values = [dict(obj, url=urls[obj['url']]) for obj in index]
cmd = ['node', str(Path(__file__).with_name('build-index.js'))]
proc = subprocess.Popen(cmd, text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
main_path = args.output_dir
with _open_write_file(path.join(main_path, 'index.js')) as f:
f.write("URLS=")
json.dump(urls, f, indent=0, separators=(',', ':'))
f.write(";\nINDEX=")
json.dump(index, f, indent=0, separators=(',', ':'))
if proc.poll() is None:
stdout, stderr = proc.communicate(json.dumps(json_values))
assert proc.poll() == 0, proc.poll()
if proc.returncode == 0:
stdout = 'INDEX=' + stdout
else:
warn(f'Prebuilding Lunr index with command `{" ".join(cmd)}` failed: '
f'{proc.stderr and proc.stderr.read() or ""!r}. '
f'The search feature will still work, '
f'but may be slower (with the index rebuilt just before use). '
f'To prebuild an index in advance, ensure `node` is executable in the '
f'pdoc environment.', category=RuntimeWarning)
stdout = ('URLS=' + json.dumps(urls, indent=0, separators=(',', ':')) +
';\nINDEX=' + json.dumps(index, indent=0, separators=(',', ':')))
index_path = Path(main_path).joinpath('index.js')
index_path.write_text(stdout)
print(str(index_path))

# Generate search.html
with _open_write_file(path.join(main_path, 'doc-search.html')) as f:
Expand Down
1 change: 1 addition & 0 deletions pdoc/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type": "module"}
59 changes: 58 additions & 1 deletion pdoc/templates/search.mako
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@
}
async function build_index() {
return lunr(function () {
try {
return lunr.Index.load(_expand(INDEX)); // Prebuilt index
} catch {
return lunr(function () {
this.ref('i');
this.field('name', {boost: 10});
this.field('ref', {boost: 5});
Expand All @@ -67,6 +70,60 @@
this.add(doc);
}, this);
});
}
}
function _expand(compact) {
    // Rebuild a full serialized index from its compacted form, see
    // https://john-millikin.com/compacting-lunr-search-indices
    const fields = compact["fields"];

    // In each vector, a null at an even position stands for
    // "previous even-position value + 1"; restore the actual number.
    const fieldVectors = compact["fieldVectors"].map(([id, vectors]) => {
        const expanded = vectors.slice();
        let last = null;
        for (let k = 0; k < expanded.length; k += 2) {
            if (expanded[k] === null) {
                expanded[k] = last + 1;
            }
            last = expanded[k];
        }
        return [id, expanded];
    });

    // Each compact entry is [token, pairs, pairs, ...] with one flat `pairs`
    // array per field, alternating a position in `fieldVectors` with a value.
    // The entry's own position in the array becomes its "_index".
    const invertedIndex = compact["invertedIndex"].map((item, itemIdx) => {
        const fieldMap = {"_index": itemIdx};
        fields.forEach((field, fieldIdx) => {
            const prefixLength = (field + '/').length;
            const pairs = item[fieldIdx + 1];
            const matches = {};
            for (let k = 0; k < pairs.length; k += 2) {
                // Vector ids look like "<field>/<docRef>"; drop the prefix.
                const docRef = fieldVectors[pairs[k]][0].slice(prefixLength);
                if (k + 1 < pairs.length) {
                    matches[docRef] = pairs[k + 1];
                }
            }
            fieldMap[field] = matches;
        });
        return [item[0], fieldMap];
    });

    // Order the entries by token.
    invertedIndex.sort((a, b) => (a[0] < b[0] ? -1 : (a[0] > b[0] ? 1 : 0)));

    return {
        "version": compact["version"],
        "fields": fields,
        "fieldVectors": fieldVectors,
        "invertedIndex": invertedIndex,
        "pipeline": compact["pipeline"],
    };
}
function search(query) {
Expand Down
9 changes: 7 additions & 2 deletions pdoc/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,13 @@ def test_lunr_search(self):
files = self.PUBLIC_FILES + ["doc-search.html", "index.js"]
self._basic_html_assertions(expected_files=files)
self._check_files(exclude_patterns=['class="gcse-search"'])
self._check_files(include_patterns=['URLS=[\n"example_pkg/index.html",\n"example_pkg/'],
file_pattern='index.js')
if shutil.which('node'):
self._check_files(include_patterns=['INDEX={"version"'],
file_pattern='index.js')
else:
self._check_files(
include_patterns=['URLS=[\n"example_pkg/index.html",\n"example_pkg/'],
file_pattern='index.js')
self._check_files(include_patterns=["'../doc-search.html#'"],
file_pattern='example_pkg/index.html')
self._check_files(include_patterns=["'../doc-search.html#'"],
Expand Down

0 comments on commit ccc3658

Please sign in to comment.