-
Notifications
You must be signed in to change notification settings - Fork 550
/
tokenizer.js
76 lines (65 loc) · 2.21 KB
/
tokenizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*!
* lunr.tokenizer
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
* A function for splitting a string into tokens ready to be inserted into
* the search index. Uses `lunr.tokenizer.separator` to split strings, change
* the value of this property to change how strings are split into tokens.
*
* This tokenizer will convert its parameter to a string by calling `toString` and
* then will split this string on the character in `lunr.tokenizer.separator`.
* Arrays will have their elements converted to strings and wrapped in a lunr.Token.
*
* Optional metadata can be passed to the tokenizer, this metadata will be cloned and
* added as metadata to every token that is created from the object to be tokenized.
*
* @static
* @param {?(string|object|object[])} obj - The object to convert into tokens
* @param {?object} metadata - Optional metadata to associate with every token
* @returns {lunr.Token[]}
* @see {@link lunr.Pipeline}
*/
lunr.tokenizer = function (obj, metadata) {
if (obj == null || obj == undefined) {
return []
}
if (Array.isArray(obj)) {
return obj.map(function (t) {
return new lunr.Token(
lunr.utils.asString(t).toLowerCase(),
lunr.utils.clone(metadata)
)
})
}
var str = obj.toString().toLowerCase(),
len = str.length,
tokens = []
for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
var char = str.charAt(sliceEnd),
sliceLength = sliceEnd - sliceStart
if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {
if (sliceLength > 0) {
var tokenMetadata = lunr.utils.clone(metadata) || {}
tokenMetadata["position"] = [sliceStart, sliceLength]
tokenMetadata["index"] = tokens.length
tokens.push(
new lunr.Token (
str.slice(sliceStart, sliceEnd),
tokenMetadata
)
)
}
sliceStart = sliceEnd + 1
}
}
return tokens
}
/**
* The separator used to split a string into tokens. Override this property to change the behaviour of
* `lunr.tokenizer` behaviour when tokenizing strings. By default this splits on whitespace and hyphens.
*
* @static
* @see lunr.tokenizer
*/
lunr.tokenizer.separator = /[\s\-]+/