Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)

This commit adds an adjust_offsets parameter to the word_delimiter_graph token filter, defaulting
to true. Most of the time you want the sub-tokens emitted by this filter to have offsets adjusted to
their real position in the token stream; however, some token filters (e.g. trim) can change the
length or starting position of a token without changing its offset attributes, and this can lead to
word_delimiter_graph emitting illegal offsets. Setting adjust_offsets to false in these cases makes
indexing possible again.

Fixes #34741, #33710
romseygeek authored Dec 18, 2018
1 parent 0ff1f1f commit af57575
Showing 4 changed files with 58 additions and 2 deletions.
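
To make the failure mode concrete before the diffs: below is a minimal sketch against raw Lucene (not part of this commit; it assumes the lucene-analyzers-common module and the five-argument WordDelimiterGraphFilter constructor that this commit uses). TrimFilter shortens the term " PowerShot " to "PowerShot" without touching its offsets, so when the graph filter then computes sub-token offsets relative to the trimmed term, they drift away from the original text; with the adjust flag set to false the sub-tokens simply inherit the parent token's offsets.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsSketch {

    public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
        for (boolean adjustInternalOffsets : new boolean[] { true, false }) {
            // KeywordTokenizer emits the whole input " PowerShot " as one token
            // with offsets [0,11]; TrimFilter trims the term to "PowerShot" but
            // leaves the offset attribute untouched.
            Tokenizer tokenizer = new KeywordTokenizer();
            tokenizer.setReader(new StringReader(" PowerShot "));
            TokenStream ts = new WordDelimiterGraphFilter(new TrimFilter(tokenizer),
                adjustInternalOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                flags, null);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // adjusted:   Power [0,5],  Shot [5,9]  -- shifted off the text,
                //             since "Power" really sits at [1,6]; in richer chains
                //             this drift can produce offsets that IndexWriter rejects
                // unadjusted: Power [0,11], Shot [0,11] -- the parent token's offsets
                System.out.println(term + " [" + offset.startOffset()
                    + "," + offset.endOffset() + "]");
            }
            ts.end();
            ts.close();
        }
    }
}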
@@ -71,6 +71,15 @@ Advance settings include:
 to a file configured with protected words (one on each line).
 Automatically resolves to `config/` based location if exists.
 
+`adjust_offsets`::
+By default, the filter tries to output subtokens with adjusted offsets
+to reflect their actual position in the token stream. However, when
+used in combination with other filters that alter the length or starting
+position of tokens without changing their offsets
+(e.g. <<analysis-trim-tokenfilter,`trim`>>), this can cause tokens with
+illegal offsets to be emitted. Setting `adjust_offsets` to `false` will
+stop `word_delimiter_graph` from adjusting these internal offsets.
+
 `type_table`::
 A custom type mapping table, for example (when configured
 using `type_table_path`):
@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
     private final byte[] charTypeTable;
     private final int flags;
     private final CharArraySet protoWords;
+    private final boolean adjustOffsets;
 
     public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                 String name, Settings settings) {
@@ -95,11 +96,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
+        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
+        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
     }
 
     @Override
@@ -76,10 +76,35 @@ public void testPartsAndCatenate() throws IOException {
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};
int[] expectedPosLen = new int[]{2, 1, 1};
int[] expectedStartOffsets = new int[]{0, 0, 5};
int[] expectedEndOffsets = new int[]{9, 5, 9};
String[] expected = new String[]{"PowerShot", "Power", "Shot" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
expectedIncr, expectedPosLen, null);
}

public void testAdjustingOffsets() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};
int[] expectedPosLen = new int[]{2, 1, 1};
int[] expectedStartOffsets = new int[]{0, 0, 0};
int[] expectedEndOffsets = new int[]{9, 9, 9};
String[] expected = new String[]{"PowerShot", "Power", "Shot" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
expectedIncr, expectedPosLen, null);
}
}
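
The expectedIncr/expectedPosLen arrays in both tests encode the token graph: with catenate_words the joined token PowerShot spans two positions (posLen 2) while Power and Shot follow at increments 1/0/1; only the offsets differ between the two tests. A sketch under the same assumptions as above (raw Lucene, not part of this commit; split_on_case_change is enabled by default in the Elasticsearch factory, hence the extra flag) that prints both variants:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class WordDelimiterGraphShape {

    public static void main(String[] args) throws Exception {
        // Mirror the test settings: generate_word_parts + catenate_words.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.CATENATE_WORDS;
        for (boolean adjust : new boolean[] { true, false }) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("PowerShot"));
            TokenStream ts = new WordDelimiterGraphFilter(tokenizer, adjust,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute incr = ts.addAttribute(PositionIncrementAttribute.class);
            PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // adjust=true : PowerShot [0,9], Power [0,5], Shot [5,9]
                // adjust=false: PowerShot [0,9], Power [0,9], Shot [0,9]
                // incr/posLen are 1/2, 0/1, 1/1 in both cases
                System.out.printf("%s [%d,%d] incr=%d posLen=%d%n", term,
                    offset.startOffset(), offset.endOffset(),
                    incr.getPositionIncrement(), posLen.getPositionLength());
            }
            ts.end();
            ts.close();
        }
    }
}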
@@ -157,6 +157,26 @@
 - match: { tokens.2.token: brown }
 - match: { tokens.3.token: fox }
 
+- do:
+    indices.analyze:
+      body:
+        text: the qu1ck brown fox
+        tokenizer: standard
+        filter:
+          - type: word_delimiter_graph
+            adjust_offsets: false
+- length: { tokens: 6 }
+- match: { tokens.0.token: the }
+- match: { tokens.1.token: qu }
+- match: { tokens.1.start_offset: 4 }
+- match: { tokens.1.end_offset: 9 }
+- match: { tokens.2.token: "1" }
+- match: { tokens.2.start_offset: 4 }
+- match: { tokens.2.end_offset: 9 }
+- match: { tokens.3.token: ck }
+- match: { tokens.3.start_offset: 4 }
+- match: { tokens.3.end_offset: 9 }
+
 - do:
     indices.analyze:
       body:
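
The REST expectations above can be reproduced at the Lucene level too. A sketch under the same assumptions, using a whitespace tokenizer (which yields the same four tokens and offsets as standard for this input) plus the number/letter splitting flags that the Elasticsearch factory enables by default: all three parts of "qu1ck" keep the parent token's [4,9] offsets.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class UnadjustedOffsetsSketch {

    public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the qu1ck brown fox"));
        // adjustInternalOffsets=false, as set by adjust_offsets: false above
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, false,
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // the [0,3], qu [4,9], 1 [4,9], ck [4,9], brown [10,15], fox [16,19]:
            // the parts of "qu1ck" inherit the parent offsets, where adjusted
            // offsets would have been qu [4,6], 1 [6,7], ck [7,9]
            System.out.println(term + " [" + offset.startOffset()
                + "," + offset.endOffset() + "]");
        }
        ts.end();
        ts.close();
    }
}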
