
Commit

Fix escaping special characters
Use WhitespaceTokenizer instead of StandardTokenizer
https://stackoverflow.com/a/6119584/21694752
LoayGhreeb committed Aug 28, 2024
1 parent 909afe4 commit bfa1356
Showing 9 changed files with 33 additions and 35 deletions.
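Why the tokenizer swap fixes escaping: StandardTokenizer segments text at punctuation, so a character such as '/' never becomes part of an indexed term, and an escaped query like "groups:\/exclude" can never match. WhitespaceTokenizer splits on whitespace only and keeps special characters inside the term. A minimal standalone sketch (not part of this commit; the class name is illustrative) comparing the two:

// Sketch: compare how the two Lucene tokenizers split "groups:/exclude".
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerComparison {

    public static void main(String[] args) throws IOException {
        printTokens(new StandardTokenizer(), "groups:/exclude");   // groups, exclude - the '/' is dropped
        printTokens(new WhitespaceTokenizer(), "groups:/exclude"); // groups:/exclude - kept intact
    }

    private static void printTokens(Tokenizer tokenizer, String text) throws IOException {
        tokenizer.setReader(new StringReader(text));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}

The trade-off is that punctuation attached to a word is no longer split off; the n-gram indexing analyzer changed below may offset this for substring matches.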
@@ -65,7 +65,7 @@ public DefaultLinkedFilesIndexer(BibDatabaseContext databaseContext, FilePrefere
         this.indexedFiles = new ConcurrentHashMap<>();

         indexDirectoryPath = databaseContext.getFulltextIndexPath();
-        IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Standard_ANALYZER);
+        IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Whitespace_ANALYZER);
         if ("unsaved".equals(indexDirectoryPath.getFileName().toString())) {
             config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
             indexDirectoryPath = indexDirectoryPath.resolveSibling("unsaved" + NUMBER_OF_UNSAVED_LIBRARIES++);
13 changes: 1 addition & 12 deletions src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java
@@ -30,8 +30,6 @@ public class SearchToLuceneVisitor extends SearchBaseVisitor<QueryNode> {

     private final boolean isRegularExpression;

-    private boolean isNegation = false;
-
     public SearchToLuceneVisitor(boolean isRegularExpression) {
         this.isRegularExpression = isRegularExpression;
     }
@@ -45,15 +43,13 @@ public QueryNode visitStart(SearchParser.StartContext ctx) {
         // See https://github.com/LoayGhreeb/lucene-mwe/issues/1 for more details
         if (result instanceof ModifierQueryNode modifierQueryNode) {
             if (modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT) {
-                isNegation = true;
                 return new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), modifierQueryNode));
             }
         }

         // User might search for NOT this AND NOT that - we also need to convert properly
         if (result instanceof AndQueryNode andQueryNode) {
             if (andQueryNode.getChildren().stream().allMatch(child -> child instanceof ModifierQueryNode modifierQueryNode && modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT)) {
-                isNegation = true;
                 List<QueryNode> children = andQueryNode.getChildren().stream()
                         // prepend "all:* AND" to each child
                         .map(child -> new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), child)))
@@ -108,7 +104,7 @@ public QueryNode visitComparison(SearchParser.ComparisonContext context) {
                 context.EQUAL() != null ||
                 context.EEQUAL() != null) { // exact match
             if (LOGGER.isDebugEnabled() && context.EEQUAL() != null) {
-                LOGGER.warn("Exact match is currently supported by Lucene, using contains instead. Term: {}", context.getText());
+                LOGGER.warn("Exact match is currently not supported by Lucene, using contains instead. Term: {}", context.getText());
             }
             return getFieldQueryNode(field, right, startIndex, stopIndex);
         }
@@ -139,11 +135,4 @@ private QueryNode getFieldQueryNode(String field, String term, int startIndex, i
         }
         return new FieldQueryNode(field, term, startIndex, stopIndex);
     }
-
-    /**
-     * Returns whether the search query is a negation (and was patched to be a filter).
-     */
-    public boolean isNegation() {
-        return this.isNegation;
-    }
 }
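Background on the rewrite above: a purely negative Lucene query matches nothing, because there is no positive clause to select documents from, so the visitor prepends a match-all "all:*" clause to turn the NOT into a filter. A standalone sketch (not JabRef code) showing the parsed forms:

// Sketch: why pure negations need a positive clause.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;

public class NegationDemo {

    public static void main(String[] args) throws ParseException {
        QueryParser parser = new QueryParser("all", new WhitespaceAnalyzer());
        parser.setAllowLeadingWildcard(true); // "all:*" starts with a wildcard

        // A purely negative query has no positive clause and matches no documents:
        System.out.println(parser.parse("-title:chocolate"));           // -title:chocolate
        // The rewritten form keeps a match-all clause, so NOT acts as a filter:
        System.out.println(parser.parse("all:* AND -title:chocolate")); // +all:* -title:chocolate
    }
}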
4 changes: 2 additions & 2 deletions src/main/java/org/jabref/model/search/NGramAnalyzer.java
@@ -6,9 +6,9 @@
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;

 public class NGramAnalyzer extends Analyzer {
     private final int minGram;
@@ -23,7 +23,7 @@ public NGramAnalyzer(int minGram, int maxGram, CharArraySet stopWords) {

     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new StandardTokenizer();
+        Tokenizer source = new WhitespaceTokenizer();
         TokenStream result = new LowerCaseFilter(source);
         result = new StopFilter(result, stopWords);
         result = new ASCIIFoldingFilter(result);
@@ -17,7 +17,7 @@ public enum SearchFieldConstants {
     PAGE_NUMBER("pageNumber"),
     MODIFIED("modified");

-    public static final Analyzer Standard_ANALYZER = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
+    public static final Analyzer Whitespace_ANALYZER = new WhitespaceAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
     public static final Analyzer NGram_Analyzer_For_INDEXING = new NGramAnalyzer(1, Integer.MAX_VALUE, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
     public static final List<String> PDF_FIELDS = List.of(PATH.toString(), CONTENT.toString(), ANNOTATIONS.toString());
     private final String field;
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/model/search/SearchQuery.java
@@ -83,7 +83,7 @@ public SearchQuery(String query, EnumSet<SearchFlags> searchFlags) {
             query = '/' + query + '/';
         }

-        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Standard_ANALYZER, boosts);
+        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Whitespace_ANALYZER, boosts);
         queryParser.setAllowLeadingWildcard(true);

         try {
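MultiFieldQueryParser expands an unfielded term over every searched field and applies per-field boosts; only the analyzer changes in this commit. A sketch with assumed field names and boost values (JabRef's actual arrays are defined elsewhere in this class):

// Sketch: field names and boosts are illustrative, not JabRef's real configuration.
import java.util.Map;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;

public class MultiFieldDemo {

    public static void main(String[] args) throws ParseException {
        Map<String, Float> boosts = Map.of("title", 4.0f, "abstract", 1.0f);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] {"title", "abstract"}, new WhitespaceAnalyzer(), boosts);
        parser.setAllowLeadingWildcard(true);

        // The unfielded term is expanded to each field with its boost applied,
        // printing roughly: (title:chocolate)^4.0 (abstract:chocolate)^1.0
        System.out.println(parser.parse("chocolate"));
    }
}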
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/model/search/SearchResult.java
@@ -76,7 +76,7 @@ public int getPageNumber() {
     }

     private static List<String> getHighlighterFragments(Highlighter highlighter, SearchFieldConstants field, String content) {
-        try (TokenStream contentStream = SearchFieldConstants.Standard_ANALYZER.tokenStream(field.toString(), content)) {
+        try (TokenStream contentStream = SearchFieldConstants.Whitespace_ANALYZER.tokenStream(field.toString(), content)) {
             TextFragment[] frags = highlighter.getBestTextFragments(contentStream, content, true, 10);
             return Arrays.stream(frags).map(TextFragment::toString).toList();
         } catch (IOException | InvalidTokenOffsetsException e) {
@@ -6,18 +6,18 @@
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;

-public class StandardAnalyzer extends Analyzer {
+public class WhitespaceAnalyzer extends Analyzer {
     private final CharArraySet stopWords;
-    public StandardAnalyzer(CharArraySet stopWords) {
+    public WhitespaceAnalyzer(CharArraySet stopWords) {
         this.stopWords = stopWords;
     }

     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new StandardTokenizer();
+        Tokenizer source = new WhitespaceTokenizer();
         TokenStream result = new LowerCaseFilter(source);
         result = new StopFilter(result, stopWords);
         result = new ASCIIFoldingFilter(result);
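The filter chain is unchanged: whitespace tokenization, then lowercasing, stop-word removal, and ASCII folding, which maps accented characters to ASCII equivalents. That folding is what lets "breitenbucher" match "breitenbücher" in the tests below. A sketch exercising the renamed analyzer (assuming it lives in org.jabref.model.search next to NGramAnalyzer; the commit does not show the package):

// Sketch: run the commit's WhitespaceAnalyzer chain over accented input.
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.jabref.model.search.WhitespaceAnalyzer; // assumed package

public class FoldingDemo {

    public static void main(String[] args) throws IOException {
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        try (TokenStream stream = analyzer.tokenStream("all", "Breitenbücher jesús")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // breitenbucher, then jesus
            }
            stream.end();
        }
    }
}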
33 changes: 20 additions & 13 deletions src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java
@@ -1,38 +1,45 @@
 package org.jabref.logic.search;

-import java.util.EnumSet;
 import java.util.stream.Stream;

-import org.jabref.model.search.SearchFlags;
-import org.jabref.model.search.SearchQuery;
+import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.layout.format.LatexToUnicodeFormatter;
+import org.jabref.model.search.SearchFieldConstants;

+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 public class LuceneQueryParserTest {
+    private static final Formatter FORMATTER = new LatexToUnicodeFormatter();

     public static Stream<Arguments> searchQuires() {
         return Stream.of(
                 // unicode
-                Arguments.of("preissinger", "preißinger"),
-                Arguments.of("jesus", "jesús"),
-                Arguments.of("breitenbucher", "breitenbücher"),
+                Arguments.of("all:preissinger", "preißinger"),
+                Arguments.of("all:jesus", "jesús"),
+                Arguments.of("all:breitenbucher", "breitenbücher"),

                 // latex
-                Arguments.of("preissinger", "prei{\\ss}inger"),
-                Arguments.of("jesus", "jes{\\'{u}}s"),
-                Arguments.of("breitenbucher", "breitenb{\\\"{u}}cher")
+                Arguments.of("all:preissinger", "prei{\\ss}inger"),
+                Arguments.of("all:jesus", "jes{\\'{u}}s"),
+                Arguments.of("all:breitenbucher", "breitenb{\\\"{u}}cher"),
+
+                Arguments.of("groups:/exclude", "groups:\\/exclude")
         );
     }

     @ParameterizedTest
     @MethodSource
-    void searchQuires(String expected, String query) {
-        expected = "(all:" + expected + ")^4.0";
-        SearchQuery searchQuery = new SearchQuery(query, EnumSet.noneOf(SearchFlags.class));
-        assertEquals(expected, searchQuery.getParsedQuery().toString());
+    void searchQuires(String expected, String query) throws ParseException {
+        QueryParser parser = new QueryParser(SearchFieldConstants.DEFAULT_FIELD.toString(), new WhitespaceAnalyzer());
+        query = FORMATTER.format(query);
+        String result = parser.parse(query).toString();
+        assertEquals(expected, result);
     }
 }
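The new "groups:/exclude" case asserts that an escaped slash survives parsing. Since the whitespace analyzer keeps the character in the term, QueryParser.escape can be used to search literally for values containing Lucene syntax characters. A small usage sketch (standalone, not in the commit):

// Sketch: QueryParser.escape backslash-escapes Lucene syntax characters such as '/'.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;

public class EscapeDemo {

    public static void main(String[] args) throws ParseException {
        QueryParser parser = new QueryParser("all", new WhitespaceAnalyzer());
        String escaped = QueryParser.escape("/exclude"); // -> \/exclude
        // The tokenizer keeps the '/', so the query targets the literal term:
        System.out.println(parser.parse("groups:" + escaped)); // groups:/exclude
    }
}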
@@ -17,6 +17,7 @@ public static Stream<Arguments> transformationNormal() {

                 Arguments.of("title:chocolate", "title=chocolate"),
                 Arguments.of("title:chocolate OR author:smith", "title = chocolate or author = smith"),
+                Arguments.of("groups:\\/exclude", "groups= /exclude"),
                 Arguments.of("title:chocolate AND author:smith", "title = \"chocolate\" AND author = \"smith\""),
                 Arguments.of("title:chocolate AND author:smith", "title contains \"chocolate\" AND author matches \"smith\""),
                 Arguments.of("( title:chocolate ) OR ( author:smith )", "(title == chocolate) or (author == smith)"),
@@ -26,6 +27,7 @@ public static Stream<Arguments> transformationNormal() {
                 Arguments.of("abstract:model\\{1,2\\}ing", "abstract = model{1,2}ing"),
                 Arguments.of("all:* AND -title:chocolate", "title != chocolate"),
                 Arguments.of("all:* AND -title:chocolate", "not title contains chocolate"),
+                Arguments.of("groups=:\\:paywall AND -file=\"\" AND -groups=\\/exclude", "groups=:paywall and file!=\"\" and groups!=/exclude"),

                 // not converted, because not working in JabRef 5.x
                 // Arguments.of("title:\"image processing\" OR keywords:\"image processing\"", "title|keywords = \"image processing\""),
