From bfa13565fd3affa44b4b539de15c8c79b5a9542c Mon Sep 17 00:00:00 2001 From: Loay Ghreeb Date: Wed, 28 Aug 2024 09:28:48 +0300 Subject: [PATCH] Fix escaping special characters Use WhitespaceTokenizer instead of StandardTokenizer https://stackoverflow.com/a/6119584/21694752 --- .../indexing/DefaultLinkedFilesIndexer.java | 2 +- .../migrations/SearchToLuceneVisitor.java | 13 +------- .../jabref/model/search/NGramAnalyzer.java | 4 +-- .../model/search/SearchFieldConstants.java | 2 +- .../org/jabref/model/search/SearchQuery.java | 2 +- .../org/jabref/model/search/SearchResult.java | 2 +- ...dAnalyzer.java => WhitespaceAnalyzer.java} | 8 ++--- .../logic/search/LuceneQueryParserTest.java | 33 +++++++++++-------- .../SearchToLuceneMigrationTest.java | 2 ++ 9 files changed, 33 insertions(+), 35 deletions(-) rename src/main/java/org/jabref/model/search/{StandardAnalyzer.java => WhitespaceAnalyzer.java} (77%) diff --git a/src/main/java/org/jabref/logic/search/indexing/DefaultLinkedFilesIndexer.java b/src/main/java/org/jabref/logic/search/indexing/DefaultLinkedFilesIndexer.java index 98bc712ce7f..9d4c6ad5b2e 100644 --- a/src/main/java/org/jabref/logic/search/indexing/DefaultLinkedFilesIndexer.java +++ b/src/main/java/org/jabref/logic/search/indexing/DefaultLinkedFilesIndexer.java @@ -65,7 +65,7 @@ public DefaultLinkedFilesIndexer(BibDatabaseContext databaseContext, FilePrefere this.indexedFiles = new ConcurrentHashMap<>(); indexDirectoryPath = databaseContext.getFulltextIndexPath(); - IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Standard_ANALYZER); + IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Whitespace_ANALYZER); if ("unsaved".equals(indexDirectoryPath.getFileName().toString())) { config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); indexDirectoryPath = indexDirectoryPath.resolveSibling("unsaved" + NUMBER_OF_UNSAVED_LIBRARIES++); diff --git a/src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java b/src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java index 56ad9e743f9..5ddaa260d8b 100644 --- a/src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java +++ b/src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java @@ -30,8 +30,6 @@ public class SearchToLuceneVisitor extends SearchBaseVisitor { private final boolean isRegularExpression; - private boolean isNegation = false; - public SearchToLuceneVisitor(boolean isRegularExpression) { this.isRegularExpression = isRegularExpression; } @@ -45,7 +43,6 @@ public QueryNode visitStart(SearchParser.StartContext ctx) { // See https://github.com/LoayGhreeb/lucene-mwe/issues/1 for more details if (result instanceof ModifierQueryNode modifierQueryNode) { if (modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT) { - isNegation = true; return new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), modifierQueryNode)); } } @@ -53,7 +50,6 @@ public QueryNode visitStart(SearchParser.StartContext ctx) { // User might search for NOT this AND NOT that - we also need to convert properly if (result instanceof AndQueryNode andQueryNode) { if (andQueryNode.getChildren().stream().allMatch(child -> child instanceof ModifierQueryNode modifierQueryNode && modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT)) { - isNegation = true; List children = andQueryNode.getChildren().stream() // prepend "all:* AND" to each child .map(child -> new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), child))) @@ -108,7 +104,7 @@ public QueryNode visitComparison(SearchParser.ComparisonContext context) { context.EQUAL() != null || context.EEQUAL() != null) { // exact match if (LOGGER.isDebugEnabled() && context.EEQUAL() != null) { - LOGGER.warn("Exact match is currently supported by Lucene, using contains instead. Term: {}", context.getText()); + LOGGER.warn("Exact match is currently not supported by Lucene, using contains instead. Term: {}", context.getText()); } return getFieldQueryNode(field, right, startIndex, stopIndex); } @@ -139,11 +135,4 @@ private QueryNode getFieldQueryNode(String field, String term, int startIndex, i } return new FieldQueryNode(field, term, startIndex, stopIndex); } - - /** - * Returns whether the search query is a negation (and was patched to be a filter). - */ - public boolean isNegation() { - return this.isNegation; - } } diff --git a/src/main/java/org/jabref/model/search/NGramAnalyzer.java b/src/main/java/org/jabref/model/search/NGramAnalyzer.java index d226b887d4b..206e1568619 100644 --- a/src/main/java/org/jabref/model/search/NGramAnalyzer.java +++ b/src/main/java/org/jabref/model/search/NGramAnalyzer.java @@ -6,9 +6,9 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; public class NGramAnalyzer extends Analyzer { private final int minGram; @@ -23,7 +23,7 @@ public NGramAnalyzer(int minGram, int maxGram, CharArraySet stopWords) { @Override protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer source = new StandardTokenizer(); + Tokenizer source = new WhitespaceTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopWords); result = new ASCIIFoldingFilter(result); diff --git a/src/main/java/org/jabref/model/search/SearchFieldConstants.java b/src/main/java/org/jabref/model/search/SearchFieldConstants.java index 77b5f7cb957..ae37feb9475 100644 --- a/src/main/java/org/jabref/model/search/SearchFieldConstants.java +++ b/src/main/java/org/jabref/model/search/SearchFieldConstants.java @@ -17,7 +17,7 @@ public enum SearchFieldConstants { PAGE_NUMBER("pageNumber"), MODIFIED("modified"); - public static final Analyzer Standard_ANALYZER = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); + public static final Analyzer Whitespace_ANALYZER = new WhitespaceAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); public static final Analyzer NGram_Analyzer_For_INDEXING = new NGramAnalyzer(1, Integer.MAX_VALUE, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); public static final List PDF_FIELDS = List.of(PATH.toString(), CONTENT.toString(), ANNOTATIONS.toString()); private final String field; diff --git a/src/main/java/org/jabref/model/search/SearchQuery.java b/src/main/java/org/jabref/model/search/SearchQuery.java index 674c502d3df..2ea0840de2c 100644 --- a/src/main/java/org/jabref/model/search/SearchQuery.java +++ b/src/main/java/org/jabref/model/search/SearchQuery.java @@ -83,7 +83,7 @@ public SearchQuery(String query, EnumSet searchFlags) { query = '/' + query + '/'; } - MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Standard_ANALYZER, boosts); + MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Whitespace_ANALYZER, boosts); queryParser.setAllowLeadingWildcard(true); try { diff --git a/src/main/java/org/jabref/model/search/SearchResult.java b/src/main/java/org/jabref/model/search/SearchResult.java index c2a76c8fc63..98c0bac5410 100644 --- a/src/main/java/org/jabref/model/search/SearchResult.java +++ b/src/main/java/org/jabref/model/search/SearchResult.java @@ -76,7 +76,7 @@ public int getPageNumber() { } private static List getHighlighterFragments(Highlighter highlighter, SearchFieldConstants field, String content) { - try (TokenStream contentStream = SearchFieldConstants.Standard_ANALYZER.tokenStream(field.toString(), content)) { + try (TokenStream contentStream = SearchFieldConstants.Whitespace_ANALYZER.tokenStream(field.toString(), content)) { TextFragment[] frags = highlighter.getBestTextFragments(contentStream, content, true, 10); return Arrays.stream(frags).map(TextFragment::toString).toList(); } catch (IOException | InvalidTokenOffsetsException e) { diff --git a/src/main/java/org/jabref/model/search/StandardAnalyzer.java b/src/main/java/org/jabref/model/search/WhitespaceAnalyzer.java similarity index 77% rename from src/main/java/org/jabref/model/search/StandardAnalyzer.java rename to src/main/java/org/jabref/model/search/WhitespaceAnalyzer.java index cf50b518ced..c25b1852680 100644 --- a/src/main/java/org/jabref/model/search/StandardAnalyzer.java +++ b/src/main/java/org/jabref/model/search/WhitespaceAnalyzer.java @@ -6,18 +6,18 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; -public class StandardAnalyzer extends Analyzer { +public class WhitespaceAnalyzer extends Analyzer { private final CharArraySet stopWords; - public StandardAnalyzer(CharArraySet stopWords) { + public WhitespaceAnalyzer(CharArraySet stopWords) { this.stopWords = stopWords; } @Override protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer source = new StandardTokenizer(); + Tokenizer source = new WhitespaceTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopWords); result = new ASCIIFoldingFilter(result); diff --git a/src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java b/src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java index 9a471e305bc..7580d9f9899 100644 --- a/src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java +++ b/src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java @@ -1,11 +1,14 @@ package org.jabref.logic.search; -import java.util.EnumSet; import java.util.stream.Stream; -import org.jabref.model.search.SearchFlags; -import org.jabref.model.search.SearchQuery; +import org.jabref.logic.cleanup.Formatter; +import org.jabref.logic.layout.format.LatexToUnicodeFormatter; +import org.jabref.model.search.SearchFieldConstants; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -13,26 +16,30 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class LuceneQueryParserTest { + private static final Formatter FORMATTER = new LatexToUnicodeFormatter(); public static Stream searchQuires() { return Stream.of( // unicode - Arguments.of("preissinger", "preißinger"), - Arguments.of("jesus", "jesús"), - Arguments.of("breitenbucher", "breitenbücher"), + Arguments.of("all:preissinger", "preißinger"), + Arguments.of("all:jesus", "jesús"), + Arguments.of("all:breitenbucher", "breitenbücher"), // latex - Arguments.of("preissinger", "prei{\\ss}inger"), - Arguments.of("jesus", "jes{\\'{u}}s"), - Arguments.of("breitenbucher", "breitenb{\\\"{u}}cher") + Arguments.of("all:preissinger", "prei{\\ss}inger"), + Arguments.of("all:jesus", "jes{\\'{u}}s"), + Arguments.of("all:breitenbucher", "breitenb{\\\"{u}}cher"), + + Arguments.of("groups:/exclude", "groups:\\/exclude") ); } @ParameterizedTest @MethodSource - void searchQuires(String expected, String query) { - expected = "(all:" + expected + ")^4.0"; - SearchQuery searchQuery = new SearchQuery(query, EnumSet.noneOf(SearchFlags.class)); - assertEquals(expected, searchQuery.getParsedQuery().toString()); + void searchQuires(String expected, String query) throws ParseException { + QueryParser parser = new QueryParser(SearchFieldConstants.DEFAULT_FIELD.toString(), new WhitespaceAnalyzer()); + query = FORMATTER.format(query); + String result = parser.parse(query).toString(); + assertEquals(expected, result); } } diff --git a/src/test/java/org/jabref/migrations/SearchToLuceneMigrationTest.java b/src/test/java/org/jabref/migrations/SearchToLuceneMigrationTest.java index 4cefc0b5c08..6b6a2732e1c 100644 --- a/src/test/java/org/jabref/migrations/SearchToLuceneMigrationTest.java +++ b/src/test/java/org/jabref/migrations/SearchToLuceneMigrationTest.java @@ -17,6 +17,7 @@ public static Stream transformationNormal() { Arguments.of("title:chocolate", "title=chocolate"), Arguments.of("title:chocolate OR author:smith", "title = chocolate or author = smith"), + Arguments.of("groups:\\/exclude", "groups= /exclude"), Arguments.of("title:chocolate AND author:smith", "title = \"chocolate\" AND author = \"smith\""), Arguments.of("title:chocolate AND author:smith", "title contains \"chocolate\" AND author matches \"smith\""), Arguments.of("( title:chocolate ) OR ( author:smith )", "(title == chocolate) or (author == smith)"), @@ -26,6 +27,7 @@ public static Stream transformationNormal() { Arguments.of("abstract:model\\{1,2\\}ing", "abstract = model{1,2}ing"), Arguments.of("all:* AND -title:chocolate", "title != chocolate"), Arguments.of("all:* AND -title:chocolate", "not title contains chocolate"), + Arguments.of("groups=:\\:paywall AND -file=\"\" AND -groups=\\/exclude", "groups=:paywall and file!=\"\" and groups!=/exclude"), // not converted, because not working in JabRef 5.x // Arguments.of("title:\"image processing\" OR keywords:\"image processing\"", "title|keywords = \"image processing\""),