
Commit

Fix escaping special characters
Use WhitespaceTokenizer instead of StandardTokenizer
https://stackoverflow.com/a/6119584/21694752
LoayGhreeb committed Aug 28, 2024
1 parent 909afe4 commit bfa1356
Showing 9 changed files with 33 additions and 35 deletions.
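Why the tokenizer swap fixes escaping: StandardTokenizer segments text at punctuation, so a character such as '/' never becomes part of an indexed term, and an escaped query like "groups:\/exclude" can never match. WhitespaceTokenizer splits on whitespace only and keeps special characters inside the term. A minimal standalone sketch (not part of this commit; the class name is illustrative) comparing the two:

// Sketch: compare how the two Lucene tokenizers split "groups:/exclude".
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerComparison {

    public static void main(String[] args) throws IOException {
        printTokens(new StandardTokenizer(), "groups:/exclude");   // groups, exclude - the '/' is dropped
        printTokens(new WhitespaceTokenizer(), "groups:/exclude"); // groups:/exclude - kept intact
    }

    private static void printTokens(Tokenizer tokenizer, String text) throws IOException {
        tokenizer.setReader(new StringReader(text));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}

The trade-off is that punctuation attached to a word is no longer split off; the n-gram indexing analyzer changed below may offset this for substring matches.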
@@ -65,7 +65,7 @@ public DefaultLinkedFilesIndexer(BibDatabaseContext databaseContext, FilePrefere
         this.indexedFiles = new ConcurrentHashMap<>();

         indexDirectoryPath = databaseContext.getFulltextIndexPath();
-        IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Standard_ANALYZER);
+        IndexWriterConfig config = new IndexWriterConfig(SearchFieldConstants.Whitespace_ANALYZER);
         if ("unsaved".equals(indexDirectoryPath.getFileName().toString())) {
             config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
             indexDirectoryPath = indexDirectoryPath.resolveSibling("unsaved" + NUMBER_OF_UNSAVED_LIBRARIES++);
13 changes: 1 addition & 12 deletions src/main/java/org/jabref/migrations/SearchToLuceneVisitor.java
@@ -30,8 +30,6 @@ public class SearchToLuceneVisitor extends SearchBaseVisitor<QueryNode> {

     private final boolean isRegularExpression;

-    private boolean isNegation = false;
-
     public SearchToLuceneVisitor(boolean isRegularExpression) {
         this.isRegularExpression = isRegularExpression;
     }
@@ -45,15 +43,13 @@ public QueryNode visitStart(SearchParser.StartContext ctx) {
         // See https://github.com/LoayGhreeb/lucene-mwe/issues/1 for more details
         if (result instanceof ModifierQueryNode modifierQueryNode) {
             if (modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT) {
-                isNegation = true;
                 return new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), modifierQueryNode));
             }
         }

         // User might search for NOT this AND NOT that - we also need to convert properly
         if (result instanceof AndQueryNode andQueryNode) {
             if (andQueryNode.getChildren().stream().allMatch(child -> child instanceof ModifierQueryNode modifierQueryNode && modifierQueryNode.getModifier() == ModifierQueryNode.Modifier.MOD_NOT)) {
-                isNegation = true;
                 List<QueryNode> children = andQueryNode.getChildren().stream()
                         // prepend "all:* AND" to each child
                         .map(child -> new AndQueryNode(List.of(new FieldQueryNode(SearchFieldConstants.DEFAULT_FIELD.toString(), "*", 0, 0), child)))
@@ -108,7 +104,7 @@ public QueryNode visitComparison(SearchParser.ComparisonContext context) {
                 context.EQUAL() != null ||
                 context.EEQUAL() != null) { // exact match
             if (LOGGER.isDebugEnabled() && context.EEQUAL() != null) {
-                LOGGER.warn("Exact match is currently supported by Lucene, using contains instead. Term: {}", context.getText());
+                LOGGER.warn("Exact match is currently not supported by Lucene, using contains instead. Term: {}", context.getText());
             }
             return getFieldQueryNode(field, right, startIndex, stopIndex);
         }
@@ -139,11 +135,4 @@ private QueryNode getFieldQueryNode(String field, String term, int startIndex, i
         }
         return new FieldQueryNode(field, term, startIndex, stopIndex);
     }
-
-    /**
-     * Returns whether the search query is a negation (and was patched to be a filter).
-     */
-    public boolean isNegation() {
-        return this.isNegation;
-    }
 }
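Background on the rewrite above: a purely negative Lucene query matches nothing, because there is no positive clause to select documents from, so the visitor prepends a match-all "all:*" clause to turn the NOT into a filter. A standalone sketch (not JabRef code) showing the parsed forms:

// Sketch: why pure negations need a positive clause.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;

public class NegationDemo {

    public static void main(String[] args) throws ParseException {
        QueryParser parser = new QueryParser("all", new WhitespaceAnalyzer());
        parser.setAllowLeadingWildcard(true); // "all:*" starts with a wildcard

        // A purely negative query has no positive clause and matches no documents:
        System.out.println(parser.parse("-title:chocolate"));           // -title:chocolate
        // The rewritten form keeps a match-all clause, so NOT acts as a filter:
        System.out.println(parser.parse("all:* AND -title:chocolate")); // +all:* -title:chocolate
    }
}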
4 changes: 2 additions & 2 deletions src/main/java/org/jabref/model/search/NGramAnalyzer.java
@@ -6,9 +6,9 @@
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;

 public class NGramAnalyzer extends Analyzer {
     private final int minGram;
@@ -23,7 +23,7 @@ public NGramAnalyzer(int minGram, int maxGram, CharArraySet stopWords) {

     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new StandardTokenizer();
+        Tokenizer source = new WhitespaceTokenizer();
         TokenStream result = new LowerCaseFilter(source);
         result = new StopFilter(result, stopWords);
         result = new ASCIIFoldingFilter(result);
@@ -17,7 +17,7 @@ public enum SearchFieldConstants {
     PAGE_NUMBER("pageNumber"),
     MODIFIED("modified");

-    public static final Analyzer Standard_ANALYZER = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
+    public static final Analyzer Whitespace_ANALYZER = new WhitespaceAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
     public static final Analyzer NGram_Analyzer_For_INDEXING = new NGramAnalyzer(1, Integer.MAX_VALUE, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
     public static final List<String> PDF_FIELDS = List.of(PATH.toString(), CONTENT.toString(), ANNOTATIONS.toString());
     private final String field;
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/model/search/SearchQuery.java
@@ -83,7 +83,7 @@ public SearchQuery(String query, EnumSet<SearchFlags> searchFlags) {
             query = '/' + query + '/';
         }

-        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Standard_ANALYZER, boosts);
+        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fieldsToSearchArray, SearchFieldConstants.Whitespace_ANALYZER, boosts);
         queryParser.setAllowLeadingWildcard(true);

         try {
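MultiFieldQueryParser expands an unfielded term over every searched field and applies per-field boosts; only the analyzer changes in this commit. A sketch with assumed field names and boost values (JabRef's actual arrays are defined elsewhere in this class):

// Sketch: field names and boosts are illustrative, not JabRef's real configuration.
import java.util.Map;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;

public class MultiFieldDemo {

    public static void main(String[] args) throws ParseException {
        Map<String, Float> boosts = Map.of("title", 4.0f, "abstract", 1.0f);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] {"title", "abstract"}, new WhitespaceAnalyzer(), boosts);
        parser.setAllowLeadingWildcard(true);

        // The unfielded term is expanded to each field with its boost applied,
        // printing roughly: (title:chocolate)^4.0 (abstract:chocolate)^1.0
        System.out.println(parser.parse("chocolate"));
    }
}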
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/model/search/SearchResult.java
@@ -76,7 +76,7 @@ public int getPageNumber() {
     }

     private static List<String> getHighlighterFragments(Highlighter highlighter, SearchFieldConstants field, String content) {
-        try (TokenStream contentStream = SearchFieldConstants.Standard_ANALYZER.tokenStream(field.toString(), content)) {
+        try (TokenStream contentStream = SearchFieldConstants.Whitespace_ANALYZER.tokenStream(field.toString(), content)) {
             TextFragment[] frags = highlighter.getBestTextFragments(contentStream, content, true, 10);
             return Arrays.stream(frags).map(TextFragment::toString).toList();
         } catch (IOException | InvalidTokenOffsetsException e) {
@@ -6,18 +6,18 @@
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;

-public class StandardAnalyzer extends Analyzer {
+public class WhitespaceAnalyzer extends Analyzer {
     private final CharArraySet stopWords;
-    public StandardAnalyzer(CharArraySet stopWords) {
+    public WhitespaceAnalyzer(CharArraySet stopWords) {
         this.stopWords = stopWords;
     }

     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new StandardTokenizer();
+        Tokenizer source = new WhitespaceTokenizer();
         TokenStream result = new LowerCaseFilter(source);
         result = new StopFilter(result, stopWords);
         result = new ASCIIFoldingFilter(result);
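The filter chain is unchanged: whitespace tokenization, then lowercasing, stop-word removal, and ASCII folding, which maps accented characters to ASCII equivalents. That folding is what lets "breitenbucher" match "breitenbücher" in the tests below. A sketch exercising the renamed analyzer (assuming it lives in org.jabref.model.search next to NGramAnalyzer; the commit does not show the package):

// Sketch: run the commit's WhitespaceAnalyzer chain over accented input.
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.jabref.model.search.WhitespaceAnalyzer; // assumed package

public class FoldingDemo {

    public static void main(String[] args) throws IOException {
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        try (TokenStream stream = analyzer.tokenStream("all", "Breitenbücher jesús")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // breitenbucher, then jesus
            }
            stream.end();
        }
    }
}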
33 changes: 20 additions & 13 deletions src/test/java/org/jabref/logic/search/LuceneQueryParserTest.java
@@ -1,38 +1,45 @@
 package org.jabref.logic.search;

-import java.util.EnumSet;
 import java.util.stream.Stream;

-import org.jabref.model.search.SearchFlags;
-import org.jabref.model.search.SearchQuery;
+import org.jabref.logic.cleanup.Formatter;
+import org.jabref.logic.layout.format.LatexToUnicodeFormatter;
+import org.jabref.model.search.SearchFieldConstants;

+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 public class LuceneQueryParserTest {
+    private static final Formatter FORMATTER = new LatexToUnicodeFormatter();

     public static Stream<Arguments> searchQuires() {
         return Stream.of(
                 // unicode
-                Arguments.of("preissinger", "preißinger"),
-                Arguments.of("jesus", "jesús"),
-                Arguments.of("breitenbucher", "breitenbücher"),
+                Arguments.of("all:preissinger", "preißinger"),
+                Arguments.of("all:jesus", "jesús"),
+                Arguments.of("all:breitenbucher", "breitenbücher"),

                 // latex
-                Arguments.of("preissinger", "prei{\\ss}inger"),
-                Arguments.of("jesus", "jes{\\'{u}}s"),
-                Arguments.of("breitenbucher", "breitenb{\\\"{u}}cher")
+                Arguments.of("all:preissinger", "prei{\\ss}inger"),
+                Arguments.of("all:jesus", "jes{\\'{u}}s"),
+                Arguments.of("all:breitenbucher", "breitenb{\\\"{u}}cher"),
+
+                Arguments.of("groups:/exclude", "groups:\\/exclude")
         );
     }

     @ParameterizedTest
     @MethodSource
-    void searchQuires(String expected, String query) {
-        expected = "(all:" + expected + ")^4.0";
-        SearchQuery searchQuery = new SearchQuery(query, EnumSet.noneOf(SearchFlags.class));
-        assertEquals(expected, searchQuery.getParsedQuery().toString());
+    void searchQuires(String expected, String query) throws ParseException {
+        QueryParser parser = new QueryParser(SearchFieldConstants.DEFAULT_FIELD.toString(), new WhitespaceAnalyzer());
+        query = FORMATTER.format(query);
+        String result = parser.parse(query).toString();
+        assertEquals(expected, result);
     }
 }
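The new "groups:/exclude" case asserts that an escaped slash survives parsing. Since the whitespace analyzer keeps the character in the term, QueryParser.escape can be used to search literally for values containing Lucene syntax characters. A small usage sketch (standalone, not in the commit):

// Sketch: QueryParser.escape backslash-escapes Lucene syntax characters such as '/'.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;

public class EscapeDemo {

    public static void main(String[] args) throws ParseException {
        QueryParser parser = new QueryParser("all", new WhitespaceAnalyzer());
        String escaped = QueryParser.escape("/exclude"); // -> \/exclude
        // The tokenizer keeps the '/', so the query targets the literal term:
        System.out.println(parser.parse("groups:" + escaped)); // groups:/exclude
    }
}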
@@ -17,6 +17,7 @@ public static Stream<Arguments> transformationNormal() {

                 Arguments.of("title:chocolate", "title=chocolate"),
                 Arguments.of("title:chocolate OR author:smith", "title = chocolate or author = smith"),
+                Arguments.of("groups:\\/exclude", "groups= /exclude"),
                 Arguments.of("title:chocolate AND author:smith", "title = \"chocolate\" AND author = \"smith\""),
                 Arguments.of("title:chocolate AND author:smith", "title contains \"chocolate\" AND author matches \"smith\""),
                 Arguments.of("( title:chocolate ) OR ( author:smith )", "(title == chocolate) or (author == smith)"),
@@ -26,6 +27,7 @@ public static Stream<Arguments> transformationNormal() {
                 Arguments.of("abstract:model\\{1,2\\}ing", "abstract = model{1,2}ing"),
                 Arguments.of("all:* AND -title:chocolate", "title != chocolate"),
                 Arguments.of("all:* AND -title:chocolate", "not title contains chocolate"),
+                Arguments.of("groups=:\\:paywall AND -file=\"\" AND -groups=\\/exclude", "groups=:paywall and file!=\"\" and groups!=/exclude"),

                 // not converted, because not working in JabRef 5.x
                 // Arguments.of("title:\"image processing\" OR keywords:\"image processing\"", "title|keywords = \"image processing\""),
