Fix charset detection with utf16 and others (#8947)

* Fix charset detection with utf16 and others Fixes #8895 Fixes #8870 * checkstyle * Fix typo in method names * change newlines * get bytes * Set newline character to LF * Revert "get bytes" This reverts commit 1082f8a. * progress * switch line sep to LF * Please work * Try jitpack * Add manual build of icu4j * Check if we have ascii in the list of charsets * fix checkstyle * Update external-libraries.md * Encode with UTF-16BE * Fix umlaut * Hack to get test running * Also compare meta data * Add enforced ignorance of malformed characters Source: http://biercoff.com/malformedinputexception-input-length-1-exception-solution-for-scala-and-java/ Co-authored-by: Christoph <siedlerkiller@gmail.com> * checkstyle * IntelliJ now also renders the file correctly * Add test Additionally - Replace unknown characters - Remove obsolete wrapping classes in test * Refine CHANGELOG.md * Remove non-working jpackage reference Co-authored-by: Oliver Kopp <kopp.dev@gmail.com> Co-authored-by: Houssem Nasri <housi.housi2015@gmail.com>
JabRef · Jul 11, 2022 · 76850e7 · 76850e7
1 parent cef980d
commit 76850e7
Show file tree

Hide file tree

Showing 11 changed files with 88 additions and 46 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -37,6 +37,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - We improved the message explaining the options when modifying an automatic keyword group [#8911](https://github.com/JabRef/jabref/issues/8911)
 - We moved the preferences option "Warn about duplicates on import" option from the tab "File" to the tab "Import and Export". [koppor#570](https://github.com/koppor/jabref/issues/570)
 - When JabRef encounters `% Encoding: UTF-8` header, it is kept during writing (and not removed). [#8964](https://github.com/JabRef/jabref/pull/8964)
+- We replace characters that cannot be decoded using the specified encoding with a valid (but probably different) character. This happens if JabRef detects the wrong charset (e.g., UTF-8 instead of Windows-1252). One can use the [Integrity Check](https://docs.jabref.org/finding-sorting-and-cleaning-entries/checkintegrity) to find those characters.
 
 ### Fixed
 

diff --git a/build.gradle b/build.gradle
@@ -100,7 +100,7 @@ dependencyLocking {
 }
 
 javafx {
-    version = "18"
+    version = "18.0.1"
     modules = [ 'javafx.controls', 'javafx.fxml', 'javafx.web', 'javafx.swing' ]
 }
 
@@ -127,7 +127,6 @@ dependencies {
     implementation 'com.h2database:h2-mvstore:2.1.214'
 
     implementation group: 'org.apache.tika', name: 'tika-core', version: '2.4.1'
-    implementation 'com.ibm.icu:icu4j-charset:71.1'
 
     // required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635
     implementation 'org.bouncycastle:bcprov-jdk18on:1.71'

diff --git a/external-libraries.md b/external-libraries.md
@@ -398,6 +398,9 @@ Id:      com.ibm.icu:*
 Project: International Components for Unicode
 URL:     https://icu.unicode.org/
 License: Unicode License (https://www.unicode.org/copyright.html)
+Note:    Our own fork https://github.com/JabRef/icu. Upstream PR: https://github.com/unicode-org/icu/pull/2127
+Path:    lib/icu4j.jar
+SourcePath: lib/icu4j-src.jar
 ```
 
 ```yaml
@@ -519,25 +522,24 @@ License: Apache-2.0
 3. (on WSL) `sed 's/[^a-z]*//' < build/dependencies.txt | sed "s/\(.*\) .*/\1/" | grep -v "\->" | sort | uniq > build/dependencies-for-external-libraries.txt`
 
 ```text
-com.fasterxml.jackson.core:jackson-annotations:2.13.2
-com.fasterxml.jackson.core:jackson-core:2.13.2
-com.fasterxml.jackson.core:jackson-databind:2.13.2
-com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.2
-com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.2
-com.fasterxml.jackson:jackson-bom:2.13.2
-com.github.tomtung:latex2unicode_2.12:0.2.6
+com.fasterxml.jackson.core:jackson-annotations:2.13.3
+com.fasterxml.jackson.core:jackson-core:2.13.3
+com.fasterxml.jackson.core:jackson-databind:2.13.3
+com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.3
+com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.3
+com.fasterxml.jackson:jackson-bom:2.13.3
+com.github.sialcasa.mvvmFX:mvvmfx-validation:f195849ca9
+com.github.tomtung:latex2unicode_2.13:0.3.2
 com.google.code.gson:gson:2.9.0
 com.google.errorprone:error_prone_annotations:2.11.0
 com.google.guava:failureaccess:1.0.1
 com.google.guava:guava:31.1-jre
 com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava
 com.google.j2objc:j2objc-annotations:1.3
 com.googlecode.javaewah:JavaEWAH:1.1.13
-com.h2database:h2-mvstore:2.1.212
-com.ibm.icu:icu4j-charset:70.1
-com.ibm.icu:icu4j:70.1
+com.h2database:h2-mvstore:2.1.214
 com.jfoenix:jfoenix:9.0.10
-com.konghq:unirest-java:3.13.8
+com.konghq:unirest-java:3.13.10
 com.microsoft.azure:applicationinsights-core:2.4.1
 com.microsoft.azure:applicationinsights-logging-log4j2:2.4.1
 com.oracle.ojdbc:ojdbc10:19.3.0.0
@@ -565,10 +567,9 @@ com.vladsch.flexmark:flexmark-util-visitor:0.64.0
 com.vladsch.flexmark:flexmark-util:0.64.0
 com.vladsch.flexmark:flexmark:0.64.0
 commons-cli:commons-cli:1.5.0
-commons-codec:commons-codec:1.11
+commons-codec:commons-codec:1.15
 commons-io:commons-io:2.11.0
 commons-logging:commons-logging:1.2
-de.saxsys:mvvmfx-validation:1.9.0-SNAPSHOT
 de.saxsys:mvvmfx:1.8.0
 de.undercouch:citeproc-java:3.0.0-alpha.6
 eu.lestard:doc-annotations:0.2
@@ -581,7 +582,7 @@ net.java.dev.jna:jna-platform:5.6.0
 net.java.dev.jna:jna:5.6.0
 net.jcip:jcip-annotations:1.0
 net.jodah:typetools:0.6.1
-org.antlr:antlr-runtime:3.5.2
+org.antlr:antlr-runtime:3.5.3
 org.antlr:antlr4-runtime:4.9.3
 org.apache.commons:commons-csv:1.9.0
 org.apache.commons:commons-lang3:3.12.0
@@ -590,23 +591,22 @@ org.apache.httpcomponents:httpclient:4.5.13
 org.apache.httpcomponents:httpcore-nio:4.4.13
 org.apache.httpcomponents:httpcore:4.4.13
 org.apache.httpcomponents:httpmime:4.5.13
-org.apache.lucene:lucene-analysis-common:9.1.0
-org.apache.lucene:lucene-backward-codecs:9.1.0
-org.apache.lucene:lucene-core:9.1.0
-org.apache.lucene:lucene-highlighter:9.1.0
-org.apache.lucene:lucene-queries:9.1.0
-org.apache.lucene:lucene-queryparser:9.1.0
-org.apache.lucene:lucene-sandbox:9.1.0
+org.apache.lucene:lucene-analysis-common:9.2.0
+org.apache.lucene:lucene-core:9.2.0
+org.apache.lucene:lucene-highlighter:9.2.0
+org.apache.lucene:lucene-queries:9.2.0
+org.apache.lucene:lucene-queryparser:9.2.0
+org.apache.lucene:lucene-sandbox:9.2.0
 org.apache.pdfbox:fontbox:3.0.0-RC1
 org.apache.pdfbox:pdfbox:3.0.0-RC1
 org.apache.pdfbox:xmpbox:3.0.0-RC1
-org.apache.tika:tika-core:2.3.0
-org.bouncycastle:bcprov-jdk15on:1.70
+org.apache.tika:tika-core:2.4.1
+org.bouncycastle:bcprov-jdk18on:1.71
 org.checkerframework:checker-qual:3.12.0
 org.codehaus.mojo:animal-sniffer-annotations:1.18
 org.controlsfx:controlsfx:11.1.1
-org.eclipse.jgit:org.eclipse.jgit:6.1.0.202203080745-r
-org.fxmisc.flowless:flowless:0.6.9
+org.eclipse.jgit:org.eclipse.jgit:6.2.0.202206071550-r
+org.fxmisc.flowless:flowless:0.6.10
 org.fxmisc.richtext:richtextfx:0.10.9
 org.fxmisc.undo:undofx:2.1.1
 org.fxmisc.wellbehaved:wellbehavedfx:0.3.3
@@ -616,23 +616,23 @@ org.glassfish.jaxb:jaxb-runtime:3.0.2
 org.glassfish.jaxb:txw2:3.0.2
 org.jbibtex:jbibtex:1.0.19
 org.jetbrains:annotations:15.0
-org.jsoup:jsoup:1.14.3
+org.jsoup:jsoup:1.15.1
 org.kordamp.ikonli:ikonli-core:12.3.1
 org.kordamp.ikonli:ikonli-javafx:12.3.1
 org.kordamp.ikonli:ikonli-materialdesign2-pack:12.3.1
-org.libreoffice:libreoffice:7.3.2
-org.libreoffice:unoloader:7.3.2
-org.mariadb.jdbc:mariadb-java-client:2.7.5
-org.openjfx:javafx-base:18
-org.openjfx:javafx-controls:18
-org.openjfx:javafx-fxml:18
-org.openjfx:javafx-graphics:18
-org.openjfx:javafx-media:18
-org.openjfx:javafx-swing:18
-org.openjfx:javafx-web:18
-org.postgresql:postgresql:42.3.3
+org.libreoffice:libreoffice:7.3.4
+org.libreoffice:unoloader:7.3.4
+org.mariadb.jdbc:mariadb-java-client:2.7.6
+org.openjfx:javafx-base:18.0.1
+org.openjfx:javafx-controls:18.0.1
+org.openjfx:javafx-fxml:18.0.1
+org.openjfx:javafx-graphics:18.0.1
+org.openjfx:javafx-media:18.0.1
+org.openjfx:javafx-swing:18.0.1
+org.openjfx:javafx-web:18.0.1
+org.postgresql:postgresql:42.4.0
 org.reactfx:reactfx:2.0-M5
-org.scala-lang:scala-library:2.12.8
+org.scala-lang:scala-library:2.13.8
 org.slf4j:slf4j-api:2.0.0-alpha7
 org.tinylog:slf4j-tinylog:2.4.1
 org.tinylog:tinylog-api:2.4.1

diff --git a/lib/icu4j-src.jar b/lib/icu4j-src.jar
diff --git a/lib/icu4j.jar b/lib/icu4j.jar
diff --git a/src/main/java/org/jabref/logic/importer/Importer.java b/src/main/java/org/jabref/logic/importer/Importer.java
@@ -123,12 +123,10 @@ protected static Charset getCharset(BufferedInputStream bufferedInputStream) {
             if ((matches == null) || (matches.length == 0)) {
                 return defaultCharSet;
             }
-            if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(defaultCharSet.toString()))) {
+
+            if (Arrays.stream(matches).anyMatch(charset -> "ASCII".equals(charset.getName()))) {
                 return defaultCharSet;
             }
-            if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(StandardCharsets.UTF_16.toString()))) {
-                return StandardCharsets.UTF_16;
-            }
 
             if (matches[0] != null) {
                 return Charset.forName(matches[0].getName());
@@ -166,7 +164,7 @@ public static BufferedReader getReader(Path filePath) throws IOException {
         return new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
     }
 
-    public static BufferedReader getReader(InputStream stream) throws IOException {
+    public static BufferedReader getReader(InputStream stream) {
         BufferedInputStream bufferedInputStream = new BufferedInputStream(stream);
         Charset charset = getCharset(bufferedInputStream);
         InputStreamReader reader = new InputStreamReader(bufferedInputStream, charset);

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibtexImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibtexImporter.java
@@ -4,7 +4,10 @@
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Objects;
@@ -79,7 +82,14 @@ public ParserResult importDatabase(Path filePath) throws IOException {
             LOGGER.debug("Encoding used to read the file: {}", encoding);
         }
 
-        try (BufferedReader reader = Files.newBufferedReader(filePath, encoding)) {
+        // We replace unreadable characters.
+        // Unfortunately, no warning is issued to the user.
+        // As this case is very rare, we accept that.
+        CharsetDecoder decoder = encoding.newDecoder();
+        decoder.onMalformedInput(CodingErrorAction.REPLACE);
+
+        try (InputStreamReader inputStreamReader = new InputStreamReader(Files.newInputStream(filePath), decoder);
+             BufferedReader reader = new BufferedReader(inputStreamReader)) {
             ParserResult parserResult = this.importDatabase(reader);
             parserResult.getMetaData().setEncoding(encoding);
             parserResult.getMetaData().setEncodingExplicitlySupplied(encodingExplicitlySupplied);

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibtexImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibtexImporterTest.java
@@ -12,10 +12,12 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.ParserResult;
 import org.jabref.logic.util.StandardFileType;
+import org.jabref.model.database.BibDatabaseMode;
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.field.UnknownField;
 import org.jabref.model.entry.types.StandardEntryType;
+import org.jabref.model.metadata.MetaData;
 import org.jabref.model.util.DummyFileUpdateMonitor;
 
 import org.junit.jupiter.api.BeforeEach;
@@ -170,13 +172,40 @@ public void testParsingOfUtf8EncodedFileReadsUmlautCharacterCorrectly(String fil
                 parserResult.getDatabase().getEntries());
     }
 
+    @ParameterizedTest
+    @CsvSource({"encoding-utf-16BE-with-header.bib", "encoding-utf-16BE-without-header.bib"})
+    public void testParsingOfUtf16EncodedFileReadsUmlautCharacterCorrectly(String filename) throws Exception {
+        ParserResult parserResult = importer.importDatabase(
+                Path.of(BibtexImporterTest.class.getResource(filename).toURI()));
+
+        assertEquals(
+                List.of(new BibEntry(StandardEntryType.Article).withField(StandardField.TITLE, "Ü ist ein Umlaut")),
+                parserResult.getDatabase().getEntries());
+
+        MetaData metaData = new MetaData();
+        metaData.setMode(BibDatabaseMode.BIBTEX);
+        metaData.setEncoding(StandardCharsets.UTF_16BE);
+        assertEquals(metaData, parserResult.getMetaData());
+    }
+
     @Test
     public void encodingSupplied() throws Exception {
         ParserResult parserResult = importer.importDatabase(
                 Path.of(BibtexImporterTest.class.getResource("encoding-utf-8-with-header.bib").toURI()));
         assertTrue(parserResult.getMetaData().getEncodingExplicitlySupplied());
     }
 
+    @Test
+    public void wrongEncodingSupplied() throws Exception {
+        ParserResult parserResult = importer.importDatabase(
+                Path.of(BibtexImporterTest.class.getResource("encoding-windows-1252-but-utf-8-declared--decoding-fails.bib").toURI()));
+
+        // The test file contains "Test{NBSP}I. Last" where the character "{NBSP}" is encoded using Windows-1252 instead of UTF-8
+        assertEquals(
+                List.of(new BibEntry(StandardEntryType.Article).withField(StandardField.AUTHOR, "Test�I. Last")),
+                parserResult.getDatabase().getEntries());
+    }
+
     @Test
     public void encodingNotSupplied() throws Exception {
         ParserResult parserResult = importer.importDatabase(

diff --git a/src/test/resources/org/jabref/logic/importer/fileformat/encoding-utf-16BE-with-header.bib b/src/test/resources/org/jabref/logic/importer/fileformat/encoding-utf-16BE-with-header.bib
diff --git a/src/test/resources/org/jabref/logic/importer/fileformat/encoding-utf-16BE-without-header.bib b/src/test/resources/org/jabref/logic/importer/fileformat/encoding-utf-16BE-without-header.bib
diff --git a/...ef/logic/importer/fileformat/encoding-windows-1252-but-utf-8-declared--decoding-fails.bib b/...ef/logic/importer/fileformat/encoding-windows-1252-but-utf-8-declared--decoding-fails.bib
@@ -0,0 +1,5 @@
+% Encoding: UTF-8
+
+@Article{,
+  author  = {Test I. Last},
+}