From be558fe0a45147202f129a4be622046a9ba9324d Mon Sep 17 00:00:00 2001 From: Tobias Diez Date: Mon, 26 Aug 2019 21:09:00 +0200 Subject: [PATCH] Revert "Switch from tika-parsers to tika-core (#5217)" This reverts commit 29cf4f20 --- build.gradle | 2 +- ...fully-support-utf8-only-for-latex-files.md | 44 ------------------- docs/adr/index.md | 1 - .../logic/texparser/DefaultTexParser.java | 7 +-- .../logic/texparser/DefaultTexParserTest.java | 21 +++------ 5 files changed, 9 insertions(+), 66 deletions(-) delete mode 100644 docs/adr/0005-fully-support-utf8-only-for-latex-files.md diff --git a/build.gradle b/build.gradle index 71db4021033..fe5488944f0 100644 --- a/build.gradle +++ b/build.gradle @@ -122,7 +122,7 @@ dependencies { compile 'org.apache.pdfbox:fontbox:2.0.16' compile 'org.apache.pdfbox:xmpbox:2.0.16' - compile group: 'org.apache.tika', name: 'tika-core', version: '1.22' + compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.22' // required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635 compile 'org.bouncycastle:bcprov-jdk15on:1.62' diff --git a/docs/adr/0005-fully-support-utf8-only-for-latex-files.md b/docs/adr/0005-fully-support-utf8-only-for-latex-files.md deleted file mode 100644 index 6e066e8b7d8..00000000000 --- a/docs/adr/0005-fully-support-utf8-only-for-latex-files.md +++ /dev/null @@ -1,44 +0,0 @@ -# Fully Support UTF-8 Only For LaTeX Files - -## Context and Problem Statement - -The feature [search for citations](https://github.com/JabRef/help.jabref.org/issues/210) displays the content of LaTeX files. -The LaTeX files are text files and might be encoded arbitrarily. - -## Considered Options - -* Support UTF-8 encoding only -* Support ASCII encoding only -* Support (nearly) all encodings - -## Decision Outcome - -Chosen option: "Support UTF-8 encoding only", because comes out best (see below). - -### Positive Consequences - -* All content of LaTeX files are displayed in JabRef - -### Negative Consequences - -* When a LaTeX files is encoded in another encoding, the user might see strange characters in JabRef - -## Pros and Cons of the Options - -### Support UTF-8 encoding only - -* Good, because covers most tex file encodings -* Good, because easy to implement -* Bad, because does not support encodings used before around 2010 - -### Support ASCII encoding only - -* Good, because easy to implement -* Bad, because does not support any encoding at all - -### Support (nearly) all encodings - -* Good, because easy to implement -* Bad, because it relies on Apache Tika's `CharsetDetector`, which resides in `tika-parsers`. - This causes issues during compilation (see https://github.com/JabRef/jabref/pull/3421#issuecomment-524532832). - Example: `error: module java.xml.bind reads package javax.activation from both java.activation and jakarta.activation`. diff --git a/docs/adr/index.md b/docs/adr/index.md index de1818aea3c..7bbc5b8c4fc 100644 --- a/docs/adr/index.md +++ b/docs/adr/index.md @@ -9,7 +9,6 @@ This log lists the architectural decisions for JabRef. - [ADR-0002](0002-use-slf4j-for-logging.md) - Use slf4j together with log4j2 for logging - [ADR-0003](0003-use-gradle-as-build-tool.md) - Use Gradle as build tool - [ADR-0004](0004-use-mariadb-connector.md) - Use MariaDB Connector -- [ADR-0005](0005-fully-support-utf8-only-for-latex-files.md) - Fully Support UTF-8 Only For LaTeX Files diff --git a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java index 64cd449b11f..c3661fc0d56 100644 --- a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java +++ b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java @@ -1,13 +1,10 @@ package org.jabref.logic.texparser; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Reader; import java.io.UncheckedIOException; import java.nio.channels.ClosedChannelException; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -20,6 +17,7 @@ import org.jabref.model.texparser.TexParser; import org.jabref.model.texparser.TexParserResult; +import org.apache.tika.parser.txt.CharsetDetector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -84,8 +82,7 @@ public TexParserResult parse(List texFiles) { } try ( - InputStream inputStream = Files.newInputStream(file); - Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8); + Reader reader = new CharsetDetector().setText(Files.readAllBytes(file)).detect().getReader(); LineNumberReader lineNumberReader = new LineNumberReader(reader)) { for (String line = lineNumberReader.readLine(); line != null; line = lineNumberReader.readLine()) { // Skip comments and blank lines. diff --git a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java index 06b481efaab..b0a90c8166a 100644 --- a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java +++ b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java @@ -12,7 +12,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class DefaultTexParserTest { - private final static String DARWIN = "Darwin1888"; private final static String EINSTEIN = "Einstein1920"; private final static String NEWTON = "Newton1999"; @@ -89,9 +88,7 @@ public void testFileEncodingIso88591() throws URISyntaxException { TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().add(texFile); - // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005 - expectedParserResult - .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); + expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); } @@ -104,9 +101,7 @@ public void testFileEncodingIso885915() throws URISyntaxException { TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().add(texFile); - // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005 - expectedParserResult - .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); + expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); } @@ -117,17 +112,13 @@ public void testFileEncodingForThreeFiles() throws URISyntaxException { Path texFile2 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-1.tex").toURI()); Path texFile3 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-15.tex").toURI()); - TexParserResult parserResult = new DefaultTexParser() - .parse(Arrays.asList(texFile, texFile2, texFile3)); + TexParserResult parserResult = new DefaultTexParser().parse(Arrays.asList(texFile, texFile2, texFile3)); TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().addAll(Arrays.asList(texFile, texFile2, texFile3)); - expectedParserResult - .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); - expectedParserResult - .addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); - expectedParserResult - .addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); + expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + expectedParserResult.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + expectedParserResult.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); }