Skip to content

Commit

Permalink
Switch from tika-parsers to tika-core
Browse files Browse the repository at this point in the history
  • Loading branch information
koppor committed Aug 24, 2019
1 parent dd0f304 commit c557468
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 9 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ dependencies {
compile 'org.apache.pdfbox:fontbox:2.0.16'
compile 'org.apache.pdfbox:xmpbox:2.0.16'

compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.22'
compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'

// required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635
compile 'org.bouncycastle:bcprov-jdk15on:1.62'
Expand Down
44 changes: 44 additions & 0 deletions docs/adr/0005-fully-support-utf8-only-for-latex-files.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Fully Support UTF-8 Only For LaTeX Files

## Context and Problem Statement

The feature [search for citations](https://github.com/JabRef/help.jabref.org/issues/210) displays the content of LaTeX files.
The LaTeX files are text files and might be encoded arbitrarily.

## Considered Options

* Support UTF-8 encoding only
* Support ASCII encoding only
* Support (nearly) all encodings

## Decision Outcome

Chosen option: "Support UTF-8 encoding only", because it comes out best (see below).

### Positive Consequences

* All content of LaTeX files is displayed in JabRef

### Negative Consequences

* When a LaTeX file is encoded in another encoding, the user might see strange characters in JabRef

## Pros and Cons of the Options

### Support UTF-8 encoding only

* Good, because it covers most TeX file encodings
* Good, because easy to implement
* Bad, because it does not support the encodings commonly used before around 2010

### Support ASCII encoding only

* Good, because easy to implement
* Bad, because it does not support any encoding other than ASCII

### Support (nearly) all encodings

* Good, because easy to implement
* Bad, because it relies on Apache Tika's `CharsetDetector`, which resides in `tika-parsers`.
This causes issues during compilation (see https://github.com/JabRef/jabref/pull/3421#issuecomment-524532832).
Example: `error: module java.xml.bind reads package javax.activation from both java.activation and jakarta.activation`.
1 change: 1 addition & 0 deletions docs/adr/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ This log lists the architectural decisions for JabRef.
- [ADR-0002](0002-use-slf4j-for-logging.md) - Use slf4j together with log4j2 for logging
- [ADR-0003](0003-use-gradle-as-build-tool.md) - Use Gradle as build tool
- [ADR-0004](0004-use-mariadb-connector.md) - Use MariaDB Connector
- [ADR-0005](0005-fully-support-utf8-only-for-latex-files.md) - Fully Support UTF-8 Only For LaTeX Files

<!-- adrlogstop -->

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package org.jabref.logic.texparser;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.channels.ClosedChannelException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand All @@ -18,7 +21,6 @@
import org.jabref.model.texparser.TexParser;
import org.jabref.model.texparser.TexParserResult;

import org.apache.tika.parser.txt.CharsetDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -78,7 +80,8 @@ public TexParserResult parse(List<Path> texFiles) {
}

try (
Reader reader = new CharsetDetector().setText(Files.readAllBytes(file)).detect().getReader();
InputStream inputStream = Files.newInputStream(file);
Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
for (String line = lineNumberReader.readLine(); line != null; line = lineNumberReader.readLine()) {
// Skip comments and blank lines.
Expand Down
21 changes: 15 additions & 6 deletions src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import static org.junit.jupiter.api.Assertions.assertEquals;

public class DefaultTexParserTest {

private final static String DARWIN = "Darwin1888";
private final static String EINSTEIN = "Einstein1920";
private final static String NEWTON = "Newton1999";
Expand Down Expand Up @@ -90,7 +91,9 @@ public void testFileEncodingIso88591() throws URISyntaxException {
TexParserResult expectedParserResult = new TexParserResult();

expectedParserResult.getFileList().add(texFile);
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
// The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
expectedParserResult
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");

assertEquals(expectedParserResult, parserResult);
}
Expand All @@ -103,7 +106,9 @@ public void testFileEncodingIso885915() throws URISyntaxException {
TexParserResult expectedParserResult = new TexParserResult();

expectedParserResult.getFileList().add(texFile);
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
// The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
expectedParserResult
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");

assertEquals(expectedParserResult, parserResult);
}
Expand All @@ -114,13 +119,17 @@ public void testFileEncodingForThreeFiles() throws URISyntaxException {
Path texFile2 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-1.tex").toURI());
Path texFile3 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-15.tex").toURI());

TexParserResult parserResult = new DefaultTexParser().parse(Arrays.asList(texFile, texFile2, texFile3));
TexParserResult parserResult = new DefaultTexParser()
.parse(Arrays.asList(texFile, texFile2, texFile3));
TexParserResult expectedParserResult = new TexParserResult();

expectedParserResult.getFileList().addAll(Arrays.asList(texFile, texFile2, texFile3));
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
expectedParserResult.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
expectedParserResult.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
expectedParserResult
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
expectedParserResult
.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
expectedParserResult
.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");

assertEquals(expectedParserResult, parserResult);
}
Expand Down

0 comments on commit c557468

Please sign in to comment.