From fec8db0431a930a867093200a95e9e71f1977f86 Mon Sep 17 00:00:00 2001 From: tallison Date: Wed, 11 Mar 2020 11:38:46 -0400 Subject: [PATCH] TIKA-2714 -- add detection for rar4 and rar5 files; throw an UnsupportedFormatException for rar5 files --- .../org/apache/tika/mime/tika-mimetypes.xml | 15 ++++++++++++++- .../org/apache/tika/parser/pkg/RarParser.java | 6 ++++++ .../org/apache/tika/parser/pkg/RarParserTest.java | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 142673e4c5..06005f1bfd 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -4035,7 +4035,20 @@ - + + <_comment>RAR archive + + + + + + + <_comment>RAR archive + + + + + diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java index 633b2ccddf..4cdcedd898 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java @@ -27,11 +27,13 @@ import com.github.junrar.rarfile.FileHeader; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.UnsupportedFormatException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; @@ -62,7 +64,11 @@ public void parse(InputStream stream, ContentHandler handler, xhtml.startDocument(); EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + String mediaType = metadata.get(Metadata.CONTENT_TYPE); + if (mediaType != null && mediaType.contains("version=5")) { + throw new UnsupportedFormatException("Tika does not yet support rar version 5."); + } Archive rar = null; try (TemporaryResources tmp = new TemporaryResources()) { TikaInputStream tis = TikaInputStream.get(stream, tmp); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java index 34dcaab9cc..d6f5af1fd5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java @@ -48,7 +48,7 @@ public void testRarParsing() throws Exception { AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext); } - assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("application/x-rar-compressed; version=4", metadata.get(Metadata.CONTENT_TYPE)); String content = handler.toString(); assertContains("test-documents/testEXCEL.xls", content); assertContains("Sample Excel Worksheet", content);