Skip to content

Commit

Permalink
TIKA-2714 -- add detection for rar4 and rar5 files; throw an UnsupportedFormatException for rar5 files
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Mar 11, 2020
1 parent 8c33080 commit fec8db0
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4035,7 +4035,20 @@
</magic>
<glob pattern="*.rar"/>
</mime-type>

<!--
  RAR archive, version 4 variant of application/x-rar-compressed.
  Detected by a 7-byte signature at offset 0: the ASCII text "Rar!"
  (0x52 0x61 0x72 0x21) followed by 0x1A 0x07 0x00. The seventh byte (0x00)
  is what distinguishes this entry from the version=5 entry, whose
  signature continues with 0x01 0x00 instead.
  No glob is declared here; the only detection rule in this entry is the
  magic match. The type is declared as a sub-class of the generic
  application/x-rar-compressed type.
-->
<mime-type type="application/x-rar-compressed;version=4">
<_comment>RAR archive</_comment>
<magic priority="60">
<match value="\x52\x61\x72\x21\x1a\x07\x00" type="string" offset="0"/>
</magic>
<sub-class-of type="application/x-rar-compressed"/>
</mime-type>
<!--
  RAR archive, version 5 variant of application/x-rar-compressed.
  Detected by an 8-byte signature at offset 0: the ASCII text "Rar!"
  (0x52 0x61 0x72 0x21) followed by 0x1A 0x07 0x01 0x00. The first six bytes
  are shared with the version=4 entry; the trailing 0x01 0x00 is what
  selects this entry.
  No glob is declared here; the only detection rule in this entry is the
  magic match. The type is declared as a sub-class of the generic
  application/x-rar-compressed type.
-->
<mime-type type="application/x-rar-compressed;version=5">
<_comment>RAR archive</_comment>
<magic priority="60">
<match value="\x52\x61\x72\x21\x1a\x07\x01\x00" type="string" offset="0"/>
</magic>
<sub-class-of type="application/x-rar-compressed"/>
</mime-type>
<mime-type type="application/x-roxio-toast">
<glob pattern="*.toast"/>
<sub-class-of type="application/x-iso9660-image"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@
import com.github.junrar.rarfile.FileHeader;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
Expand Down Expand Up @@ -62,7 +64,11 @@ public void parse(InputStream stream, ContentHandler handler,
xhtml.startDocument();

EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
String mediaType = metadata.get(Metadata.CONTENT_TYPE);

if (mediaType != null && mediaType.contains("version=5")) {
throw new UnsupportedFormatException("Tika does not yet support rar version 5.");
}
Archive rar = null;
try (TemporaryResources tmp = new TemporaryResources()) {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public void testRarParsing() throws Exception {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
}

assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("application/x-rar-compressed; version=4", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("Sample Excel Worksheet", content);
Expand Down

0 comments on commit fec8db0

Please sign in to comment.