Skip to content

Commit

Permalink
TIKA-4354 -- make incremental update metadata and parsing default in tika-cli (#2059)
Browse files Browse the repository at this point in the history

* TIKA-4354 -- make incremental update metadata and parsing default in tika-cli

(cherry picked from commit ff9d722)
  • Loading branch information
tballison committed Nov 19, 2024
1 parent f91b2ca commit a308f1d
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 8 deletions.
13 changes: 5 additions & 8 deletions tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ public void process(InputStream stream, OutputStream output, Metadata metadata)
*/
private String password = System.getenv("TIKA_PASSWORD");
private DigestingParser.Digester digester = null;
private boolean asyncMode = false;
private boolean pipeMode = true;
private boolean fork = false;
private boolean prettyPrint;
Expand Down Expand Up @@ -344,12 +343,12 @@ private void configurePDFExtractSettings() {
if (configFilePath == null && context.get(PDFParserConfig.class) == null) {
PDFParserConfig pdfParserConfig = new PDFParserConfig();
pdfParserConfig.setExtractInlineImages(true);
pdfParserConfig.setExtractIncrementalUpdateInfo(true);
pdfParserConfig.setParseIncrementalUpdates(true);
String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
"inline images and incremental updates for the PDFParser (TIKA-2374 and " +
"TIKA-4017).\n" +
"Aside from the -z option, this is not the default behavior\n" +
"in Tika generally or in tika-server.";
"inline images and incremental updates for the PDFParser (TIKA-2374, " +
"TIKA-4017 and TIKA-4354).\n" +
"This is not the default behavior in Tika generally or in tika-server.";
LOG.info(warn);
context.set(PDFParserConfig.class, pdfParserConfig);
}
Expand Down Expand Up @@ -409,8 +408,6 @@ public void process(String arg) throws Exception {
// ignore, as container-aware detectors are now always used
} else if (arg.equals("-f") || arg.equals("--fork")) {
fork = true;
} else if (arg.equals("-a") || arg.equals("--async")) {
asyncMode = true;
} else if (arg.startsWith("--config=")) {
configFilePath = arg.substring("--config=".length());
} else if (arg.startsWith("--digest=")) {
Expand Down Expand Up @@ -454,7 +451,6 @@ public void process(String arg) throws Exception {
}
extractDir = new File(dirPath);
} else if (arg.equals("-z") || arg.equals("--extract")) {
configurePDFExtractSettings();
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
} else if (arg.equals("-r") || arg.equals("--pretty-print")) {
Expand Down Expand Up @@ -485,6 +481,7 @@ public void process(String arg) throws Exception {
} else {
url = new URL(arg);
}
configurePDFExtractSettings();
if (recursiveJSON) {
handleRecursiveJson(url, System.out);
} else {
Expand Down
8 changes: 8 additions & 0 deletions tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,14 @@ public void testJsonMetadataPrettyPrintOutput() throws Exception {
assertTrue(fb > -1 && title > -1 && fb > title);
}

/**
 * Runs the CLI with only the {@code -J} option against
 * {@code testPDF_incrementalUpdates.pdf} (no explicit PDF parser configuration is
 * passed) and checks that the output reports an incremental update count of 2 and
 * contains an embedded resource of type {@code VERSION}.
 *
 * @throws Exception if running the CLI or capturing its output fails
 */
@Test
public void testDefaultPDFIncrementalUpdateSettings() throws Exception {
    String pdfPath = resourcePrefix + "testPDF_incrementalUpdates.pdf";
    String cliOutput = getParamOutContent("-J", pdfPath);

    // The expected fragments omit the leading quote of the key, so they match
    // whether or not the key is emitted with a prefix.
    boolean reportsTwoUpdates = cliOutput.contains("pdf:incrementalUpdateCount\":\"2\"");
    assertTrue(reportsTwoUpdates);

    boolean hasVersionResource = cliOutput.contains("embeddedResourceType\":\"VERSION\"");
    assertTrue(hasVersionResource);
}

/**
* Tests -l option of the cli
*
Expand Down
Binary file not shown.

0 comments on commit a308f1d

Please sign in to comment.