Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX - PDF minor version number checking. #317

Merged
merged 10 commits into from
Mar 28, 2018
7 changes: 7 additions & 0 deletions jhove-bbt/scripts/create-1.19-target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,10 @@ fi

# Simply copy baseline for now we're not making any changes
cp -R "${baselineRoot}" "${targetRoot}"

# Add the Release Canidate PDF-HUL details
find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ <module release="1.10">PDF-hul<\/module>$/ <module release="1.11-RC">PDF-hul<\/module>/' {} \;
find "${targetRoot}" -type f -name "audit-PDF-hul.jhove.xml" -exec sed -i 's/^ <release>1.10<\/release>$/ <release>1.11-RC<\/release>/' {} \;
find "${targetRoot}" -type f -name "audit-PDF-hul.jhove.xml" -exec sed -i 's/^ <date>2017-10-31<\/date>$/ <date>2018-03-16<\/date>/' {} \;
find "${targetRoot}" -type f -name "*.pdf.jhove.xml" -exec sed -i 's%<reportingModule release="1.10" date="2017-10-31">PDF%<reportingModule release="1.11-RC" date="2018-03-16">PDF%' {} \;
find "${targetRoot}" -type f -name "README.jhove.xml" -exec sed -i 's%<reportingModule release="1.10" date="2017-10-31">PDF%<reportingModule release="1.11-RC" date="2018-03-16">PDF%' {} \;
6 changes: 5 additions & 1 deletion jhove-bbt/scripts/travis-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,9 @@ echo " - applying the baseline patches for ${MAJOR_MINOR_VER} at: ${TARGET_ROOT}
bash "${SCRIPT_DIR}/create-${MAJOR_MINOR_VER}-target.sh" -b "${BASELINE_VERSION}" -c "${MAJOR_MINOR_VER}"

echo "Black box testing ${MAJOR_MINOR_VER}."
echo " - using development JHOVE installer: ${TEST_ROOT}/targets/${MAJOR_MINOR_VER}."
bash "${SCRIPT_DIR}/bbt-jhove.sh" -b "${TEST_ROOT}/targets/${MAJOR_MINOR_VER}" -c "${TEST_ROOT}/corpora" -j . -o "${TEST_ROOT}/candidates" -k "dev-${MAJOR_MINOR_VER}" -i
exit $?
echo " - test comparison key is: dev-${MAJOR_MINOR_VER}."
exitStatus=$?
echo " - BB Testing output is: ${exitStatus}"
exit $exitStatus
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
import edu.harvard.hul.ois.jhove.module.pdf.PdfArray;
import edu.harvard.hul.ois.jhove.module.pdf.PdfDictionary;
import edu.harvard.hul.ois.jhove.module.pdf.PdfException;
import edu.harvard.hul.ois.jhove.module.pdf.PdfHeader;
import edu.harvard.hul.ois.jhove.module.pdf.PdfIndirectObj;
import edu.harvard.hul.ois.jhove.module.pdf.PdfInvalidException;
import edu.harvard.hul.ois.jhove.module.pdf.PdfMalformedException;
Expand Down Expand Up @@ -113,9 +114,6 @@ public class PdfModule
public static final String MIME_TYPE = "application/pdf";
public static final String EXT = ".pdf";

private static final String PDF_VER1_HEADER_PREFIX = "PDF-1.";
private static final String PDF_SIG_HEADER = "%" + PDF_VER1_HEADER_PREFIX;
private static final String POSTSCRIPT_HEADER_PREFIX = "!PS-Adobe-";
private static final String ENCODING_PREFIX = "ENC=";
// private static final String NO_HEADER = "No PDF header";

Expand Down Expand Up @@ -362,8 +360,8 @@ public class PdfModule
******************************************************************/

private static final String NAME = "PDF-hul";
private static final String RELEASE = "1.10";
private static final int [] DATE = {2017, 10, 31};
private static final String RELEASE = "1.11-RC";
private static final int [] DATE = {2018, 03, 16};
private static final String [] FORMAT = {
"PDF", "Portable Document Format"
};
Expand Down Expand Up @@ -638,7 +636,7 @@ public PdfModule()
_signature.add(new ExternalSignature(EXT,
SignatureType.EXTENSION,
SignatureUseType.OPTIONAL));
_signature.add(new InternalSignature(PDF_SIG_HEADER,
_signature.add(new InternalSignature(PdfHeader.PDF_SIG_HEADER,
SignatureType.MAGIC,
SignatureUseType.MANDATORY,
0));
Expand Down Expand Up @@ -1065,82 +1063,19 @@ protected final void initParse()

protected boolean parseHeader(RepInfo info) throws IOException
{
Token token = null;
String value = null;

/* Parse file header. */

boolean foundSig = false;
for (;;) {
if (_parser.getOffset() > 1024) {
break;
}
try {
token = null;
token = _parser.getNext(1024L);
}
catch (IOException ee) {
break;
}
catch (Exception e) {} // fall through
if (token == null) {
break;
}
if (token instanceof Comment) {
value = ((Comment) token).getValue();
if (value.indexOf(PDF_VER1_HEADER_PREFIX) == 0) {
foundSig = true;
_version = value.substring(4, 7);
/* If we got this far, take note that the signature is OK. */
info.setSigMatch(_name);
break;
}
// The implementation notes (though not the spec)
// allow an alternative signature of %!PS-Adobe-N.n PDF-M.m
if (value.indexOf(POSTSCRIPT_HEADER_PREFIX) == 0) {
// But be careful: that much by itself is the standard
// PostScript signature.
int n = value.indexOf(PDF_VER1_HEADER_PREFIX);
if (n >= 11) {
foundSig = true;
_version = value.substring(n + 4);
// However, this is not PDF-A compliant.
_pdfACompliant = false;
info.setSigMatch(_name);
break;
}
}
}

// If we don't find it right at the beginning, we aren't
// PDF/A compliant.
_pdfACompliant = false;
}
if (!foundSig) {
PdfHeader header = PdfHeader.parseHeader(_parser);
if (header == null) {
info.setWellFormed(false);
info.setMessage(new ErrorMessage(MessageConstants.ERR_PDF_HEADER_MISSING, 0L));
info.setMessage(new ErrorMessage(MessageConstants.ERR_PDF_HEADER_MISSING, 0L));
return false;
}
// Check for PDF/A conformance. The next item must be
// a comment with four characters, each greater than 127
try {
token = _parser.getNext();
String cmt = ((Comment) token).getValue();
char[] cmtArray = cmt.toCharArray();
int ctlcnt = 0;
for (int i = 0; i < 4; i++) {
if (cmtArray[i] > 127) {
ctlcnt++;
}
}
if (ctlcnt < 4) {
_pdfACompliant = false;
}
}
catch (Exception e) {
// Most likely a ClassCastException on a non-comment
_pdfACompliant = false;
if (!header.isVersionValid()) {
info.setValid(false);
info.setMessage(new ErrorMessage(MessageConstants.ERR_PDF_MINOR_INVALID, 0L));
}
_version = header.getVersionString();
_pdfACompliant = header.isPdfACompliant();
info.setSigMatch(_name);
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ public enum MessageConstants {
public static final String ERR_PAGE_TREE_NODE_INVALID = "Invalid page tree node";
public static final String ERR_PAGE_TREE_TRIMBOX_MALFORMED = "Malformed TrimBox in page tree";
public static final String ERR_PDF_HEADER_MISSING = "No PDF header";
public static final String ERR_PDF_MINOR_INVALID = "PDF minor version number is greater than 7.";
public static final String ERR_PDF_TRAILER_MISSING = "No PDF trailer";
public static final String ERR_PREV_OFFSET_TRAILER_DICT_INVALID = "Invalid Prev offset in trailer dictionary";
public static final String ERR_RESOURCES_ENTRY_INVALID = "Invalid Resources Entry in document";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package edu.harvard.hul.ois.jhove.module.pdf;

import java.io.IOException;

/**
* Simple class that is the a prototype of a proper header parser class. The aim
* was to introduce a simple version check for the PDF/A minor version number,
* see {@link PdfHeader#isVersionValid()}, while not changing anything else
* through over ambition.
*
* @author <a href="mailto:carl@openpreservation.org">Carl Wilson</a>
* <a href="https://github.com/carlwilson">carlwilson AT github</a>
* @version 0.1 Created 8 Mar 2018:00:46:39
*/

public final class PdfHeader {
public static final String PDF_VER1_HEADER_PREFIX = "PDF-1."; //$NON-NLS-1$
public static final String PDF_SIG_HEADER = "%" + PDF_VER1_HEADER_PREFIX; //$NON-NLS-1$
public static final String POSTSCRIPT_HEADER_PREFIX = "!PS-Adobe-"; //$NON-NLS-1$
public static final int MAX_VALID_MAJOR_VERSION = 7;

private final String versionString;
private final boolean isPdfACompilant;

/**
*
*/
private PdfHeader(final String versionString,
final boolean isPdfaCompliant) {
this.versionString = versionString;
this.isPdfACompilant = isPdfaCompliant;
}

/**
* @return the version string parsed from the PDF Header
*/
public String getVersionString() {
return this.versionString;
}

/**
* @return true if the header is considered PDF/A compliant, otherwise false
*/
public boolean isPdfACompliant() {
return this.isPdfACompilant;
}

/**
* Performs a very simple version number validity check. Given version
* number is a String of form 1.x, x is the minor version number. This
* method parses the minor version number from the version String and tests
* whether it is less than or equal to
* {@link PdfHeader#MAX_VALID_MAJOR_VERSION}.
*
* @return true if an integer minor version number can be parsed from the
* version string AND it is less than or equal to
* {@link PdfHeader#MAX_VALID_MAJOR_VERSION}. Otherwise false.
*/
public boolean isVersionValid() {
// Set minor version to one larger than maximum so invalid if parse
// fails
int minorVersion = MAX_VALID_MAJOR_VERSION + 1;
try {
minorVersion = getMinorVersion(this.versionString);
} catch (NumberFormatException nfe) {
// TODO : Do nothing for now, the version number is still invalid.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Nothing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The aim is to not break existing behaviour. This ensures that this case is handled as before as the minor version number is set too large and false is returned. I've left the TODO as I'm intending to polish this a little more, but it needs a new error messages and is outside of the PR scope.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if in the long term view it is as easy as thinking about the minor version not being set too large, especially with PDF 2.0 now available. Couldn't it be instead checking the versionString against a list of currently available versions? So, as of today: 1.0, 1.1, 1.2, 1.3, ,1.4, 1.5., 1.6, 1.7, 2.0 ?
Downside is that the list of available versions would have to be updated everytime a new version becomes available - but if the code instead states that the max correct minor version is 7, which is true for PDF 1.x, it would have to still be updated regarding the limitation of minor version for 2.x as there currently only is 2.0 (and this would again have to be updated once 2.1 becomes available).
Does that make any sense?

}
return minorVersion <= MAX_VALID_MAJOR_VERSION;
}

/**
* Creates a new {@link PdfHeader} instance using the passed parameters.
*
* @param versionString
* the version number from the PDF Header, should be of form
* <code>1.x</code> where x should be of the range 0-7.
* @param isPdfaCompliant
* boolean flag indicating if the PDF/A is compliant or non
* compliant with JHOVE's PDF/A profile.
* @return a {@link PdfHeader} instance initialised using
* <code>versionString</code> and <code>isPdfaCompliant</code>.
* @throws NullPointerException
* when parameter <code>versionString</code> is null.
*/
static PdfHeader fromValues(final String versionString,
final boolean isPdfaCompliant) {
if (versionString == null)
throw new NullPointerException(
"Parameter versionString can not be null.");
return new PdfHeader(versionString, isPdfaCompliant);
}

/**
* Factory method for {@link PdfHeader} that parses a new instance using the
* supplied {@link Parser} instance.
*
* @param _parser
* the {@link Parser} instance that will be used to parse header
* details
* @return a new {@link PdfHeader} instance derived using the supplied
* {@link Parser} or <code>null</code> when no header could be found
* and parsed.
*/
public static PdfHeader parseHeader(final Parser parser) {
Token token = null;
String value = null;
boolean isPdfACompliant = false;
String version = null;

/* Parse file header. */
for (;;) {
if (parser.getOffset() > 1024) {
return null;
}
try {
token = null;
token = parser.getNext(1024L);
} catch (IOException ee) {
return null;
} catch (Exception e) {
// fall through
}

if (token == null) {
return null;
}
if (token instanceof Comment) {
value = ((Comment) token).getValue();
if (value.indexOf(PDF_VER1_HEADER_PREFIX) == 0) {
version = value.substring(4, 7);
isPdfACompliant = true;
break;
}
// The implementation notes (though not the spec)
// allow an alternative signature of %!PS-Adobe-N.n PDF-M.m
if (value.indexOf(POSTSCRIPT_HEADER_PREFIX) == 0) {
// But be careful: that much by itself is the standard
// PostScript signature.
int n = value.indexOf(PDF_VER1_HEADER_PREFIX);
if (n >= 11) {
version = value.substring(n + 4);
break;
}
}
}
}

if (version == null) {
return null;
}

try {
isPdfACompliant = isTokenPdfACompliant(parser.getNext());
} catch (Exception excep) {
// Most likely a ClassCastException on a non-comment
isPdfACompliant = false;
}
// Check for PDF/A conformance. The next item must be
// a comment with four characters, each greater than 127
return new PdfHeader(version, isPdfACompliant);
}

private static int getMinorVersion(final String version) {
double doubleVer = Double.parseDouble(version);
double fractPart = doubleVer % 1;
int minor = (int) (10L * fractPart);
return minor;
}

private static boolean isTokenPdfACompliant(final Token token) {
String cmt = ((Comment) token).getValue();
char[] cmtArray = cmt.toCharArray();
int ctlcnt = 0;
for (int i = 0; i < 4; i++) {
if (cmtArray[i] > 127) {
ctlcnt++;
}
}
return (ctlcnt > 3);
}
}
Loading