From 8d765906183296906466afa4e61ebcad059a813c Mon Sep 17 00:00:00 2001
From: tallison
Date: Mon, 23 May 2022 09:10:45 -0400
Subject: [PATCH] tighten up regex in StandardsText

---
 .../src/main/java/org/apache/tika/sax/StandardsText.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index bdba930a71..b4109d9dc0 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -60,7 +60,8 @@ public class StandardsText {
     private static final String REGEX_APPLICABLE_DOCUMENTS = "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
 
     // Regular expression to match the alphanumeric identifier of the standard
-    private static final String REGEX_IDENTIFIER = "(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";
+    private static final String REGEX_IDENTIFIER = "(?<identifier>([0-9]{3,20}|([A-Z]+(-|_|\\.)"
+            + "?[0-9]{2,20}))((-|_|\\.)?[A-Z0-9]+){0,10})";
 
     // Regular expression to match the standard organization
     private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -71,8 +72,9 @@ public class StandardsText {
 
     // Regular expression to match a string that is supposed to be a standard
     // reference
-    private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w+)"
-            + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" + "(?<secondOrganization>[A-Z]\\w+)" + "\\)?)?"
+    private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,100})"
+            + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,100}\\s)*\\(?" + "(?<secondOrganization>[A"
+            + "-Z]\\w{1,100})" + "\\)?)?"
             + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
 
     // Regular expression to match the standard organization within a string