diff --git a/src/main/java/org/spdx/utility/compare/TemplateRegexMatcher.java b/src/main/java/org/spdx/utility/compare/TemplateRegexMatcher.java
index 00bb0066..9abe68ef 100644
--- a/src/main/java/org/spdx/utility/compare/TemplateRegexMatcher.java
+++ b/src/main/java/org/spdx/utility/compare/TemplateRegexMatcher.java
@@ -42,6 +42,7 @@
  *
  * isTemplateMatchWithinText(String text) will return true if the text text matches the template
  *
+ * getQuickMatchRegex(int wordLimit) will return a regular expression with limited backtracking which can be used for a quick search
  * getCompleteRegex() will return a regular expression for the entire license where
  * getStartRegex(int wordLimit) will return a regular expression to match the beginning of a license
  * and getEndRegex(int wordLimit) will return a regular expression to match the end of a license
@@ -53,7 +54,7 @@ public class TemplateRegexMatcher implements ILicenseTemplateOutputHandler {
 
 	static final Logger logger = LoggerFactory.getLogger(TemplateRegexMatcher.class);
 
-	static final int WORD_LIMIT = 25; // number of words to search for at the beginning and end of the template
+	static final int WORD_LIMIT = 25; // number of words to search for in the quick match, beginning and end of the template
 
 	static final String REGEX_GLOBAL_MODIFIERS = "(?im)"; // ignore case and muti-line
 
@@ -177,6 +178,45 @@ public String getCompleteRegex() {
 		return REGEX_GLOBAL_MODIFIERS + regexPatternList.toString();
 	}
 
+	/**
+	 * Builds a quick pre-filter regex from the longest run of contiguous
+	 * {@link RegexToken} elements in the template. Any other element ends the
+	 * current run, so the result avoids optional and var tags. The first run
+	 * that reaches {@code wordLimit} tokens is used; if no run is that long, the
+	 * longest run anywhere in the template is used. When there are no tokens, or
+	 * {@code wordLimit} is not positive, the regex is built from an empty list.
+	 *
+	 * @param wordLimit maximum number of contiguous words to match
+	 * @return a regular expression to match the template with minimum backtracking - avoiding optional and var tags
+	 */
+	public String getQuickMatchRegex(int wordLimit) {
+		List<RegexElement> elementList = regexPatternList.getElements();
+		int bestStart = 0; // first index of the longest run so far (earliest wins ties)
+		int bestLength = 0; // number of tokens in that run
+		int runStart = 0; // first index of the run currently being scanned
+		int runLength = 0; // number of tokens in the run currently being scanned
+		// stop as soon as one run is long enough, otherwise scan the whole template
+		for (int index = 0; index < elementList.size() && bestLength < wordLimit; index++) {
+			if (elementList.get(index) instanceof RegexToken) {
+				if (runLength == 0) {
+					runStart = index;
+				}
+				runLength++;
+				if (runLength > bestLength) {
+					bestStart = runStart;
+					bestLength = runLength;
+				}
+			} else {
+				runLength = 0;
+			}
+		}
+		RegexList result = new RegexList();
+		for (int index = bestStart; index < bestStart + bestLength; index++) {
+			result.addElement(elementList.get(index));
+		}
+		return REGEX_GLOBAL_MODIFIERS + result.toString();
+	}
+
 	/**
 	 * @param wordLimit number of non optional words to include in the pattern
 	 * @return a regex to match the start of the license per the template
@@ -269,15 +309,18 @@ public boolean isTemplateMatchWithinText(String text) throws SpdxCompareExceptio
 
 		String compareText = normalizedText.toString();
 
-		Pattern startPattern = Pattern.compile(getStartRegex(WORD_LIMIT));
-		Matcher startMatcher = startPattern.matcher(compareText);
-		if(startMatcher.find()) {
-			startIndex = startMatcher.start();
-			Pattern endPattern = Pattern.compile(getEndRegex(WORD_LIMIT));
-			Matcher endMatcher = endPattern.matcher(compareText);
-			if (endMatcher.find()) {
-				endIndex = endMatcher.end();
-				result = compareText.substring(startIndex, endIndex);
+		Pattern quickPattern = Pattern.compile(getQuickMatchRegex(WORD_LIMIT));
+		if (quickPattern.matcher(compareText).find()) {
+			Pattern startPattern = Pattern.compile(getStartRegex(WORD_LIMIT));
+			Matcher startMatcher = startPattern.matcher(compareText);
+			if(startMatcher.find()) {
+				startIndex = startMatcher.start();
+				Pattern endPattern = Pattern.compile(getEndRegex(WORD_LIMIT));
+				Matcher endMatcher = endPattern.matcher(compareText);
+				if (endMatcher.find()) {
+					endIndex = endMatcher.end();
+					result = compareText.substring(startIndex, endIndex);
+				}
 			}
 		}
 		return result;