diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 2cacef5120edd..01dd1d125d352 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -666,3 +666,4 @@ !highlight.pdf !bug1708040.pdf !issue18694.pdf +!issue18693.pdf diff --git a/test/pdfs/issue18693.pdf b/test/pdfs/issue18693.pdf new file mode 100755 index 0000000000000..68c628063762a Binary files /dev/null and b/test/pdfs/issue18693.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index e1f3169d58fb8..408cba67d7aca 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -1062,6 +1062,26 @@ describe("pdf_find_controller", function () { await testOnFind({ eventBus }); }); + it("performs a search in a text with compound word on two lines", async function () { + const { eventBus, pdfFindController } = + await initPdfFindController("issue18693.pdf"); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "hel-Lo", + }, + matchesPerPage: [1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[6]], + pageMatchesLength: [[7]], + }); + }); + describe("custom matcher", () => { it("calls to the matcher with the right arguments", async () => { const QUERY = "Foo bar"; diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 09e61fb2d9385..71855127a4b7b 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -131,7 +131,8 @@ function normalize(text) { // 30A0-30FF: Katakana const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const HKDiacritics = "(?:\u3099|\u309A)"; - const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; + const CompoundWord = "\\p{Ll}-\\n\\p{Lu}"; + const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need @@ -193,7 +194,7 @@ function normalize(text) { normalized = normalized.replace( normalizationRegex, - (match, p1, p2, p3, p4, p5, p6, p7, p8, i) => { + (match, p1, p2, p3, p4, p5, p6, p7, p8, p9, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... @@ -267,7 +268,7 @@ function normalize(text) { if (hasTrailingDashEOL) { // Diacritics are followed by a -\n. - // See comments in `if (p5)` block. + // See comments in `if (p6)` block. i += len - 1; positions.push([i - shift + 1, 1 + shift]); shift += 1; @@ -280,32 +281,41 @@ function normalize(text) { } if (p5) { + // Compound word with a line break after the hyphen. + positions.push([i - shift + 3, 1 + shift]); + shift += 1; + shiftOrigin += 1; + eol += 1; + return p5.replace("\n", ""); + } + + if (p6) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. // If X is encoded with UTF-32 then it can have a length greater than 1. // The \n isn't in the original text so here y = i, n = X.len - 2 and // o = X.len - 1. - const len = p5.length - 2; + const len = p6.length - 2; positions.push([i - shift + len, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p5.slice(0, -2); + return p6.slice(0, -2); } - if (p6) { + if (p7) { // An ideographic at the end of a line doesn't imply adding an extra // white space. // A CJK can be encoded in UTF-32, hence their length isn't always 1. - const len = p6.length - 1; + const len = p7.length - 1; positions.push([i - shift + len, shift]); shiftOrigin += 1; eol += 1; - return p6.slice(0, -1); + return p7.slice(0, -1); } - if (p7) { + if (p8) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -327,7 +337,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p8; + return p9; } );