fix(markdown-serializer): Override Turndown escaping behaviour with custom rules

BREAKING CHANGE: With this change, we are making sure that the Markdown serializer escaping behaviour is tailored to our needs in every place that it's used. Thus, we no longer need the `options` parameter to disable the escaping behaviour.
Doist · Jan 16, 2023 · 1e4a581 · 1e4a581
1 parent 3599b44
commit 1e4a581
Show file tree

Hide file tree

Showing 3 changed files with 199 additions and 120 deletions.
diff --git a/src/constants/regular-expressions.ts b/src/constants/regular-expressions.ts
@@ -3,4 +3,11 @@
  */
 const REGEX_LINE_BREAKS = /(?:\r\n|\r|\n)/g
 
-export { REGEX_LINE_BREAKS }
+/**
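+ * A regex that matches a single punctuation character: any US-ASCII punctuation character, or any
+ * character in the Unicode ranges U+2000–U+206F (General Punctuation) and U+2E00–U+2E7F
+ * (Supplemental Punctuation).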
+ *
+ * @see https://stackoverflow.com/a/25575009
+ */
+const REGEX_PUNCTUATION = /[\u2000-\u206f\u2e00-\u2e7f'!"#$%&()*+,\-./:;<=>?@\\[\]^_`{|}~]/
+
+export { REGEX_LINE_BREAKS, REGEX_PUNCTUATION }
diff --git a/src/serializers/markdown/markdown.test.ts b/src/serializers/markdown/markdown.test.ts
@@ -11,6 +11,13 @@ import { createMarkdownSerializer } from './markdown'
 
 import type { MarkdownSerializerReturnType } from './markdown'
 
+const HTML_INPUT_SPECIAL_HTML_CHARS = `Ambition &amp; Balance<br>
+&lt;doist&gt;<br>
+&lt;/doist&gt;<br>
+&lt;doist&gt;&lt;/doist&gt;<br>
+&quot;Doist&quot;<br>
+&#39;Doist&#39;`
+
 const HTML_INPUT_HEADINGS = `<h1>Heading level 1</h1>
 <h2>Heading level 2</h2>
 <h3>Heading level 3</h3>
@@ -94,11 +101,6 @@ const HTML_INPUT_UNORDERED_LISTS = `<ul>
 </ul>
 <hr>
 <ul>
-<li>1968. A great year!</li>
-<li>I think 1969 was second best.</li>
-</ul>
-<hr>
-<ul>
 <li>This is the first list item.</li>
 <li>Here&#39;s the second list item.<br>  I need to add another paragraph below the second list item.</li>
 <li>And here&#39;s the third list item.</li>
@@ -137,11 +139,6 @@ const HTML_INPUT_TASK_LISTS = `<ul data-type="taskList">
 </ul>
 <hr>
 <ul data-type="taskList">
-<li data-type="taskItem" data-checked="false">1968. A great year!</li>
-<li data-type="taskItem" data-checked="true">I think 1969 was second best.</li>
-</ul>
-<hr>
-<ul data-type="taskList">
 <li data-type="taskItem" data-checked="false">This is the first list item.</li>
 <li data-type="taskItem" data-checked="false">Here&#39;s the second list item.<br>  I need to add another paragraph below the second list item.</li>
 <li data-type="taskItem" data-checked="false">And here&#39;s the third list item.</li>
@@ -183,29 +180,40 @@ const HTML_INPUT_LINKS = `<p>My favorite search engine is <a href="https://duckd
 
 const HTML_INPUT_STYLED_LINKS = `<p>I love supporting the <strong><a href="https://eff.org">EFF</a></strong>.<br>This is the <em><a href="https://www.markdownguide.org">Markdown Guide</a></em>.<br>See the section on <a href="#code"><code>code</code></a>.</p>`
 
-describe('Markdown Serializer', () => {
-    const HTML_INPUT_SPECIAL_HTML_CHARS = `Ambition &amp; Balance<br>
-&lt;doist&gt;<br>
-&lt;/doist&gt;<br>
-&lt;doist&gt;&lt;/doist&gt;<br>
-&quot;Doist&quot;<br>
-&#39;Doist&#39;`
-
-    const HTML_INPUT_SPECIAL_MARKDOWN_CHARS = `before \\ after<br>
-before * after<br>
-- after<br>
-+ after<br>
-= after<br>
-=== after<br>
-\` after <br>
-~~~ after<br>
-before [ after<br>
-before ] after<br>
-> after<br>
-before _ after<br>
-1. after<br>
-99. after<br>`
+const HTML_INPUT_PONCTUATION_CHARACTERS = `<p>\\'</p>
+<p>\\!</p>
+<p>\\"</p>
+<p>\\#</p>
+<p>\\$</p>
+<p>\\%</p>
+<p>\\&amp;</p>
+<p>\\(</p>
+<p>\\)</p>
+<p>\\*</p>
+<p>\\+</p>
+<p>\\,</p>
+<p>\\\\</p>
+<p>\\-</p>
+<p>\\.</p>
+<p>\\/</p>
+<p>\\:</p>
+<p>\\;</p>
+<p>\\&lt;</p>
+<p>\\=</p>
+<p>\\&gt;</p>
+<p>\\?</p>
+<p>\\@</p>
+<p>\\[</p>
+<p>\\]</p>
+<p>\\^</p>
+<p>\\_</p>
+<p>\\\`</p>
+<p>\\{</p>
+<p>\\|</p>
+<p>\\}</p>
+<p>\\~</p>`
 
+describe('Markdown Serializer', () => {
     describe('Plain-text Document', () => {
         describe('with default extensions', () => {
             let markdownSerializer: MarkdownSerializerReturnType
@@ -227,24 +235,6 @@ before _ after<br>
 'Doist'`)
             })
 
-            test('special Markdown characters are NOT escaped', () => {
-                expect(markdownSerializer.serialize(HTML_INPUT_SPECIAL_MARKDOWN_CHARS))
-                    .toBe(`before \\ after
-before * after
-- after
-+ after
-= after
-=== after
-\` after
-~~~ after
-before [ after
-before ] after
-> after
-before _ after
-1. after
-99. after`)
-            })
-
             test('paragraphs Markdown output is correct', () => {
                 expect(markdownSerializer.serialize(HTML_INPUT_PARAGRAPHS)).toBe(
                     `I really like using Markdown.
@@ -260,6 +250,49 @@ I think I'll use it to format all of my documents from now on.`,
                     replacement: expect.any(Function),
                 })
             })
+
+            describe('with overridden `escape` function', () => {
+                test('return the input as-is (escaping behaviour disabled)', () => {
+                    expect(
+                        markdownSerializer.serialize(`<p>- 1968. A great year!</p>
+<p>- I think 1969 was second best.</p>`),
+                    ).toBe(`- 1968. A great year!
+- I think 1969 was second best.`)
+                    expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS))
+                        .toBe(`\\'
+\\!
+\\"
+\\#
+\\$
+\\%
+\\&
+\\(
+\\)
+\\*
+\\+
+\\,
+\\\\
+\\-
+\\.
+\\/
+\\:
+\\;
+\\<
+\\=
+\\>
+\\?
+\\@
+\\[
+\\]
+\\^
+\\_
+\\\`
+\\{
+\\|
+\\}
+\\~`)
+                })
+            })
         })
 
         describe('with custom `*Suggestion` extensions', () => {
@@ -292,7 +325,7 @@ Answer: [Doist Frontend](channel://190200)`)
     })
 
     describe('Rich-text Document', () => {
-        describe('without default extensions', () => {
+        describe('with default extensions', () => {
             let markdownSerializer: MarkdownSerializerReturnType
 
             beforeEach(() => {
@@ -309,24 +342,6 @@ Answer: [Doist Frontend](channel://190200)`)
 'Doist'`)
             })
 
-            test('special Markdown characters are escaped', () => {
-                expect(markdownSerializer.serialize(HTML_INPUT_SPECIAL_MARKDOWN_CHARS))
-                    .toBe(`before \\\\ after
-before \\* after
-\\- after
-\\+ after
-\\= after
-\\=== after
-\\\` after
-\\~~~ after
-before \\[ after
-before \\] after
-\\> after
-before \\_ after
-1\\. after
-99\\. after`)
-            })
-
             test('headings Markdown output is correct', () => {
                 expect(markdownSerializer.serialize(HTML_INPUT_HEADINGS)).toBe(
                     '# Heading level 1\n\n## Heading level 2\n\n### Heading level 3\n\n#### Heading level 4\n\n##### Heading level 5\n\n###### Heading level 6',
@@ -427,11 +442,6 @@ Strikethrough uses two tildes: ~~scratch this~~`,
 
 ---
 
-- 1968\\. A great year!
-- I think 1969 was second best.
-
----
-
 - This is the first list item.
 - Here's the second list item.
     I need to add another paragraph below the second list item.
@@ -471,11 +481,6 @@ Strikethrough uses two tildes: ~~scratch this~~`,
 
 ---
 
--   1968\\. A great year!
--   I think 1969 was second best.
-
----
-
 -   This is the first list item.
 -   Here's the second list item.
     I need to add another paragraph below the second list item.
@@ -554,15 +559,83 @@ See the section on [\`code\`](#code).`,
                 )
             })
 
-            test('special Markdown characters are NOT escaped if `escape` is disabled', () => {
-                const customSerializer = createMarkdownSerializer(getSchema([RichTextKit]), {
-                    escape: false,
+            describe('with overridden `escape` function', () => {
+                test('backslash characters preceding punctuation characters are escaped correctly', () => {
+                    expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS))
+                        .toBe(`\\\\'
+
+\\\\!
+
+\\\\"
+
+\\\\#
+
+\\\\$
+
+\\\\%
+
+\\\\&
+
+\\\\(
+
+\\\\)
+
+\\\\*
+
+\\\\+
+
+\\\\,
+
+\\\\\\
+
+\\\\-
+
+\\\\.
+
+\\\\/
+
+\\\\:
+
+\\\\;
+
+\\\\<
+
+\\\\=
+
+\\\\>
+
+\\\\?
+
+\\\\@
+
+\\\\[
+
+\\\\]
+
+\\\\^
+
+\\\\_
+
+\\\\\`
+
+\\\\{
+
+\\\\|
+
+\\\\}
+
+\\\\~`)
+                })
+
+                test('text content that matches the ordered list syntax is escaped correctly', () => {
+                    expect(
+                        markdownSerializer.serialize(`<ul>
+<li>1968. A great year!</li>
+<li>I think 1969 was second best.</li>
+</ul>`),
+                    ).toBe(`- 1968\\. A great year!
+- I think 1969 was second best.`)
                 })
-                expect(
-                    customSerializer.serialize(
-                        `<p><strong>Wrapped markdown</strong> **still markdown**</p>`,
-                    ),
-                ).toBe(`**Wrapped markdown** **still markdown**`)
             })
         })
 
@@ -620,11 +693,6 @@ See the section on [\`code\`](#code).`,
 
 ---
 
--   1968\\. A great year!
--   I think 1969 was second best.
-
----
-
 -   This is the first list item.
 -   Here's the second list item.
     I need to add another paragraph below the second list item.
@@ -686,11 +754,6 @@ See the section on [\`code\`](#code).`,
 
 ---
 
-- [ ] 1968\\. A great year!
-- [x] I think 1969 was second best.
-
----
-
 - [ ] This is the first list item.
 - [ ] Here's the second list item.
     I need to add another paragraph below the second list item.