Skip to content

Commit

Permalink
fix(markdown-serializer): Override Turndown escaping behaviour with c…
Browse files Browse the repository at this point in the history
…ustom rules (#102)
  • Loading branch information
rfgamaral authored Feb 23, 2023
1 parent b30b0f2 commit 6950afb
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 120 deletions.
9 changes: 8 additions & 1 deletion src/constants/regular-expressions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,11 @@
*/
const REGEX_LINE_BREAKS = /(?:\r\n|\r|\n)/g

export { REGEX_LINE_BREAKS }
/**
* A regex that matches a single punctuation character: any of the 32 US-ASCII punctuation
* characters (including the backslash), or any character in the Unicode ranges
* U+2000–U+206F and U+2E00–U+2E7F.
*
* The pattern is declared without flags, so it is neither global nor sticky.
*
* @see https://stackoverflow.com/a/25575009
*/
const REGEX_PUNCTUATION = /[\u2000-\u206f\u2e00-\u2e7f'!"#$%&()*+,\-./:;<=>?@\\[\]^_`{|}~]/

export { REGEX_LINE_BREAKS, REGEX_PUNCTUATION }
257 changes: 160 additions & 97 deletions src/serializers/markdown/markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ import { createMarkdownSerializer } from './markdown'

import type { MarkdownSerializerReturnType } from './markdown'

/**
* HTML fixture made of `<br>`-separated lines that contain HTML-escaped special characters
* (`&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`).
*/
const HTML_INPUT_SPECIAL_HTML_CHARS = `Ambition &amp; Balance<br>
&lt;doist&gt;<br>
&lt;/doist&gt;<br>
&lt;doist&gt;&lt;/doist&gt;<br>
&quot;Doist&quot;<br>
&#39;Doist&#39;`

const HTML_INPUT_HEADINGS = `<h1>Heading level 1</h1>
<h2>Heading level 2</h2>
<h3>Heading level 3</h3>
Expand Down Expand Up @@ -94,11 +101,6 @@ const HTML_INPUT_UNORDERED_LISTS = `<ul>
</ul>
<hr>
<ul>
<li>1968. A great year!</li>
<li>I think 1969 was second best.</li>
</ul>
<hr>
<ul>
<li>This is the first list item.</li>
<li>Here&#39;s the second list item.<br> I need to add another paragraph below the second list item.</li>
<li>And here&#39;s the third list item.</li>
Expand Down Expand Up @@ -137,11 +139,6 @@ const HTML_INPUT_TASK_LISTS = `<ul data-type="taskList">
</ul>
<hr>
<ul data-type="taskList">
<li data-type="taskItem" data-checked="false">1968. A great year!</li>
<li data-type="taskItem" data-checked="true">I think 1969 was second best.</li>
</ul>
<hr>
<ul data-type="taskList">
<li data-type="taskItem" data-checked="false">This is the first list item.</li>
<li data-type="taskItem" data-checked="false">Here&#39;s the second list item.<br> I need to add another paragraph below the second list item.</li>
<li data-type="taskItem" data-checked="false">And here&#39;s the third list item.</li>
Expand Down Expand Up @@ -183,29 +180,40 @@ const HTML_INPUT_LINKS = `<p>My favorite search engine is <a href="https://duckd

const HTML_INPUT_STYLED_LINKS = `<p>I love supporting the <strong><a href="https://eff.org">EFF</a></strong>.<br>This is the <em><a href="https://www.markdownguide.org">Markdown Guide</a></em>.<br>See the section on <a href="#code"><code>code</code></a>.</p>`

describe('Markdown Serializer', () => {
const HTML_INPUT_SPECIAL_HTML_CHARS = `Ambition &amp; Balance<br>
&lt;doist&gt;<br>
&lt;/doist&gt;<br>
&lt;doist&gt;&lt;/doist&gt;<br>
&quot;Doist&quot;<br>
&#39;Doist&#39;`

const HTML_INPUT_SPECIAL_MARKDOWN_CHARS = `before \\ after<br>
before * after<br>
- after<br>
+ after<br>
= after<br>
=== after<br>
\` after <br>
~~~ after<br>
before [ after<br>
before ] after<br>
> after<br>
before _ after<br>
1. after<br>
99. after<br>`
/**
* HTML fixture with one paragraph per US-ASCII punctuation character (32 paragraphs). Each
* paragraph has the shape `\X text \X`, i.e. the punctuation character preceded by a literal
* backslash, once before and once after the word `text`. The characters `&`, `<` and `>` are
* written as HTML entities (`&amp;`, `&lt;`, `&gt;`).
*
* NOTE(review): `PONCTUATION` looks like a misspelling of `PUNCTUATION`; renaming it requires
* updating every reference to this constant at the same time.
*/
const HTML_INPUT_PONCTUATION_CHARACTERS = `<p>\\' text \\'</p>
<p>\\! text \\!</p>
<p>\\" text \\"</p>
<p>\\# text \\#</p>
<p>\\$ text \\$</p>
<p>\\% text \\%</p>
<p>\\&amp; text \\&amp;</p>
<p>\\( text \\(</p>
<p>\\) text \\)</p>
<p>\\* text \\*</p>
<p>\\+ text \\+</p>
<p>\\, text \\,</p>
<p>\\\\ text \\\\</p>
<p>\\- text \\-</p>
<p>\\. text \\.</p>
<p>\\/ text \\/</p>
<p>\\: text \\:</p>
<p>\\; text \\;</p>
<p>\\&lt; text \\&lt;</p>
<p>\\= text \\=</p>
<p>\\&gt; text \\&gt;</p>
<p>\\? text \\?</p>
<p>\\@ text \\@</p>
<p>\\[ text \\[</p>
<p>\\] text \\]</p>
<p>\\^ text \\^</p>
<p>\\_ text \\_</p>
<p>\\\` text \\\`</p>
<p>\\{ text \\{</p>
<p>\\| text \\|</p>
<p>\\} text \\}</p>
<p>\\~ text \\~</p>`

describe('Markdown Serializer', () => {
describe('Plain-text Document', () => {
describe('with default extensions', () => {
let markdownSerializer: MarkdownSerializerReturnType
Expand All @@ -227,24 +235,6 @@ before _ after<br>
'Doist'`)
})

test('special Markdown characters are NOT escaped', () => {
expect(markdownSerializer.serialize(HTML_INPUT_SPECIAL_MARKDOWN_CHARS))
.toBe(`before \\ after
before * after
- after
+ after
= after
=== after
\` after
~~~ after
before [ after
before ] after
> after
before _ after
1. after
99. after`)
})

test('paragraphs Markdown output is correct', () => {
expect(markdownSerializer.serialize(HTML_INPUT_PARAGRAPHS)).toBe(
`I really like using Markdown.
Expand All @@ -260,6 +250,49 @@ I think I'll use it to format all of my documents from now on.`,
replacement: expect.any(Function),
})
})

// Checks that the serializer leaves Markdown-looking text untouched: every expected string
// below keeps the backslashes and punctuation exactly as they appear in the input text
// (HTML entities such as `&amp;` show up decoded).
describe('with overridden `escape` function', () => {
test('return the input as-is (escaping behaviour disabled)', () => {
// Paragraphs that look like list items with an ordered-list-style number must come out
// without any added backslashes.
expect(
markdownSerializer.serialize(`<p>- 1968. A great year!</p>
<p>- I think 1969 was second best.</p>`),
).toBe(`- 1968. A great year!
- I think 1969 was second best.`)
// A backslash followed by a punctuation character must come out as the same single
// backslash followed by that character.
expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS))
.toBe(`\\' text \\'
\\! text \\!
\\" text \\"
\\# text \\#
\\$ text \\$
\\% text \\%
\\& text \\&
\\( text \\(
\\) text \\)
\\* text \\*
\\+ text \\+
\\, text \\,
\\\\ text \\\\
\\- text \\-
\\. text \\.
\\/ text \\/
\\: text \\:
\\; text \\;
\\< text \\<
\\= text \\=
\\> text \\>
\\? text \\?
\\@ text \\@
\\[ text \\[
\\] text \\]
\\^ text \\^
\\_ text \\_
\\\` text \\\`
\\{ text \\{
\\| text \\|
\\} text \\}
\\~ text \\~`)
})
})
})

describe('with custom `*Suggestion` extensions', () => {
Expand Down Expand Up @@ -292,7 +325,7 @@ Answer: [Doist Frontend](channel://190200)`)
})

describe('Rich-text Document', () => {
describe('without default extensions', () => {
describe('with default extensions', () => {
let markdownSerializer: MarkdownSerializerReturnType

beforeEach(() => {
Expand All @@ -309,24 +342,6 @@ Answer: [Doist Frontend](channel://190200)`)
'Doist'`)
})

test('special Markdown characters are escaped', () => {
expect(markdownSerializer.serialize(HTML_INPUT_SPECIAL_MARKDOWN_CHARS))
.toBe(`before \\\\ after
before \\* after
\\- after
\\+ after
\\= after
\\=== after
\\\` after
\\~~~ after
before \\[ after
before \\] after
\\> after
before \\_ after
1\\. after
99\\. after`)
})

test('headings Markdown output is correct', () => {
expect(markdownSerializer.serialize(HTML_INPUT_HEADINGS)).toBe(
'# Heading level 1\n\n## Heading level 2\n\n### Heading level 3\n\n#### Heading level 4\n\n##### Heading level 5\n\n###### Heading level 6',
Expand Down Expand Up @@ -427,11 +442,6 @@ Strikethrough uses two tildes: ~~scratch this~~`,
---
- 1968\\. A great year!
- I think 1969 was second best.
---
- This is the first list item.
- Here's the second list item.
I need to add another paragraph below the second list item.
Expand Down Expand Up @@ -471,11 +481,6 @@ Strikethrough uses two tildes: ~~scratch this~~`,
---
- 1968\\. A great year!
- I think 1969 was second best.
---
- This is the first list item.
- Here's the second list item.
I need to add another paragraph below the second list item.
Expand Down Expand Up @@ -554,15 +559,83 @@ See the section on [\`code\`](#code).`,
)
})

test('special Markdown characters are NOT escaped if `escape` is disabled', () => {
const customSerializer = createMarkdownSerializer(getSchema([RichTextKit]), {
escape: false,
describe('with overridden `escape` function', () => {
test('backslash characters preceding punctuation characters are escaped correctly', () => {
// Every input paragraph has the shape `\X text \X`; the expected output doubles the
// backslash in front of each punctuation character (`\\X text \\X`).
// For the backslash paragraph the input holds two backslashes and the expected output
// three, which is consistent with only the first backslash (the one followed by a
// punctuation character) being escaped.
expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS))
.toBe(`\\\\' text \\\\'
\\\\! text \\\\!
\\\\" text \\\\"
\\\\# text \\\\#
\\\\$ text \\\\$
\\\\% text \\\\%
\\\\& text \\\\&
\\\\( text \\\\(
\\\\) text \\\\)
\\\\* text \\\\*
\\\\+ text \\\\+
\\\\, text \\\\,
\\\\\\ text \\\\\\
\\\\- text \\\\-
\\\\. text \\\\.
\\\\/ text \\\\/
\\\\: text \\\\:
\\\\; text \\\\;
\\\\< text \\\\<
\\\\= text \\\\=
\\\\> text \\\\>
\\\\? text \\\\?
\\\\@ text \\\\@
\\\\[ text \\\\[
\\\\] text \\\\]
\\\\^ text \\\\^
\\\\_ text \\\\_
\\\\\` text \\\\\`
\\\\{ text \\\\{
\\\\| text \\\\|
\\\\} text \\\\}
\\\\~ text \\\\~`)
})

test('text content that matches the ordered list syntax is escaped correctly', () => {
// The first list item starts with digits followed by `.`, so the expected output has a
// backslash in front of that `.`; the second item, where the number appears
// mid-sentence, is expected to be left unchanged.
expect(
markdownSerializer.serialize(`<ul>
<li>1968. A great year!</li>
<li>I think 1969 was second best.</li>
</ul>`),
).toBe(`- 1968\\. A great year!
- I think 1969 was second best.`)
})
expect(
customSerializer.serialize(
`<p><strong>Wrapped markdown</strong> **still markdown**</p>`,
),
).toBe(`**Wrapped markdown** **still markdown**`)
})
})

Expand Down Expand Up @@ -620,11 +693,6 @@ See the section on [\`code\`](#code).`,
---
- 1968\\. A great year!
- I think 1969 was second best.
---
- This is the first list item.
- Here's the second list item.
I need to add another paragraph below the second list item.
Expand Down Expand Up @@ -686,11 +754,6 @@ See the section on [\`code\`](#code).`,
---
- [ ] 1968\\. A great year!
- [x] I think 1969 was second best.
---
- [ ] This is the first list item.
- [ ] Here's the second list item.
I need to add another paragraph below the second list item.
Expand Down
Loading

0 comments on commit 6950afb

Please sign in to comment.