From e304501ec3384a48d820b51668adb3e7e81c15fa Mon Sep 17 00:00:00 2001 From: Johannes Kaufmann Date: Sat, 31 Aug 2024 12:48:16 +0200 Subject: [PATCH] inital commit of v2 --- .github/ISSUE_TEMPLATE/bug_report.md | 30 + .github/dependabot.yml | 9 + .github/workflows/go.yml | 58 ++ .github/workflows/release.yml | 36 ++ .gitignore | 27 + .goreleaser.yaml | 46 ++ CONTRIBUTING.md | 0 README.md | 120 ++++ SECURITY.md | 6 + cli/cmd/cli_run.go | 30 + cli/cmd/cmd_convert.go | 48 ++ cli/cmd/cmd_help.go | 108 ++++ cli/cmd/cmd_version.go | 11 + cli/cmd/errors.go | 66 ++ cli/cmd/exec.go | 147 +++++ cli/cmd/exec_test.go | 319 ++++++++++ cli/cmd/flags.go | 80 +++ cli/cmd/flags_categorize.go | 68 +++ cli/cmd/flags_test.go | 47 ++ cli/cmd/print.go | 59 ++ cli/cmd/testdata/.gitattributes | 4 + .../[argument_unknown]_html/stderr.golden | 7 + .../[argument_unknown]_html/stdout.golden | 0 .../stderr.golden | 7 + .../stdout.golden | 0 .../stderr.golden | 7 + .../stdout.golden | 0 .../[argument_unknown]_version/stderr.golden | 7 + .../[argument_unknown]_version/stdout.golden | 0 .../[convert]_strong_default/stderr.golden | 0 .../[convert]_strong_default/stdout.golden | 1 + .../stderr.golden | 0 .../stdout.golden | 1 + .../stderr.golden | 0 .../stdout.golden | 1 + .../stderr.golden | 5 + .../stdout.golden | 0 .../[flag_unknown]_with_pipe/stderr.golden | 3 + .../[flag_unknown]_with_pipe/stdout.golden | 0 .../stderr.golden | 3 + .../stdout.golden | 0 .../[general]_help_pipe/stderr.golden | 0 .../[general]_help_pipe/stdout.golden | 53 ++ .../[general]_help_terminal/stderr.golden | 0 .../[general]_help_terminal/stdout.golden | 53 ++ .../[general]_no_content/stderr.golden | 7 + .../[general]_no_content/stdout.golden | 0 .../[general]_version_pipe/stderr.golden | 0 .../[general]_version_pipe/stdout.golden | 5 + .../[general]_version_terminal/stderr.golden | 0 .../[general]_version_terminal/stdout.golden | 5 + .../stderr.golden | 3 + .../stdout.golden | 0 
.../[validation]_invalid_value/stderr.golden | 3 + .../[validation]_invalid_value/stdout.golden | 0 .../[validation]_no_value/stderr.golden | 0 .../[validation]_no_value/stdout.golden | 1 + cli/cmd/util_pipe.go | 25 + cli/main.go | 30 + collapse/collapse.go | 187 ++++++ collapse/collapse_test.go | 289 +++++++++ collapse/whitespace.go | 79 +++ collapse/whitespace_test.go | 247 ++++++++ convert.go | 23 + convert_test.go | 143 +++++ converter/base.go | 133 ++++ converter/base_test.go | 164 +++++ converter/convert.go | 118 ++++ converter/convert_test.go | 78 +++ converter/converter.go | 44 ++ converter/ctx.go | 186 ++++++ converter/ctx_test.go | 54 ++ converter/escape.go | 82 +++ converter/escape_test.go | 55 ++ converter/keep_remove.go | 42 ++ converter/plugin.go | 22 + converter/prioritized.go | 33 + converter/prioritized_test.go | 36 ++ converter/register.go | 171 ++++++ converter/render.go | 65 ++ converter/status.go | 8 + converter/url.go | 108 ++++ converter/url_test.go | 305 ++++++++++ examples/basics/main.go | 19 + examples/options/main.go | 29 + go.mod | 24 + go.sum | 51 ++ internal/domutils/add_space.go | 66 ++ internal/domutils/add_space_test.go | 61 ++ internal/domutils/adjacent.go | 75 +++ internal/domutils/adjacent_test.go | 300 +++++++++ internal/domutils/alternatives.go | 114 ++++ internal/domutils/alternatives_test.go | 82 +++ internal/domutils/domutils.go | 43 ++ internal/domutils/empty_code.go | 42 ++ internal/domutils/empty_code_test.go | 74 +++ internal/domutils/list_end_comment.go | 74 +++ internal/domutils/redundant.go | 29 + internal/domutils/redundant_test.go | 115 ++++ internal/domutils/span.go | 50 ++ internal/domutils/span_test.go | 82 +++ internal/domutils/swap.go | 51 ++ internal/domutils/swap_test.go | 567 ++++++++++++++++++ internal/escape/elem_backslash.go | 9 + internal/escape/elem_code.go | 45 ++ internal/escape/elem_code_test.go | 129 ++++ internal/escape/elem_divider.go | 47 ++ internal/escape/elem_divider_test.go | 79 +++ 
internal/escape/elem_header.go | 77 +++ internal/escape/elem_header_test.go | 157 +++++ internal/escape/elem_image.go | 36 ++ internal/escape/elem_image_test.go | 72 +++ internal/escape/elem_italic_bold.go | 21 + internal/escape/elem_italic_bold_test.go | 204 +++++++ internal/escape/elem_list.go | 67 +++ internal/escape/elem_list_test.go | 123 ++++ internal/escape/elem_quote.go | 23 + internal/escape/elem_quote_test.go | 67 +++ internal/escape/replacer.go | 8 + internal/escape/util.go | 70 +++ internal/escape/util_test.go | 208 +++++++ internal/tester/dom.go | 51 ++ internal/tester/dom_representation.go | 20 + internal/tester/goldenfiles.go | 98 +++ internal/tester/round_trip.go | 121 ++++ internal/textutils/codefence.go | 61 ++ internal/textutils/codefence_test.go | 79 +++ internal/textutils/collapse_code.go | 31 + internal/textutils/collapse_code_test.go | 57 ++ internal/textutils/consecutive_newlines.go | 90 +++ .../textutils/consecutive_newlines_test.go | 170 ++++++ internal/textutils/delimiter.go | 37 ++ internal/textutils/delimiter_test.go | 90 +++ internal/textutils/escape_multiline.go | 77 +++ internal/textutils/escape_multiline_test.go | 140 +++++ internal/textutils/prefix_lines.go | 16 + internal/textutils/prefix_lines_test.go | 61 ++ internal/textutils/quote.go | 39 ++ internal/textutils/quote_test.go | 53 ++ internal/textutils/surrounding_spaces.go | 60 ++ internal/textutils/surrounding_spaces_test.go | 73 +++ marker/marker.go | 81 +++ marker/marker_test.go | 32 + plugin/commonmark/commonmark.go | 185 ++++++ plugin/commonmark/commonmark_test.go | 288 +++++++++ plugin/commonmark/handle_pre_render.go | 91 +++ plugin/commonmark/handle_render.go | 47 ++ plugin/commonmark/options.go | 118 ++++ plugin/commonmark/render_blockquote.go | 31 + plugin/commonmark/render_bold_italic.go | 39 ++ plugin/commonmark/render_bold_italic_test.go | 132 ++++ plugin/commonmark/render_break.go | 13 + plugin/commonmark/render_code.go | 149 +++++ 
plugin/commonmark/render_comment.go | 25 + plugin/commonmark/render_divider.go | 15 + plugin/commonmark/render_heading.go | 148 +++++ plugin/commonmark/render_image.go | 63 ++ plugin/commonmark/render_link.go | 97 +++ plugin/commonmark/render_list.go | 106 ++++ plugin/commonmark/testdata/.gitattributes | 4 + .../testdata/GoldenFiles/blockquote.in.html | 64 ++ .../testdata/GoldenFiles/blockquote.out.md | 46 ++ .../testdata/GoldenFiles/bold.in.html | 152 +++++ .../testdata/GoldenFiles/bold.out.md | 159 +++++ .../testdata/GoldenFiles/code.in.html | 287 +++++++++ .../testdata/GoldenFiles/code.out.md | 355 +++++++++++ .../testdata/GoldenFiles/heading.in.html | 149 +++++ .../testdata/GoldenFiles/heading.out.md | 133 ++++ .../testdata/GoldenFiles/image.in.html | 118 ++++ .../testdata/GoldenFiles/image.out.md | 95 +++ .../testdata/GoldenFiles/link.in.html | 267 +++++++++ .../testdata/GoldenFiles/link.out.md | 242 ++++++++ .../testdata/GoldenFiles/list.in.html | 191 ++++++ .../testdata/GoldenFiles/list.out.md | 153 +++++ .../testdata/GoldenFiles/metadata.in.html | 55 ++ .../testdata/GoldenFiles/metadata.out.md | 29 + plugin/commonmark/validation.go | 99 +++ plugin/commonmark/validation_test.go | 88 +++ plugin/strikethrough/strikethrough.go | 103 ++++ plugin/strikethrough/strikethrough_test.go | 95 +++ plugin/strikethrough/testdata/.gitattributes | 4 + .../GoldenFiles/strikethrough.in.html | 4 + .../testdata/GoldenFiles/strikethrough.out.md | 5 + 183 files changed, 13589 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/go.yml create mode 100644 .github/workflows/release.yml create mode 100644 .gitignore create mode 100644 .goreleaser.yaml create mode 100644 CONTRIBUTING.md create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 cli/cmd/cli_run.go create mode 100644 cli/cmd/cmd_convert.go create mode 100644 cli/cmd/cmd_help.go create mode 100644 
cli/cmd/cmd_version.go create mode 100644 cli/cmd/errors.go create mode 100644 cli/cmd/exec.go create mode 100644 cli/cmd/exec_test.go create mode 100644 cli/cmd/flags.go create mode 100644 cli/cmd/flags_categorize.go create mode 100644 cli/cmd/flags_test.go create mode 100644 cli/cmd/print.go create mode 100644 cli/cmd/testdata/.gitattributes create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_html/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_html/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_version/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[argument_unknown]_version/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_default/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_default/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stdout.golden create mode 
100644 cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_help_pipe/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_help_pipe/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_help_terminal/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_help_terminal/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_no_content/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_no_content/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_version_pipe/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_version_pipe/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_version_terminal/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[general]_version_terminal/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_invalid_value/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_invalid_value/stdout.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_no_value/stderr.golden create mode 100644 cli/cmd/testdata/TestExecute/[validation]_no_value/stdout.golden create mode 100644 cli/cmd/util_pipe.go create mode 100644 cli/main.go create mode 100644 collapse/collapse.go create mode 100644 collapse/collapse_test.go create mode 100644 collapse/whitespace.go create mode 100644 collapse/whitespace_test.go create mode 100644 convert.go create mode 100644 convert_test.go create mode 100644 converter/base.go create mode 100644 converter/base_test.go create mode 100644 converter/convert.go create mode 100644 
converter/convert_test.go create mode 100644 converter/converter.go create mode 100644 converter/ctx.go create mode 100644 converter/ctx_test.go create mode 100644 converter/escape.go create mode 100644 converter/escape_test.go create mode 100644 converter/keep_remove.go create mode 100644 converter/plugin.go create mode 100644 converter/prioritized.go create mode 100644 converter/prioritized_test.go create mode 100644 converter/register.go create mode 100644 converter/render.go create mode 100644 converter/status.go create mode 100644 converter/url.go create mode 100644 converter/url_test.go create mode 100644 examples/basics/main.go create mode 100644 examples/options/main.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/domutils/add_space.go create mode 100644 internal/domutils/add_space_test.go create mode 100644 internal/domutils/adjacent.go create mode 100644 internal/domutils/adjacent_test.go create mode 100644 internal/domutils/alternatives.go create mode 100644 internal/domutils/alternatives_test.go create mode 100644 internal/domutils/domutils.go create mode 100644 internal/domutils/empty_code.go create mode 100644 internal/domutils/empty_code_test.go create mode 100644 internal/domutils/list_end_comment.go create mode 100644 internal/domutils/redundant.go create mode 100644 internal/domutils/redundant_test.go create mode 100644 internal/domutils/span.go create mode 100644 internal/domutils/span_test.go create mode 100644 internal/domutils/swap.go create mode 100644 internal/domutils/swap_test.go create mode 100644 internal/escape/elem_backslash.go create mode 100644 internal/escape/elem_code.go create mode 100644 internal/escape/elem_code_test.go create mode 100644 internal/escape/elem_divider.go create mode 100644 internal/escape/elem_divider_test.go create mode 100644 internal/escape/elem_header.go create mode 100644 internal/escape/elem_header_test.go create mode 100644 internal/escape/elem_image.go create mode 100644 
internal/escape/elem_image_test.go create mode 100644 internal/escape/elem_italic_bold.go create mode 100644 internal/escape/elem_italic_bold_test.go create mode 100644 internal/escape/elem_list.go create mode 100644 internal/escape/elem_list_test.go create mode 100644 internal/escape/elem_quote.go create mode 100644 internal/escape/elem_quote_test.go create mode 100644 internal/escape/replacer.go create mode 100644 internal/escape/util.go create mode 100644 internal/escape/util_test.go create mode 100644 internal/tester/dom.go create mode 100644 internal/tester/dom_representation.go create mode 100644 internal/tester/goldenfiles.go create mode 100644 internal/tester/round_trip.go create mode 100644 internal/textutils/codefence.go create mode 100644 internal/textutils/codefence_test.go create mode 100644 internal/textutils/collapse_code.go create mode 100644 internal/textutils/collapse_code_test.go create mode 100644 internal/textutils/consecutive_newlines.go create mode 100644 internal/textutils/consecutive_newlines_test.go create mode 100644 internal/textutils/delimiter.go create mode 100644 internal/textutils/delimiter_test.go create mode 100644 internal/textutils/escape_multiline.go create mode 100644 internal/textutils/escape_multiline_test.go create mode 100644 internal/textutils/prefix_lines.go create mode 100644 internal/textutils/prefix_lines_test.go create mode 100644 internal/textutils/quote.go create mode 100644 internal/textutils/quote_test.go create mode 100644 internal/textutils/surrounding_spaces.go create mode 100644 internal/textutils/surrounding_spaces_test.go create mode 100644 marker/marker.go create mode 100644 marker/marker_test.go create mode 100644 plugin/commonmark/commonmark.go create mode 100644 plugin/commonmark/commonmark_test.go create mode 100644 plugin/commonmark/handle_pre_render.go create mode 100644 plugin/commonmark/handle_render.go create mode 100644 plugin/commonmark/options.go create mode 100644 
plugin/commonmark/render_blockquote.go create mode 100644 plugin/commonmark/render_bold_italic.go create mode 100644 plugin/commonmark/render_bold_italic_test.go create mode 100644 plugin/commonmark/render_break.go create mode 100644 plugin/commonmark/render_code.go create mode 100644 plugin/commonmark/render_comment.go create mode 100644 plugin/commonmark/render_divider.go create mode 100644 plugin/commonmark/render_heading.go create mode 100644 plugin/commonmark/render_image.go create mode 100644 plugin/commonmark/render_link.go create mode 100644 plugin/commonmark/render_list.go create mode 100644 plugin/commonmark/testdata/.gitattributes create mode 100644 plugin/commonmark/testdata/GoldenFiles/blockquote.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/blockquote.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/bold.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/bold.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/code.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/code.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/heading.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/heading.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/image.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/image.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/link.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/link.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/list.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/list.out.md create mode 100644 plugin/commonmark/testdata/GoldenFiles/metadata.in.html create mode 100644 plugin/commonmark/testdata/GoldenFiles/metadata.out.md create mode 100644 plugin/commonmark/validation.go create mode 100644 plugin/commonmark/validation_test.go create mode 100644 plugin/strikethrough/strikethrough.go create mode 100644 
plugin/strikethrough/strikethrough_test.go create mode 100644 plugin/strikethrough/testdata/.gitattributes create mode 100644 plugin/strikethrough/testdata/GoldenFiles/strikethrough.in.html create mode 100644 plugin/strikethrough/testdata/GoldenFiles/strikethrough.out.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..ef7d38f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,30 @@ +--- +name: Bug report +about: Create a report to help us improve +title: "\U0001F41B Bug" +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**HTML Input** +```html +

Title

+``` + + +**Generated Markdown** +````markdown +# Title +```` + +**Expected Markdown** +````markdown +# Title!!! +```` + +**Additional context** +Add any other context about the problem here. For example, if you changed the default options or used a plugin. Also adding the version from the `go.mod` is helpful. diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..0c13a07 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,9 @@ +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "gomod" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..fbb64b7 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,58 @@ +name: Go + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + + # Test the latest go version + # and upload the test coverage. + test_latest: + name: Go latest stable + + runs-on: ubuntu-latest + steps: + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: 'stable' + check-latest: true + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Build + run: go build -v . + + - name: Test + run: go test ./... -v -race -coverprofile=coverage.txt -covermode=atomic + + # - uses: codecov/codecov-action@v4 + # with: + # files: ./coverage.txt + # token: ${{ secrets.CODECOV_TOKEN }} + + # Test the latest three golang versions + # on different operating systems. 
+ test_versions: + strategy: + matrix: + go: ['1.22'] + os: [ubuntu-latest, macos-latest, windows-latest] + name: Go ${{ matrix.go }} on ${{ matrix.os }} + + runs-on: ${{ matrix.os }} + steps: + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Test + run: go test ./... -v -race -cover \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c2941be --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,36 @@ +name: goreleaser + +on: + push: + tags: + - '*' + +permissions: + contents: write + +jobs: + goreleaser: + runs-on: ubuntu-latest + steps: + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: 'stable' + check-latest: true + + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run GoReleaser + uses: goreleaser/goreleaser-action@v6 + with: + distribution: goreleaser + version: 'latest' + args: release --clean + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9a3b10d --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ + +# - - - - - General - - - - - # + +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +.DS_Store + + + +# - - - - - Project Specific - - - - - # + +NOTES.md +.tmp + + +dist/ diff --git a/.goreleaser.yaml b/.goreleaser.yaml new file mode 100644 index 0000000..29eded3 --- /dev/null +++ b/.goreleaser.yaml @@ -0,0 +1,46 @@ + +# The lines below are called `modelines`. See `:help modeline` +# Feel free to remove those if you don't want/need to use them. 
+# yaml-language-server: $schema=https://goreleaser.com/static/schema.json +# vim: set ts=2 sw=2 tw=0 fo=cnqoj + +version: 2 + +before: + hooks: + - go mod tidy + +builds: + - env: + - CGO_ENABLED=0 + goos: + - linux + - windows + - darwin + + # Note: We only use goreleaser for the CLI, + # so we have to go into the "cli" directory. + dir: cli + binary: html2markdown + +archives: + - format: tar.gz + # this name template makes the OS and Arch compatible with the results of `uname`. + name_template: >- + {{ .ProjectName }}_ + {{- title .Os }}_ + {{- if eq .Arch "amd64" }}x86_64 + {{- else if eq .Arch "386" }}i386 + {{- else }}{{ .Arch }}{{ end }} + {{- if .Arm }}v{{ .Arm }}{{ end }} + # use zip for windows archives + format_overrides: + - goos: windows + format: zip + +changelog: + sort: asc + filters: + exclude: + - "^docs:" + - "^test:" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md new file mode 100644 index 0000000..142f95b --- /dev/null +++ b/README.md @@ -0,0 +1,120 @@ +# html-to-markdown + +> [!WARNING] +> This is an **early experimental version** of the library. +> +> We encourage testing and bug reporting. However, please note: +> +> - Not production-ready +> - Default options are well-tested, but custom configurations have limited coverage +> - Functionality is currently restricted +> - Focus is on stabilization and core features +> - No compatibility guarantee +> - Only use `htmltomarkdown.ConvertString()` and `htmltomarkdown.ConvertNode()` from the root package. They are _unlikely_ to change. +> - Other functions and nested packages are _very likely_ to change. 
+ +--- + +## Golang Library + +```go +package main + +import ( + "fmt" + "log" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" +) + +func main() { + input := `Bold Text` + + markdown, err := htmltomarkdown.ConvertString(input) + if err != nil { + log.Fatal(err) + } + fmt.Println(markdown) + // Output: **Bold Text** +} +``` + +- 🧑‍💻 [Example code, basics](/examples/basics/main.go) + +The function `htmltomarkdown.ConvertString()` is just a small wrapper around `converter.NewConverter()` and `commonmark.NewCommonmarkPlugin()`. If you want more control, use the following: + +```go +package main + +import ( + "fmt" + "log" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func main() { + input := `Bold Text` + + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin( + commonmark.WithStrongDelimiter("__"), + // ...additional configurations for the plugin + ), + ), + ) + + markdown, err := conv.ConvertString(input) + if err != nil { + log.Fatal(err) + } + fmt.Println(markdown) + // Output: __Bold Text__ +} +``` + +- 🧑‍💻 [Example code, options](/examples/options/main.go) + +> [!NOTE] +> If you use `NewConverter` directly make sure to also **register the commonmark plugin**. + +--- + +--- + +## CLI - Using it on the command line + +Using the Golang library provides the most customization, while the CLI is the simplest way to get started. + +### Installation + +Download the pre-compiled binaries from the [releases page](https://github.com/JohannesKaufmann/html-to-markdown/releases) and copy them to the desired location. + +```bash +html2markdown --version +``` + +> [!NOTE] +> Make sure that `--version` prints `2.X.X` as there is a different CLI for V2 of the converter. 
+ +## Usage + +```bash +$ echo "important" | html2markdown + +**important** +``` + +```text +$ curl --no-progress-meter http://example.com | html2markdown + +# Example Domain + +This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. + +[More information...](https://www.iana.org/domains/example) +``` + +_(The cli does not support every option yet. Over time more customization will be added)_ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..0a028f6 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,6 @@ +# Security Policy + +## Reporting a Vulnerability + +Please report (suspected) security vulnerabilities to johannes@joina.de with the subject _"Security html-to-markdown"_ and you will receive a response within 48 hours. + diff --git a/cli/cmd/cli_run.go b/cli/cmd/cli_run.go new file mode 100644 index 0000000..4963c26 --- /dev/null +++ b/cli/cmd/cli_run.go @@ -0,0 +1,30 @@ +package cmd + +func Run( + stdin ReadWriterWithStat, + stdout ReadWriterWithStat, + stderr ReadWriterWithStat, + + osArgs []string, + + release Release, +) { + + cli := CLI{ + Stdin: stdin, + Stdout: stdout, + Stderr: stderr, + + OsArgs: osArgs, + + Release: release, + } + + // - - - - - init - - - - - // + if err := cli.Init(); err != nil { + panic(err) + } + + // - - - - - exec - - - - - // + cli.Execute() +} diff --git a/cli/cmd/cmd_convert.go b/cli/cmd/cmd_convert.go new file mode 100644 index 0000000..fc3891d --- /dev/null +++ b/cli/cmd/cmd_convert.go @@ -0,0 +1,48 @@ +package cmd + +import ( + "bytes" + "fmt" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func overrideValidationError(e *commonmark.ValidateConfigError) error { + + // TODO: Maybe OptionFunc should already validate and return an error? 
+ // Then it would be easier to override the Key since we have once + // place to assemble the []OptionFunc and directly treat the errors... + + switch e.Key { + case "StrongDelimiter": + e.Key = "opt-strong-delimiter" + } + + e.KeyWithValue = fmt.Sprintf("--%s=%q", e.Key, e.Value) + return e +} +func (cli *CLI) convert(input []byte) ([]error, error) { + + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin( + commonmark.WithStrongDelimiter(cli.config.strongDelimiter), + ), + ), + ) + + r := bytes.NewReader(input) + markdown, err := conv.ConvertReader(r) + if err != nil { + e, ok := err.(*commonmark.ValidateConfigError) + if ok { + return nil, overrideValidationError(e) + } + + return nil, err + } + + fmt.Fprintln(cli.Stdout, string(markdown)) + return nil, nil +} diff --git a/cli/cmd/cmd_help.go b/cli/cmd/cmd_help.go new file mode 100644 index 0000000..4957535 --- /dev/null +++ b/cli/cmd/cmd_help.go @@ -0,0 +1,108 @@ +package cmd + +import ( + "flag" + "fmt" + "io" + "sort" + "strings" + "text/template" +) + +var usageTemplate = ` +# html2markdown - convert html to markdown [version {{ .Version }}] + +Convert HTML to Markdown. Even works with entire websites! + +## Basics + +By default the "Commonmark" Plugin will be enabled. You can customize the options, +for example changing the appearance of bold with --opt-strong-delimiter="__" + +Other Plugins can also be enabled. For example "GitHub Flavored Markdown" (GFM) +extends Commonmark with more features. + + +## Escaping + +Some characters have a special meaning in markdown. The library escapes these — if necessary. +See the documentation for more info. + + +## Security + +Once you convert this markdown *back* to HTML you need to be careful of malicious content. +Use a HTML sanitizer before displaying the HTML in the browser! 
+ + +## Examples + + echo "important" | html2markdown + + curl --no-progress-meter http://example.com | html2markdown + + +## Flags + + -v, --version + show the version of html2markdown and exit + + --help + +{{ range .Flags }} + --{{ .Name }}{{ with .Usage }} +{{ . | indent 8 }}{{ end }} +{{ end }} + + +For more information visit the documentation: +https://github.com/Johanneskaufmann/html-to-markdown + +` + +var templateFuncs = template.FuncMap{ + "indent": func(spaces int, v string) string { + pad := strings.Repeat(" ", spaces) + return pad + strings.Replace(v, "\n", "\n"+pad, -1) + }, +} + +func tmpl(w io.Writer, text string, data interface{}) error { + t := template.New("usage") + t.Funcs(templateFuncs) + + _, err := t.Parse(text) + if err != nil { + return err + } + return t.Execute(w, data) +} + +func (cli *CLI) initUsageText() error { + var flags []*flag.Flag + cli.flags.VisitAll(func(f *flag.Flag) { + if f.Name == "v" || f.Name == "version" { + // We manually mention these in the usage + return + } + flags = append(flags, f) + }) + sort.Slice(flags, func(i, j int) bool { + return flags[i].Name < flags[j].Name + }) + + data := map[string]any{ + "Version": cli.Release.Version, + "Flags": flags, + } + err := tmpl(&cli.usageText, usageTemplate, data) + if err != nil { + return err + } + + return nil +} + +func (cli CLI) printUsage() { + fmt.Fprint(cli.Stdout, cli.usageText.String()) +} diff --git a/cli/cmd/cmd_version.go b/cli/cmd/cmd_version.go new file mode 100644 index 0000000..bd292c7 --- /dev/null +++ b/cli/cmd/cmd_version.go @@ -0,0 +1,11 @@ +package cmd + +import "fmt" + +func (cli CLI) printVersion() { + fmt.Fprintf(cli.Stdout, "%s\n\n", projectBinary) + + fmt.Fprintf(cli.Stdout, "GitVersion: %s\n", cli.Release.Version) + fmt.Fprintf(cli.Stdout, "GitCommit: %s\n", cli.Release.Commit) + fmt.Fprintf(cli.Stdout, "BuildDate: %s\n", cli.Release.Date) +} diff --git a/cli/cmd/errors.go b/cli/cmd/errors.go new file mode 100644 index 0000000..3b13481 --- 
/dev/null +++ b/cli/cmd/errors.go @@ -0,0 +1,66 @@ +package cmd + +import ( + "fmt" + "io" + + "github.com/muesli/termenv" +) + +type CLIError struct { + cause error + printers []Printer +} + +func extractCLIError(err error) (CLIError, bool) { + if cliErr, ok := err.(*CLIError); ok { + return *cliErr, true + } + + return CLIError{ + cause: err, + }, false +} + +func NewCLIError(cause error, printers ...Printer) error { + return &CLIError{ + cause: cause, + printers: printers, + } +} +func (e CLIError) Error() string { + return e.cause.Error() +} +func (e CLIError) PrintDetails(w io.Writer) { + errPrinter := ColoredBox("error", e.cause.Error()) + + // Prepend the error printer + e.printers = append([]Printer{errPrinter}, e.printers...) + + for _, printer := range e.printers { + w.Write([]byte("\n")) + printer.Print(w) + } + w.Write([]byte("\n")) +} + +func (cli CLI) PrintErr(err error) { + if err == nil { + return + } + + e, _ := extractCLIError(err) + e.PrintDetails(cli.Stderr) +} +func (cli CLI) PrintWarn(err error) { + if err == nil { + return + } + + output := termenv.NewOutput(cli.Stderr) + + prefix := output.String("warning:").Background(termenv.ANSIYellow).Foreground(termenv.ANSIBrightWhite).String() + message := output.String(err.Error()).Foreground(termenv.ANSIYellow).String() + + fmt.Fprintf(cli.Stderr, "\n%s %s\n\n", prefix, message) +} diff --git a/cli/cmd/exec.go b/cli/cmd/exec.go new file mode 100644 index 0000000..1d65eaa --- /dev/null +++ b/cli/cmd/exec.go @@ -0,0 +1,147 @@ +package cmd + +import ( + "bytes" + "flag" + "fmt" + "io" + "os" + "strings" +) + +var ( + projectBinary = "html2markdown" +) + +// OsExiter is the function used when the app exits. If not set defaults to os.Exit. +var OsExiter = os.Exit + +// - - - - - - - - - - - - - // + +type Config struct { + // args are the positional (non-flag) command-line arguments. 
+ args []string + + version bool + + // - - - - // + + strongDelimiter string + + plugins []string +} + +// Release holds the information (from the 3 ldflags) that goreleaser sets. +type Release struct { + // Current Git tag (the v prefix is stripped) + Version string + + // Current git commit SHA + Commit string + + // Date in the RFC3339 format + Date string +} +type CLI struct { + Stdin ReadWriterWithStat + Stdout ReadWriterWithStat + Stderr ReadWriterWithStat + + OsArgs []string + + Release Release + + isStdinPipe bool + isStdoutPipe bool + isStderrPipe bool + + flags *flag.FlagSet + config Config + + usageText bytes.Buffer +} + +func (cli *CLI) Init() error { + var err error + cli.isStdinPipe, err = isPipe(cli.Stdin) + if err != nil { + return fmt.Errorf("error while checking stdin for is pipe: %w", err) + } + cli.isStdoutPipe, err = isPipe(cli.Stdout) + if err != nil { + return fmt.Errorf("error while checking stdout for is pipe: %w", err) + } + cli.isStderrPipe, err = isPipe(cli.Stderr) + if err != nil { + return fmt.Errorf("error while checking stderr for is pipe: %w", err) + } + + cli.initFlags(cli.OsArgs[0]) + + err = cli.initUsageText() + if err != nil { + return fmt.Errorf("error while initializing the usage text: %w", err) + } + + return nil +} +func (cli *CLI) Execute() { + + warnings, err := cli.run() + + for _, warning := range warnings { + cli.PrintWarn(warning) + } + + if err == flag.ErrHelp { + cli.printUsage() + + OsExiter(0) + return + } else if err != nil { + cli.PrintErr(err) + + OsExiter(1) // General Error + return + } else { + OsExiter(0) + return + } +} + +func (cli *CLI) run() ([]error, error) { + + err := cli.parseFlags(cli.OsArgs[1:]) + if err != nil { + return nil, err + } + + if len(cli.config.args) != 0 { + + return nil, NewCLIError( + fmt.Errorf("unknown arguments: %s", strings.Join(cli.config.args, " ")), + Paragraph("Here is how you can use the CLI:"), + CodeBlock(`echo "important" | html2markdown`), + ) + } + + if 
cli.config.version { + cli.printVersion() + return nil, nil + } + + if !cli.isStdinPipe { + return nil, NewCLIError( + fmt.Errorf("the html input should be piped into the cli"), + Paragraph("Here is how you can use the CLI:"), + CodeBlock(`echo "important" | html2markdown`), + ) + } + + html, err := io.ReadAll(cli.Stdin) + if err != nil { + return nil, err + } + + return cli.convert(html) +} diff --git a/cli/cmd/exec_test.go b/cli/cmd/exec_test.go new file mode 100644 index 0000000..32d3328 --- /dev/null +++ b/cli/cmd/exec_test.go @@ -0,0 +1,319 @@ +package cmd + +import ( + "bytes" + "fmt" + "io/fs" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/sebdah/goldie/v2" +) + +func init() { + OsExiter = func(code int) { + fmt.Println("OS_EXITER_CALLED", code) + } +} + +type MockFileInfo struct { + mode os.FileMode +} + +func (info MockFileInfo) Name() string { return "" } +func (info MockFileInfo) Size() int64 { return 1 } +func (info MockFileInfo) Mode() os.FileMode { return info.mode } +func (info MockFileInfo) ModTime() time.Time { return time.Now() } +func (info MockFileInfo) IsDir() bool { return false } +func (info MockFileInfo) Sys() interface{} { return nil } + +type FakeFile struct { + bytes.Buffer + mode os.FileMode +} + +func (f FakeFile) Stat() (fs.FileInfo, error) { + return &MockFileInfo{mode: f.mode}, nil +} + +const ( + modePipe = fs.FileMode(33554864) // "prw-rw----" + modeTerminal = fs.FileMode(69206416) // "Dcrw--w----" +) + +type CLIInput struct { + modeStdin os.FileMode + modeStdout os.FileMode + modeStderr os.FileMode + + inputStdin []byte + inputArgs []string +} + +func cliTester(t *testing.T, input CLIInput) { + if input.modeStdin == modeTerminal && input.inputStdin != nil { + t.Fatal("invalid test: cannot provide stdin without pipe mode") + } + + stdin := &FakeFile{mode: input.modeStdin} + stdout := &FakeFile{mode: input.modeStdout} + stderr := &FakeFile{mode: input.modeStderr} + + if input.inputStdin != nil { + 
stdin.Write(input.inputStdin) + } + + release := Release{ + Version: "2.3.4-test", + Commit: "ca82a6dff817ec66f44342007202690a93763949", + Date: "2024-08-18T13:03:43Z", + } + + Run(stdin, stdout, stderr, input.inputArgs, release) + + if len(stdout.Bytes()) == 0 && len(stderr.Bytes()) == 0 { + t.Fatal("neither stdout nor stderr have any content") + } + + g := goldie.New(t) + g.Assert(t, filepath.Join(t.Name(), "stdout"), stdout.Bytes()) + g.Assert(t, filepath.Join(t.Name(), "stderr"), stderr.Bytes()) +} + +func TestExecute(t *testing.T) { + testCases := []struct { + desc string + input CLIInput + }{ + + // - - - - - flag: version / help - - - - - // + { + desc: "[general] version terminal", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", "--version"}, + }, + }, + { + desc: "[general] version pipe", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputArgs: []string{"html2markdown", "--version"}, + }, + }, + { + desc: "[general] help terminal", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", "--help"}, + }, + }, + { + desc: "[general] help pipe", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputArgs: []string{"html2markdown", "--help"}, + }, + }, + + // - - - - - no content - - - - - // + { + desc: "[general] no content", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputStdin: nil, + inputArgs: []string{"html2markdown"}, + }, + }, + + // - - - - - arguments - - - - - // + { + desc: "[argument unknown] version", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", `version`}, + }, + }, + { + desc: "[argument unknown] html", + 
+ input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", `"text"`}, + }, + }, + { + desc: "[argument unknown] long string", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", strings.Repeat("12456789", 40)}, + }, + }, + { + desc: "[argument unknown] list of files", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + // The ** was treated as a file glob + inputArgs: []string{"html2markdown", "--opt-strong-delimiter", "CONTRIBUTING.md", "README.md", "SECURITY.md", "a.html", "b.html", "c.html", "d.html", "e.html", "f.html"}, + }, + }, + + // - - - - - flags - - - - - // + { + desc: "[flag unknown] with pipe", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputArgs: []string{"html2markdown", "--this-does-not-exist"}, + }, + }, + { + desc: "[flag unknown] with terminal", + + input: CLIInput{ + modeStdin: modeTerminal, + modeStdout: modeTerminal, + modeStderr: modeTerminal, + + inputArgs: []string{"html2markdown", "--this-does-not-exist"}, + }, + }, + + { + desc: "[flag misspelled] underscore", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputStdin: []byte("text"), + // Someone accidentally used underscores instead of dashes + inputArgs: []string{"html2markdown", "--opt_strong_delimiter="}, + }, + }, + + // - - - - - converting - - - - - // + { + desc: "[convert] strong default", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputStdin: []byte("text"), + inputArgs: []string{"html2markdown"}, + }, + }, + { + desc: "[convert] strong equal underscore", + + input: CLIInput{ + modeStdin: modePipe, + modeStdout: modePipe, + modeStderr: modePipe, + + inputStdin: []byte("text"), + // Note: We dont 
// FlagString is a string that can be scanned by the fmt package.
type FlagString string

// Scan reads the next token that consists only of letters and dashes
// (leading whitespace is skipped) and stores it in a.
func (a *FlagString) Scan(state fmt.ScanState, verb rune) error {
	isFlagRune := func(r rune) bool {
		return r == '-' || unicode.IsLetter(r)
	}

	token, err := state.Token(true, isFlagRune)
	if err != nil {
		return err
	}

	*a = FlagString(token)
	return nil
}

// flagStringSlice returns a function that splits its input at every comma
// and appends the trimmed, non-empty parts to elems.
//
// The returned function can be called multiple times (the values accumulate)
// and it always returns nil.
func flagStringSlice(elems *[]string) func(string) error {
	return func(raw string) error {
		for _, part := range strings.Split(raw, ",") {
			if trimmed := strings.TrimSpace(part); trimmed != "" {
				*elems = append(*elems, trimmed)
			}
		}
		return nil
	}
}
+ cli.flags.Func("plugins", "which plugins should be enabled?", flagStringSlice(&cli.config.plugins)) +} + +func (cli *CLI) parseFlags(args []string) error { + err := cli.flags.Parse(args) + if err != nil { + return cli.categorizeFlagError(err) + } + + cli.config.args = cli.flags.Args() + + return nil +} diff --git a/cli/cmd/flags_categorize.go b/cli/cmd/flags_categorize.go new file mode 100644 index 0000000..e49e3d1 --- /dev/null +++ b/cli/cmd/flags_categorize.go @@ -0,0 +1,68 @@ +package cmd + +import ( + "flag" + "fmt" + "strings" + "unicode/utf8" + + "github.com/agnivade/levenshtein" +) + +const flagProvidedButNotDefinedErr = "flag provided but not defined: -" + +func formatFlag(name string) string { + if len(name) == 1 { + return "-" + name + } + return "--" + name +} +func (cli *CLI) getAlternativeFlag(unknownFlag string) string { + var closestDistance int = 10000 + var closestFlag string + + cli.flags.VisitAll(func(f *flag.Flag) { + + distance := levenshtein.ComputeDistance(f.Name, unknownFlag) + + if distance < closestDistance { + closestDistance = distance + closestFlag = f.Name + } + }) + + fmt.Printf("%q <> %q -> %d \n", unknownFlag, closestFlag, closestDistance) + + if closestDistance >= utf8.RuneCountInString(unknownFlag) { + return "" + } + if closestDistance > 4 { + return "" + } + return closestFlag +} +func (cli *CLI) categorizeFlagError(err error) error { + if err == nil { + return nil + } + + message := err.Error() + + if strings.HasPrefix(message, flagProvidedButNotDefinedErr) { + flagName := strings.TrimPrefix(message, flagProvidedButNotDefinedErr) + + err := fmt.Errorf("unknown flag: %s", formatFlag(flagName)) + + alternative := cli.getAlternativeFlag(flagName) + if alternative == "" { + return NewCLIError(err) + } + + return NewCLIError( + err, + Paragraph(fmt.Sprintf("Did you mean %s instead?", formatFlag(alternative))), + ) + } + + return err +} diff --git a/cli/cmd/flags_test.go b/cli/cmd/flags_test.go new file mode 100644 index 
0000000..65bf230 --- /dev/null +++ b/cli/cmd/flags_test.go @@ -0,0 +1,47 @@ +package cmd + +import ( + "reflect" + "testing" +) + +func TestFlagStringSlice(t *testing.T) { + testCases := []struct { + desc string + inputs []string + expected []string + }{ + { + desc: "simple flag", + inputs: []string{"a"}, + expected: []string{"a"}, + }, + { + desc: "two flags", + inputs: []string{"a,b", "c"}, + expected: []string{"a", "b", "c"}, + }, + { + desc: "with seperator", + inputs: []string{"a,", ",b"}, + expected: []string{"a", "b"}, + }, + { + desc: "with spaces", + inputs: []string{"a, ,b", " ,c"}, + expected: []string{"a", "b", "c"}, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + var result []string + for _, input := range tC.inputs { + flagStringSlice(&result)(input) + } + + if !reflect.DeepEqual(result, tC.expected) { + t.Errorf("expected %v but got %v", tC.expected, result) + } + }) + } +} diff --git a/cli/cmd/print.go b/cli/cmd/print.go new file mode 100644 index 0000000..189e62f --- /dev/null +++ b/cli/cmd/print.go @@ -0,0 +1,59 @@ +package cmd + +import ( + "fmt" + "io" + + "github.com/muesli/termenv" +) + +type Printer interface { + Print(w io.Writer) +} + +// - - - - - - - // + +type coloredBox struct { + prefix string + text string +} + +func ColoredBox(prefix string, text string) Printer { + return &coloredBox{prefix, text} +} + +func (p coloredBox) Print(w io.Writer) { + output := termenv.NewOutput(w) + + prefix := output.String(p.prefix + ":").Background(termenv.ANSIRed).Foreground(termenv.ANSIBrightWhite).String() + message := output.String(p.text).Foreground(termenv.ANSIRed).String() + + fmt.Fprintf(w, "%s %s\n", prefix, message) +} + +// - - - - - - - // + +type paragraph struct { + text string +} + +func Paragraph(text string) Printer { + return ¶graph{text} +} +func (p paragraph) Print(w io.Writer) { + fmt.Fprintln(w, p.text) +} + +// - - - - - - - // + +type codeBlock struct { + code string +} + +func CodeBlock(code 
string) Printer { + return &codeBlock{code} +} +func (cb codeBlock) Print(w io.Writer) { + // TODO: what about indenting multiline? + fmt.Fprintf(w, " %s\n", cb.code) +} diff --git a/cli/cmd/testdata/.gitattributes b/cli/cmd/testdata/.gitattributes new file mode 100644 index 0000000..a8d2daa --- /dev/null +++ b/cli/cmd/testdata/.gitattributes @@ -0,0 +1,4 @@ + +# Leave the files untouched. Otherwise they might be +# changed when cloning the repo on Windows... +* -text diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_html/stderr.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_html/stderr.golden new file mode 100644 index 0000000..48312bb --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[argument_unknown]_html/stderr.golden @@ -0,0 +1,7 @@ + +error: unknown arguments: "text" + +Here is how you can use the CLI: + + echo "important" | html2markdown + diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_html/stdout.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_html/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stderr.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stderr.golden new file mode 100644 index 0000000..9b86664 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stderr.golden @@ -0,0 +1,7 @@ + +error: unknown arguments: README.md SECURITY.md a.html b.html c.html d.html e.html f.html + +Here is how you can use the CLI: + + echo "important" | html2markdown + diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stdout.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_list_of_files/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stderr.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stderr.golden new file mode 100644 index 0000000..245fc1c --- /dev/null +++ 
b/cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stderr.golden @@ -0,0 +1,7 @@ + +error: unknown arguments: 12456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789124567891245678912456789 + +Here is how you can use the CLI: + + echo "important" | html2markdown + diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stdout.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_long_string/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_version/stderr.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_version/stderr.golden new file mode 100644 index 0000000..8070be6 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[argument_unknown]_version/stderr.golden @@ -0,0 +1,7 @@ + +error: unknown arguments: version + +Here is how you can use the CLI: + + echo "important" | html2markdown + diff --git a/cli/cmd/testdata/TestExecute/[argument_unknown]_version/stdout.golden b/cli/cmd/testdata/TestExecute/[argument_unknown]_version/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_default/stderr.golden b/cli/cmd/testdata/TestExecute/[convert]_strong_default/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_default/stdout.golden b/cli/cmd/testdata/TestExecute/[convert]_strong_default/stdout.golden new file mode 100644 index 0000000..e5180c8 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[convert]_strong_default/stdout.golden @@ -0,0 +1 @@ +**text** diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stderr.golden 
b/cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stdout.golden b/cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stdout.golden new file mode 100644 index 0000000..25a055e --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[convert]_strong_equal_underscore/stdout.golden @@ -0,0 +1 @@ +__text__ diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stderr.golden b/cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stdout.golden b/cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stdout.golden new file mode 100644 index 0000000..25a055e --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[convert]_strong_space_underscore/stdout.golden @@ -0,0 +1 @@ +__text__ diff --git a/cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stderr.golden b/cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stderr.golden new file mode 100644 index 0000000..664e5bb --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stderr.golden @@ -0,0 +1,5 @@ + +error: unknown flag: --opt_strong_delimiter + +Did you mean --opt-strong-delimiter instead? 
+ diff --git a/cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stdout.golden b/cli/cmd/testdata/TestExecute/[flag_misspelled]_underscore/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stderr.golden b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stderr.golden new file mode 100644 index 0000000..3043673 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stderr.golden @@ -0,0 +1,3 @@ + +error: unknown flag: --this-does-not-exist + diff --git a/cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stdout.golden b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_pipe/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stderr.golden b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stderr.golden new file mode 100644 index 0000000..3043673 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stderr.golden @@ -0,0 +1,3 @@ + +error: unknown flag: --this-does-not-exist + diff --git a/cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stdout.golden b/cli/cmd/testdata/TestExecute/[flag_unknown]_with_terminal/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_help_pipe/stderr.golden b/cli/cmd/testdata/TestExecute/[general]_help_pipe/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_help_pipe/stdout.golden b/cli/cmd/testdata/TestExecute/[general]_help_pipe/stdout.golden new file mode 100644 index 0000000..83daf0a --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[general]_help_pipe/stdout.golden @@ -0,0 +1,53 @@ + +# html2markdown - convert html to markdown [version 2.3.4-test] + +Convert HTML to Markdown. Even works with entire websites! + +## Basics + +By default the "Commonmark" Plugin will be enabled. 
You can customize the options, +for example changing the appearance of bold with --opt-strong-delimiter="__" + +Other Plugins can also be enabled. For example "GitHub Flavored Markdown" (GFM) +extends Commonmark with more features. + + +## Escaping + +Some characters have a special meaning in markdown. The library escapes these — if necessary. +See the documentation for more info. + + +## Security + +Once you convert this markdown *back* to HTML you need to be careful of malicious content. +Use a HTML sanitizer before displaying the HTML in the browser! + + +## Examples + + echo "important" | html2markdown + + curl --no-progress-meter http://example.com | html2markdown + + +## Flags + + -v, --version + show the version of html2markdown and exit + + --help + + + --opt-strong-delimiter + Make bold text. Should be indicated by two asterisks or two underscores? + "**" or "__" (default: "**") + + --plugins + which plugins should be enabled? + + + +For more information visit the documentation: +https://github.com/Johanneskaufmann/html-to-markdown + diff --git a/cli/cmd/testdata/TestExecute/[general]_help_terminal/stderr.golden b/cli/cmd/testdata/TestExecute/[general]_help_terminal/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_help_terminal/stdout.golden b/cli/cmd/testdata/TestExecute/[general]_help_terminal/stdout.golden new file mode 100644 index 0000000..83daf0a --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[general]_help_terminal/stdout.golden @@ -0,0 +1,53 @@ + +# html2markdown - convert html to markdown [version 2.3.4-test] + +Convert HTML to Markdown. Even works with entire websites! + +## Basics + +By default the "Commonmark" Plugin will be enabled. You can customize the options, +for example changing the appearance of bold with --opt-strong-delimiter="__" + +Other Plugins can also be enabled. For example "GitHub Flavored Markdown" (GFM) +extends Commonmark with more features. 
+ + +## Escaping + +Some characters have a special meaning in markdown. The library escapes these — if necessary. +See the documentation for more info. + + +## Security + +Once you convert this markdown *back* to HTML you need to be careful of malicious content. +Use a HTML sanitizer before displaying the HTML in the browser! + + +## Examples + + echo "important" | html2markdown + + curl --no-progress-meter http://example.com | html2markdown + + +## Flags + + -v, --version + show the version of html2markdown and exit + + --help + + + --opt-strong-delimiter + Make bold text. Should be indicated by two asterisks or two underscores? + "**" or "__" (default: "**") + + --plugins + which plugins should be enabled? + + + +For more information visit the documentation: +https://github.com/Johanneskaufmann/html-to-markdown + diff --git a/cli/cmd/testdata/TestExecute/[general]_no_content/stderr.golden b/cli/cmd/testdata/TestExecute/[general]_no_content/stderr.golden new file mode 100644 index 0000000..69d0a90 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[general]_no_content/stderr.golden @@ -0,0 +1,7 @@ + +error: the html input should be piped into the cli + +Here is how you can use the CLI: + + echo "important" | html2markdown + diff --git a/cli/cmd/testdata/TestExecute/[general]_no_content/stdout.golden b/cli/cmd/testdata/TestExecute/[general]_no_content/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_version_pipe/stderr.golden b/cli/cmd/testdata/TestExecute/[general]_version_pipe/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_version_pipe/stdout.golden b/cli/cmd/testdata/TestExecute/[general]_version_pipe/stdout.golden new file mode 100644 index 0000000..56ec1e8 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[general]_version_pipe/stdout.golden @@ -0,0 +1,5 @@ +html2markdown + +GitVersion: 2.3.4-test +GitCommit: 
ca82a6dff817ec66f44342007202690a93763949 +BuildDate: 2024-08-18T13:03:43Z diff --git a/cli/cmd/testdata/TestExecute/[general]_version_terminal/stderr.golden b/cli/cmd/testdata/TestExecute/[general]_version_terminal/stderr.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[general]_version_terminal/stdout.golden b/cli/cmd/testdata/TestExecute/[general]_version_terminal/stdout.golden new file mode 100644 index 0000000..56ec1e8 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[general]_version_terminal/stdout.golden @@ -0,0 +1,5 @@ +html2markdown + +GitVersion: 2.3.4-test +GitCommit: ca82a6dff817ec66f44342007202690a93763949 +BuildDate: 2024-08-18T13:03:43Z diff --git a/cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stderr.golden b/cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stderr.golden new file mode 100644 index 0000000..c043047 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stderr.golden @@ -0,0 +1,3 @@ + +error: invalid value for --opt-strong-delimiter="*" must be exactly 2 characters of "**" or "__" + diff --git a/cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stdout.golden b/cli/cmd/testdata/TestExecute/[validation]_discouraged_value/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git a/cli/cmd/testdata/TestExecute/[validation]_invalid_value/stderr.golden b/cli/cmd/testdata/TestExecute/[validation]_invalid_value/stderr.golden new file mode 100644 index 0000000..4cf6826 --- /dev/null +++ b/cli/cmd/testdata/TestExecute/[validation]_invalid_value/stderr.golden @@ -0,0 +1,3 @@ + +error: invalid value for --opt-strong-delimiter="1234" must be exactly 2 characters of "**" or "__" + diff --git a/cli/cmd/testdata/TestExecute/[validation]_invalid_value/stdout.golden b/cli/cmd/testdata/TestExecute/[validation]_invalid_value/stdout.golden new file mode 100644 index 0000000..e69de29 diff --git 
// ReadWriterWithStat is a stream that can be read from, written to
// and that can report information about itself.
type ReadWriterWithStat interface {
	io.ReadWriter

	Stat() (fs.FileInfo, error)
}

// isPipe reports whether f is something other than a character device.
//
// The error of the Stat call is returned unchanged
// (in which case the result is false).
func isPipe(f ReadWriterWithStat) (bool, error) {
	info, err := f.Stat()
	if err != nil {
		return false, err
	}

	// Everything without the character device bit is treated as a pipe.
	isCharDevice := info.Mode()&os.ModeCharDevice != 0

	return !isCharDevice, nil
}
+To increase performance the use of regex was replaced by custom code. + +https://github.com/wooorm/collapse-white-space +https://github.com/mixmark-io/turndown +https://github.com/JohannesKaufmann/html-to-markdown + +----------- + +MIT License + +Copyright (c) 2017 Dom Christie +Copyright (c) 2014 Luc Thevenard +Copyright (c) 2018 Johannes Kaufmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +// collapse can collapse whitespace in html elements. +// +// It is a port from the Javascript library "turndown" to Golang. +package collapse + +import ( + "strings" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +// Note: Originally in the javascript version, this just checked for "pre". 
+// I changed it, to also return true for "code" +func isPreOrCode(node *html.Node) bool { + name := dom.NodeName(node) + + return name == "pre" || name == "code" +} + +func next(prev *html.Node, current *html.Node) *html.Node { + if (prev != nil && prev.Parent == current) || isPreOrCode(current) { + if current.NextSibling != nil { + return current.NextSibling + } + + return current.Parent + } + + if current.FirstChild != nil { + return current.FirstChild + } + if current.NextSibling != nil { + return current.NextSibling + } + + return current.Parent +} + +var blockElements = []string{ + "address", "article", "aside", "audio", "blockquote", "body", "canvas", "center", "dd", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "html", "isindex", "li", "main", "menu", "nav", "noframes", "noscript", "ol", "output", "p", "pre", "section", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul", +} + +var voidElements = []string{ + // Note: Compared to the javascript implementation, I removed "source" + "area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param" /* "source, "*/, "track", "wbr", +} + +func isBlock(node *html.Node) bool { + name := dom.NodeName(node) + + for _, elem := range blockElements { + if elem == name { + return true + } + } + return false +} +func isVoid(node *html.Node) bool { + name := dom.NodeName(node) + + for _, elem := range voidElements { + if elem == name { + return true + } + } + return false +} + +func remove(node *html.Node) *html.Node { + next := node.NextSibling + if next == nil { + next = node.Parent + } + + node.Parent.RemoveChild(node) + + return next + +} + +func Collapse(element *html.Node) { + if element.FirstChild == nil || isPreOrCode(element) { + return + } + + var prevText *html.Node = nil + var keepLeadingWs = false + + var prev *html.Node = nil + var node = next(prev, element) 
+ + for node != element { + if node.Type == html.TextNode /* node.nodeType == 4 */ { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE + var text = replaceAnyWhitespaceWithSpace(node.Data) + + if (prevText == nil || strings.HasSuffix(prevText.Data, " ")) && + !keepLeadingWs && text[0] == ' ' { + text = text[1:] + } + + // `text` might be empty at this point. + if text == "" { + node = remove(node) + continue + } + + node.Data = text + + prevText = node + } else if node.Type == html.ElementNode { // Node.ELEMENT_NODE + if isBlock(node) || dom.NodeName(node) == "br" { + if prevText != nil { + prevText.Data = strings.TrimSuffix(prevText.Data, " ") + } + + prevText = nil + keepLeadingWs = false + } else if isVoid(node) || isPreOrCode(node) || node.Data == "code" { + // Avoid trimming space around non-block, non-BR void elements and inline PRE. + prevText = nil + keepLeadingWs = true + } else if prevText != nil { + // Drop protection if set previously. + keepLeadingWs = false + } + } else if node.Type == html.CommentNode { + // TODO: Is this enough to keep the comments? Does this cause other problems? + } else { + // E.g. 
DoctypeNode + + node = remove(node) + continue + } + + var nextNode = next(prev, node) + prev = node + node = nextNode + } + + if prevText != nil { + prevText.Data = strings.TrimSuffix(prevText.Data, " ") + if prevText.Data == "" { + remove(prevText) + } + } +} diff --git a/collapse/collapse_test.go b/collapse/collapse_test.go new file mode 100644 index 0000000..4998e85 --- /dev/null +++ b/collapse/collapse_test.go @@ -0,0 +1,289 @@ +package collapse + +import ( + "bytes" + "strings" + "testing" + + "golang.org/x/net/html" +) + +func getBody(doc *html.Node) *html.Node { + var body *html.Node + + var finder func(*html.Node) + finder = func(node *html.Node) { + if node.Type == html.ElementNode && node.Data == "body" { + body = node + return + } + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(doc) + + return body +} + +func TestCollapse_DocType(t *testing.T) { + // The DOCTYPE gets removed + input := `` + + doc, err := html.Parse(strings.NewReader(input)) + if err != nil { + t.Error(err) + } + + Collapse(doc) + + var buf bytes.Buffer + err = html.Render(&buf, doc) + if err != nil { + t.Error(err) + } + + expected := `` + if buf.String() != expected { + t.Errorf("expected %q but got %q", expected, buf.String()) + } +} + +func TestCollapse_NoFirstChild(t *testing.T) { + boldNode := &html.Node{ + Type: html.ElementNode, + Data: "strong", + } + + Collapse(boldNode) + + var buf bytes.Buffer + err := html.Render(&buf, boldNode) + if err != nil { + t.Error(err) + } + + expected := `` + if buf.String() != expected { + t.Errorf("expected %q but got %q", expected, buf.String()) + } +} + +func TestCollapse_StartWithCode(t *testing.T) { + textNode := &html.Node{ + Type: html.TextNode, + Data: " text ", + } + codeNode := &html.Node{ + Type: html.ElementNode, + Data: "code", + } + codeNode.AppendChild(textNode) + + Collapse(codeNode) + + var buf bytes.Buffer + err := html.Render(&buf, codeNode) + if err != nil { + 
t.Error(err) + } + + expected := ` text ` + if buf.String() != expected { + t.Errorf("expected %q but got %q", expected, buf.String()) + } +} + +func TestCollapse_TwoTextNodes(t *testing.T) { + node1 := &html.Node{ + Type: html.ElementNode, + Data: "span", + } + + node2 := &html.Node{ + Type: html.TextNode, + Data: " a ", + } + node3 := &html.Node{ + Type: html.TextNode, + Data: " b ", + } + node1.AppendChild(node2) + node1.AppendChild(node3) + + Collapse(node1) + + var buf bytes.Buffer + err := html.Render(&buf, node1) + if err != nil { + t.Error(err) + } + + expected := `a b` + if buf.String() != expected { + t.Errorf("expected %q but got %q", expected, buf.String()) + } +} + +func TestCollapse_LastTextIsEmpty(t *testing.T) { + node1 := &html.Node{ + Type: html.ElementNode, + Data: "span", + } + + node2 := &html.Node{ + Type: html.TextNode, + Data: "text", + } + node3 := &html.Node{ + Type: html.TextNode, + Data: " ", + } + node1.AppendChild(node2) + node1.AppendChild(node3) + + Collapse(node1) + + var buf bytes.Buffer + err := html.Render(&buf, node1) + if err != nil { + t.Error(err) + } + + expected := `text` + if buf.String() != expected { + t.Errorf("expected %q but got %q", expected, buf.String()) + } +} + +func TestCollapse_Table(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "basic example", + input: "

Foo bar

Words

", + expected: "

Foo bar

Words

", + }, + { + desc: "without whitespace", + input: "

SomeText

", + expected: "

SomeText

", + }, + { + desc: "with one space & space in paragraph", + input: "

Some text.

", + expected: "

Some text.

", + }, + { + desc: "with one space", + input: "

Some text.

", + expected: "

Some text.

", + }, + { + desc: "with three space", + input: "

Some text.

", + expected: "

Some text.

", + }, + { + desc: "with three space (at beginning of paragraph)", + input: "

text.

", + expected: "

text.

", + }, + { + desc: "with image between", + input: `

a b

`, + expected: `

a b

`, + }, + { + desc: "spans directly next to each other", + input: "

(Text A)(Text B)

", + expected: "

(Text A)(Text B)

", + }, + { + desc: "spans with newline between each other", + input: "

\n(Text A)\n(Text B)\n

", + expected: "

(Text A) (Text B)

", + }, + { + desc: "code with space", + input: "

aaa

", + // Note: This is different thant the javascript implementation. + // We want the space to be preserved. + expected: "

aaa

", + }, + { + desc: "#text in sample", + input: ` +

+
+ Browse + + or ask. +
+

+ `, + expected: `

Browseor ask.

`, + }, + + // - - - - - - // + { + desc: "mdn example: inline formatting context", + input: "

Hello \n\t\t\t\t World!\t

", + expected: "

Hello World!

", + // -> https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace + }, + { + desc: "mdn example: block formatting contexts", + input: "\n\t
Hello
\n\n
World!
\n", + expected: "
Hello
World!
", + }, + + // - - - - - - Comments - - - - - - // + { + desc: "#comment inside paragraph", + input: `

beforeafter

`, + expected: `

beforeafter

`, + }, + { + desc: "#comment inside paragraph (with spaces)", + input: `

before after

`, + expected: `

before after

`, + }, + { + desc: "#comment inside div", + input: `
beforeafter
`, + expected: `
beforeafter
`, + }, + { + desc: "#comment inside div (with spaces)", + input: `
before after
`, + expected: `
before after
// byteSliceToString reinterprets b as a string without copying the bytes.
// The caller must not modify b afterwards.
func byteSliceToString(b []byte) string {
	/* #nosec G103 */
	return *(*string)(unsafe.Pointer(&b))
}

// isCollapsibleWhitespace reports whether c is one of the four ASCII
// whitespace bytes that get collapsed: space, carriage return, newline, tab.
func isCollapsibleWhitespace(c byte) bool {
	return c == ' ' || c == '\r' || c == '\n' || c == '\t'
}

// replaceAnyWhitespaceWithSpace replaces every run of spaces, carriage
// returns, newlines and tabs in source with one single space.
//
// Some performance optimizations:
//   - If no replacement is necessary, the original string is returned
//     and nothing is allocated.
//   - Otherwise exactly one buffer is allocated and the unchanged parts
//     are appended in batches.
func replaceAnyWhitespaceWithSpace(source string) string {
	// out is only created once the first real replacement is found.
	var out []byte
	// source[:copied] is already represented in out.
	copied := 0

	for i := 0; i < len(source); {
		if !isCollapsibleWhitespace(source[i]) {
			i++
			continue
		}

		// Consume the whole run of whitespace.
		runStart := i
		for i < len(source) && isCollapsibleWhitespace(source[i]) {
			i++
		}

		if i-runStart == 1 && source[runStart] == ' ' {
			// The run is exactly one space, so the replacement
			// would be exactly the same...
			continue
		}

		if out == nil {
			out = make([]byte, 0, len(source))
		}
		out = append(out, source[copied:runStart]...)
		out = append(out, ' ')
		copied = i
	}

	if out == nil {
		// Hooray, we did not do any allocations with make()
		// and instead just return the original string.
		return source
	}

	// The normal characters until the end still need to be added.
	out = append(out, source[copied:]...)

	return byteSliceToString(out)
}
+func _regexReplaceAnyWhitespaceWithSpace(text string) string { + var rAnyWhitespace = regexp.MustCompile(`[ \r\n\t]+`) + + return rAnyWhitespace.ReplaceAllString(text, " ") +} + +func TestReplaceAnyWhitespaceWithSpace(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "empty", + input: "", + expected: "", + }, + { + desc: "one space", + input: " ", + expected: " ", + }, + { + desc: "two spaces", + input: " ", + expected: " ", + }, + { + desc: "many spaces", + input: " ", + expected: " ", + }, + { + desc: "one newline", + input: "\n", + expected: " ", + }, + { + desc: "many newlines", + input: "\n\n\n\n", + expected: " ", + }, + { + desc: "combination of newlines and spaces", + input: "\n a \nb \nc\n", + expected: " a b c ", + }, + { + desc: "special dash", + input: " \u2013 ", + expected: " \u2013 ", + }, + { + desc: "no spaces in text", + input: "abcdef", + expected: "abcdef", + }, + { + desc: "one space in text", + input: "abc def", + expected: "abc def", + }, + { + desc: "two spaces in text", + input: "abc def", + expected: "abc def", + }, + { + desc: "one newline in text", + input: "abc\ndef", + expected: "abc def", + }, + { + desc: "two newlines in text", + input: "abc\n\ndef", + expected: "abc def", + }, + { + desc: "a newline and space in text", + input: "abc \ndef", + expected: "abc def", + }, + { + desc: "one space before text", + input: " abcdef", + expected: " abcdef", + }, + { + desc: "two spaces before text", + input: " abcdef", + expected: " abcdef", + }, + { + desc: "one space after text", + input: "abcdef ", + expected: "abcdef ", + }, + { + desc: "two spaces after text", + input: "abcdef ", + expected: "abcdef ", + }, + { + desc: "multiple spaces before & one space after", + input: " or ", + expected: " or ", + }, + { + desc: "multiple spaces before & multiple spaces after", + input: " or ", + expected: " or ", + }, + { + desc: "one space before & multiple spaces after", + input: " or ", + 
expected: " or ", + }, + } + + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + t.Run("Regex Version", func(t *testing.T) { + output := _regexReplaceAnyWhitespaceWithSpace(run.input) + + if output != run.expected { + t.Errorf("expected %q but got %q", run.expected, output) + } + }) + + t.Run("New Version", func(t *testing.T) { + output := replaceAnyWhitespaceWithSpace(run.input) + if output != run.expected { + t.Errorf("expected %q but got %q", run.expected, output) + } + + // Instead of writing all tests twice... + output2 := replaceAnyWhitespaceWithSpace(strings.ReplaceAll(run.input, " ", "\n")) + if output2 != run.expected { + t.Errorf("for newlines: expected %q but got %q", run.expected, output2) + } + }) + + }) + } +} + +func FuzzReplaceAnyWhitespaceWithSpace(f *testing.F) { + f.Add("abc def") + f.Add(" ") + f.Add("abc\n\ndef") + + f.Fuzz(func(t *testing.T, orig string) { + output1 := _regexReplaceAnyWhitespaceWithSpace(orig) + output2 := replaceAnyWhitespaceWithSpace(orig) + + if output1 != output2 { + t.Errorf("input:%q => regex: %q function: %q", orig, output1, output2) + } + }) +} + +func TestReplaceAnyWhitespaceWithSpace_Allocs(t *testing.T) { + const N = 1000 + + runs := []struct { + desc string + input string + expectedAllocs float64 + }{ + { + desc: "empty string", + input: "", + expectedAllocs: 0, + }, + { + desc: "one space", + input: " ", + expectedAllocs: 0, + }, + { + desc: "no spaces", + input: "abcdef", + expectedAllocs: 0, + }, + { + desc: "one space at start", + input: " abcdef", + expectedAllocs: 0, + }, + { + desc: "one space at end", + input: "abcdef ", + expectedAllocs: 0, + }, + { + desc: "one space in middle", + input: "abc def", + expectedAllocs: 0, + }, + { + desc: "one space at start, middle and end", + input: " abc def ", + expectedAllocs: 0, + }, + { + desc: "one space at start, middle and end", + input: "some longer text with spaces", + expectedAllocs: 0, + }, + + { + desc: "multiple spaces", + input: "abc def", 
+ expectedAllocs: 1, + }, + { + desc: "multiple newlines & spaces", + input: "\n\nab cdef \n", + expectedAllocs: 1, + }, + { + desc: "longer string", + input: strings.Repeat("Lorem Ipsum is simply dummy text", 10), + expectedAllocs: 0, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + avg := testing.AllocsPerRun(N, func() { + output := replaceAnyWhitespaceWithSpace(run.input) + _ = output + }) + if avg != run.expectedAllocs { + t.Errorf("expected %f allocations but got %f", run.expectedAllocs, avg) + } + }) + } +} diff --git a/convert.go b/convert.go new file mode 100644 index 0000000..5a26deb --- /dev/null +++ b/convert.go @@ -0,0 +1,23 @@ +package htmltomarkdown + +import ( + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" + "golang.org/x/net/html" +) + +func ConvertString(htmlInput string) (string, error) { + conv := converter.NewConverter( + converter.WithPlugins(commonmark.NewCommonmarkPlugin()), + ) + + return conv.ConvertString(htmlInput) +} + +func ConvertNode(doc *html.Node) ([]byte, error) { + conv := converter.NewConverter( + converter.WithPlugins(commonmark.NewCommonmarkPlugin()), + ) + + return conv.ConvertNode(doc) +} diff --git a/convert_test.go b/convert_test.go new file mode 100644 index 0000000..0a6e22a --- /dev/null +++ b/convert_test.go @@ -0,0 +1,143 @@ +package htmltomarkdown_test + +import ( + "fmt" + "log" + "strings" + "sync" + "testing" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" + "golang.org/x/net/html" +) + +func ExampleConvertString() { + input := `Bold Text` + + markdown, err := htmltomarkdown.ConvertString(input) + if err != nil { + log.Fatal(err) + } + fmt.Println(markdown) + // Output: **Bold Text** +} +func ExampleConvertNode() { + input := `Bold Text` + + doc, err := 
html.Parse(strings.NewReader(input)) + if err != nil { + log.Fatal(err) + } + + markdown, err := htmltomarkdown.ConvertNode(doc) + if err != nil { + log.Fatal(err) + } + fmt.Println(string(markdown)) + // Output: **Bold Text** +} + +func TestConvertString_WindowsCarriageReturn(t *testing.T) { + testCases := []struct { + desc string + + input string + expected string + }{ + { + desc: "just newlines", + + input: "\r\n\r\n\r\n\r\n", + expected: "", + }, + { + desc: "inside strong", + + input: "Bold\r\n\r\n\r\n\r\nText", + expected: "**Bold Text**", + }, + { + desc: "inside paragraph", + + input: "

Some\r\n\r\n\r\n\r\nText

", + expected: "Some Text", + }, + { + desc: "inside list", + + input: "
  • Some\r\n\r\n\r\n\r\nText
", + expected: "- Some Text", + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + output, err := htmltomarkdown.ConvertString(tC.input) + if err != nil { + log.Fatal(err) + } + if output != tC.expected { + t.Errorf("expected %q but got %q", tC.expected, output) + } + }) + } +} + +func TestDataRaceDetector(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins(commonmark.NewCommonmarkPlugin()), + ) + + input := `italic text` + + var wg sync.WaitGroup + + for i := 0; i < 500; i++ { + wg.Add(1) + go func() { + conv.Register.EscapedChar('~') + conv.Register.UnEscaper( + func(chars []byte, index int) int { return -1 }, + converter.PriorityStandard, + ) + conv.Register.PreRenderer( + func(ctx converter.Context, doc *html.Node) {}, + converter.PriorityStandard, + ) + conv.Register.TextTransformer( + func(ctx converter.Context, content string) string { return content }, + converter.PriorityStandard, + ) + conv.Register.Renderer( + func(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + return converter.RenderTryNext + }, + converter.PriorityStandard, + ) + conv.Register.PostRenderer( + func(ctx converter.Context, content []byte) []byte { + return content + }, + converter.PriorityStandard, + ) + + conv.Register.TagStrategy("script", converter.StrategyHTMLBlock) + + output, err := conv.ConvertString(input, converter.WithDomain("example.com")) + if err != nil { + t.Error(err) + } + _ = output + + output2, err := conv.ConvertString(input) + if err != nil { + t.Error(err) + } + _ = output2 + + wg.Done() + }() + } + wg.Wait() +} diff --git a/converter/base.go b/converter/base.go new file mode 100644 index 0000000..2ccde34 --- /dev/null +++ b/converter/base.go @@ -0,0 +1,133 @@ +package converter + +import ( + "bytes" + "strings" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/collapse" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils" + 
"github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + + "golang.org/x/net/html" +) + +func (conv *Converter) registerBase() { + conv.Register.TagStrategy("#comment", StrategyRemoveNode) + conv.Register.TagStrategy("head", StrategyRemoveNode) + conv.Register.TagStrategy("script", StrategyRemoveNode) + conv.Register.TagStrategy("style", StrategyRemoveNode) + conv.Register.TagStrategy("link", StrategyRemoveNode) + conv.Register.TagStrategy("meta", StrategyRemoveNode) + + conv.Register.TagStrategy("iframe", StrategyRemoveNode) + conv.Register.TagStrategy("noscript", StrategyRemoveNode) + + conv.Register.TagStrategy("input", StrategyRemoveNode) + conv.Register.TagStrategy("textarea", StrategyRemoveNode) + + // "tr" is not in the `IsBlockNode` list, + // but we want to treat is as a block anyway. + conv.Register.TagStrategy("tr", StrategyMarkdownBlock) + + conv.Register.PreRenderer(conv.preRenderRemove, PriorityEarly) + + // Note: The priority is low, so that collapse runs _after_ all the other functions + conv.Register.PreRenderer(conv.preRenderCollapse, PriorityLate) + + conv.Register.Renderer(conv.handleRender, PriorityStandard) + + conv.Register.TextTransformer(conv.handleTextTransform, PriorityStandard) + + conv.Register.PostRenderer(conv.postRenderTrimContent, PriorityStandard) + conv.Register.PostRenderer(conv.postRenderUnescapeContent, PriorityStandard+20) +} + +func (conv *Converter) preRenderRemove(ctx Context, doc *html.Node) { + var finder func(node *html.Node) + finder = func(node *html.Node) { + name := dom.NodeName(node) + + if val, _ := conv.getTagStrategy(name); val == StrategyRemoveNode { + dom.RemoveNode(node) + return + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + // Because we are sometimes removing a node, this causes problems + // with the for loop. Using `defer` is a cool trick! 
+ // https://gist.github.com/loopthrough/17da0f416054401fec355d338727c46e + defer finder(child) + } + } + finder(doc) + + // - - - - - - - // + + // After removing elements (see above) it can happen that we have + // two #text nodes right next to each other. This would cause problems + // with the collapse so we merge them together. + domutils.MergeAdjacentTextNodes(doc) +} + +func (conv *Converter) preRenderCollapse(ctx Context, doc *html.Node) { + collapse.Collapse(doc) +} + +func (conv *Converter) handleRender(ctx Context, w Writer, n *html.Node) RenderStatus { + name := dom.NodeName(n) + + switch name { + case "#text": + return conv.renderText(ctx, w, n) + } + + return RenderTryNext +} + +func (conv *Converter) handleTextTransform(ctx Context, content string) string { + + // TODO: reduce conversion between types + content = string(conv.escapeContent([]byte(content))) + + return content +} + +var characterEntityReplacer = strings.NewReplacer( + // We are not using `html.EscapeString` because we + // care about fewer characters + "<", "<", + ">", ">", + "&", "&", +) + +func (conv *Converter) renderText(ctx Context, w Writer, n *html.Node) RenderStatus { + content := n.Data + + // TODO: similar to UnEscapers also only escape if nessesary. 
+ // "<" only if not followed by space + // "&" only if character entity + content = characterEntityReplacer.Replace(content) + + for _, handler := range conv.getTextTransformHandlers() { + content = handler.Value(ctx, content) + } + + w.WriteString(content) + return RenderSuccess +} + +func (conv *Converter) postRenderTrimContent(ctx Context, result []byte) []byte { + // Remove whitespace from the beginning & end + result = bytes.TrimFunc(result, marker.IsSpace) + + // Remove too many newlines + result = textutils.TrimConsecutiveNewlines(result) + + return result +} +func (conv *Converter) postRenderUnescapeContent(ctx Context, result []byte) []byte { + result = conv.unEscapeContent(result) + return result +} diff --git a/converter/base_test.go b/converter/base_test.go new file mode 100644 index 0000000..3a24683 --- /dev/null +++ b/converter/base_test.go @@ -0,0 +1,164 @@ +package converter_test + +import ( + "testing" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +func TestConvertString_Base(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + // - - - - removing nodes - - - - // + { + desc: "automatically removed", + input: ` +
+ Start + + End +
`, + expected: "Start End", + }, + { + desc: "configured to be removed", + input: ` +
+ Start + To be removed + End +
`, + expected: "Start End", + }, + + // - - - - markdown block - - - - // + { + desc: "automatically a block node", + input: ` +
+ Start +
Article
+ End +
`, + expected: "Start\n\nArticle\n\nEnd", + }, + { + desc: "configured as block node", + input: ` +
+ Start + Block with markdown + End +
`, + // TODO: expected: "Start\n\nBlock **with** markdown\n\nEnd", + // For this the `Collapse` function needs to accept a custom + // `isBlockNode` function that gets info from the tag strategies + + expected: "Start \n\nBlock **with** markdown\n\n End", + }, + + // - - - - markdown leaf - - - - // + { + desc: "automatically a leaf node", + input: ` +
+ Start + Span + End +
`, + expected: "Start Span End", + }, + { + desc: "default a leaf node", + input: ` +
+ Start + Random + End +
`, + expected: "Start Random End", + }, + { + desc: "configured as leaf node", + input: ` +
+ Start + Leaf + End +
`, + expected: "Start Leaf End", + }, + { + desc: "overridden to be not removed", + input: ` +
+ Start + + End +
`, + expected: "Start Style End", + }, + + // - - - - keep as html - - - - // + { + desc: "configured as html block node", + input: ` +
+ Start +

Test

+ End +
`, + expected: "Start\n\n

Test

\n\nEnd", + }, + // - - - - html shell with markdown children - - - - // + { + desc: "configured as html block with markdown children", + input: ` +
+ +

bold text

+
+
`, + expected: "\n\n**bold** text\n\n", + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + conv := converter.NewConverter() + + conv.Register.Renderer(func(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + name := dom.NodeName(n) + if name == "b" { + w.WriteString("**") + ctx.RenderChildNodes(ctx, w, n) + w.WriteString("**") + + return converter.RenderSuccess + } + + return converter.RenderTryNext + }, converter.PriorityStandard) + + conv.Register.TagStrategy("my_remove_node", converter.StrategyRemoveNode) + conv.Register.TagStrategy("my_markdown_block", converter.StrategyMarkdownBlock) + conv.Register.TagStrategy("my_markdown_leaf", converter.StrategyMarkdownLeaf) + conv.Register.TagStrategy("my_html_block", converter.StrategyHTMLBlock) + conv.Register.TagStrategy("my_html_shell", converter.StrategyHTMLBlockWithMarkdown) + + conv.Register.TagStrategy("style", converter.StrategyMarkdownLeaf) + + out, err := conv.ConvertString(run.input) + if err != nil { + t.Error(err) + } + if out != run.expected { + t.Errorf("expected %q but got %q", run.expected, out) + } + }) + } +} diff --git a/converter/convert.go b/converter/convert.go new file mode 100644 index 0000000..cf01b3f --- /dev/null +++ b/converter/convert.go @@ -0,0 +1,118 @@ +package converter + +import ( + "bytes" + "context" + "errors" + "io" + "strings" + + "golang.org/x/net/html" +) + +type convertOption struct { + domain string + context context.Context +} +type convertOptionFunc func(o *convertOption) + +func WithContext(ctx context.Context) convertOptionFunc { + return func(o *convertOption) { + o.context = ctx + } +} +func WithDomain(domain string) convertOptionFunc { + return func(o *convertOption) { + o.domain = domain + } +} + +func (conv *Converter) setError(err error) { + conv.m.Lock() + defer conv.m.Unlock() + + conv.err = err +} +func (conv *Converter) getError() error { + conv.m.RLock() + defer conv.m.RUnlock() + + return conv.err +} + 
+var errNoRenderHandlers = errors.New("no render handlers are registered. did you forget to register the commonmark plugin?") + +func (conv *Converter) ConvertNode(doc *html.Node, opts ...convertOptionFunc) ([]byte, error) { + + if err := conv.getError(); err != nil { + // There can be errors while calling `Init` on the plugins (e.g. validation errors). + // Now is the first opportunity where we can return that error. + return nil, err + } + + conv.m.Lock() + option := &convertOption{} + for _, fn := range opts { + fn(option) + } + conv.m.Unlock() + + // If there are no render handlers registered (apart from the base) this is + // usually a user error - since people want the Commonmark Plugin in 99% of cases. + countBaseRenderHandlers := 1 + if len(conv.getRenderHandlers()) == countBaseRenderHandlers { + return nil, errNoRenderHandlers + } + + // - - - - - - - - - - - - - - - - - - - // + + state := newGlobalState() + + if option.context == nil { + option.context = context.Background() + } + ctx := option.context + ctx = provideDomain(ctx, option.domain) + ctx = provideAssembleAbsoluteURL(ctx, defaultAssembleAbsoluteURL) + ctx = state.provideGlobalState(ctx) + + customCtx := newConverterContext(ctx, conv) + + // - - - - - - - - - - - - - - - - - - - // + + // Pre-Render + for _, handler := range conv.getPreRenderHandlers() { + handler.Value(customCtx, doc) + } + + // Render + var buf bytes.Buffer + conv.handleRenderNode(customCtx, &buf, doc) + + // Post-Render + result := buf.Bytes() + for _, handler := range conv.getPostRenderHandlers() { + result = handler.Value(customCtx, result) + } + + return result, nil +} + +func (conv *Converter) ConvertReader(r io.Reader, opts ...convertOptionFunc) ([]byte, error) { + doc, err := html.Parse(r) + if err != nil { + return nil, err + } + + return conv.ConvertNode(doc, opts...) 
+} + +func (conv *Converter) ConvertString(htmlInput string, opts ...convertOptionFunc) (string, error) { + r := strings.NewReader(htmlInput) + output, err := conv.ConvertReader(r, opts...) + if err != nil { + return "", err + } + + return string(output), nil +} diff --git a/converter/convert_test.go b/converter/convert_test.go new file mode 100644 index 0000000..bd4bfed --- /dev/null +++ b/converter/convert_test.go @@ -0,0 +1,78 @@ +package converter_test + +import ( + "testing" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +func TestConvertString(t *testing.T) { + conv := converter.NewConverter() + + preRenderer := func(ctx converter.Context, doc *html.Node) { + for _, node := range dom.AllNodes(doc) { + name := dom.NodeName(node) + + if name == "test" { + node.Attr[0].Val = "other_value" + } + } + } + renderer := func(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + name := dom.NodeName(n) + + if name == "#text" { + w.WriteString(n.Data) + return converter.RenderSuccess + } else if name == "test" { + val := dom.GetAttributeOr(n, "key", "") + w.WriteString(val) + return converter.RenderSuccess + } + + return converter.RenderTryNext + } + postRenderer := func(ctx converter.Context, content []byte) []byte { + return content + } + + conv.Register.PreRenderer(preRenderer, converter.PriorityStandard) + + conv.Register.Renderer(renderer, converter.PriorityStandard) + conv.Register.PostRenderer(postRenderer, converter.PriorityStandard) + + output, err := conv.ConvertString(`beforeafter`) + if err != nil { + t.Error(err) + } + + expected := "beforeother_valueafter" + if output != expected { + t.Errorf("expected %q but got %q", expected, output) + } +} + +func TestConvertString_ErrNoRenderHandlers(t *testing.T) { + conv := converter.NewConverter() + _, err := conv.ConvertString("bold text") + if err == nil { + t.Fatal("expected an error") + } + if 
err.Error() != "no render handlers are registered. did you forget to register the commonmark plugin?" { + t.Fatal("expected a different error but got", err) + } + + // - - - - // + + // Now that we registered something we should not receive an error anymore... + conv.Register.Renderer(func(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + return converter.RenderTryNext + }, converter.PriorityStandard) + + _, err = conv.ConvertString("bold text") + if err != nil { + t.Fatal("did not expect an error since we registered a renderer") + } +} diff --git a/converter/converter.go b/converter/converter.go new file mode 100644 index 0000000..44cfe60 --- /dev/null +++ b/converter/converter.go @@ -0,0 +1,44 @@ +package converter + +import "sync" + +type Converter struct { + m sync.RWMutex + + err error + + preRenderHandlers prioritizedSlice[HandlePreRenderFunc] + renderHandlers prioritizedSlice[HandleRenderFunc] + postRenderHandlers prioritizedSlice[HandlePostRenderFunc] + + textTransformHandlers prioritizedSlice[HandleTextTransformFunc] + + markdownChars map[rune]interface{} + unEscapeHandlers prioritizedSlice[HandleUnEscapeFunc] + + tagStrategies map[string]tagStrategy + + Register register +} + +type converterOption = func(c *Converter) error + +func NewConverter(opts ...converterOption) *Converter { + conv := &Converter{ + markdownChars: make(map[rune]interface{}), + tagStrategies: make(map[string]tagStrategy), + } + conv.Register = register{conv} + + conv.registerBase() + + for _, opt := range opts { + err := opt(conv) + if err != nil { + conv.setError(err) + break + } + } + + return conv +} diff --git a/converter/ctx.go b/converter/ctx.go new file mode 100644 index 0000000..72f6d00 --- /dev/null +++ b/converter/ctx.go @@ -0,0 +1,186 @@ +package converter + +import ( + "context" + "fmt" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +// func GetValue[K string, V any](ctx context.Context, key K) V { +// val, _ := 
ctx.Value(key).(V) +// return val +// } +// func SetValue[K string, V any](ctx context.Context, key K, val V) context.Context { +// return context.WithValue(ctx, key, val) +// } + +type ctxKey string + +const ( + ctxKeyAssembleAbsoluteURL ctxKey = "AssembleAbsoluteURL" + ctxKeyDomain ctxKey = "Domain" + + ctxKeySetState ctxKey = "SetState" + ctxKeyUpdateState ctxKey = "UpdateState" + ctxKeyGetState ctxKey = "GetState" +) + +func provideDomain(ctx context.Context, domain string) context.Context { + return context.WithValue(ctx, ctxKeyDomain, domain) +} +func GetDomain(ctx context.Context) string { + domain, ok := ctx.Value(ctxKeyDomain).(string) + if !ok { + fmt.Println("[warning] value ctxKeyDomain is different") + return "" + } + + return domain +} + +// - - - - - - - - - - - - - - - - - - - - - // + +type AssembleAbsoluteURLFunc func(elem Element, rawURL string, domain string) string + +func assembleAbsoluteURL(ctx context.Context, elem Element, rawURL string) string { + domain := GetDomain(ctx) + + // TODO: since this gets passed down from the converter, it doesn't have to be provided from the ctx anymore + fn, ok := ctx.Value(ctxKeyAssembleAbsoluteURL).(AssembleAbsoluteURLFunc) + if !ok { + fmt.Println("[warning] func ctxKeyAssembleAbsoluteURL is different") + return "" + } + + return fn(elem, rawURL, domain) +} + +func provideAssembleAbsoluteURL(ctx context.Context, fn AssembleAbsoluteURLFunc) context.Context { + return context.WithValue(ctx, ctxKeyAssembleAbsoluteURL, fn) +} + +// - - - - - - - - - - - - - - - - - - - - - // + +type SetStateFunc func(key string, val any) +type UpdateStateFunc func(key string, fn func(any) any) +type GetStateFunc func(key string) any + +type globalState struct { + data map[string]any +} + +func newGlobalState() *globalState { + + return &globalState{ + data: make(map[string]any), + } +} + +func (s *globalState) setState(key string, val any) { + s.data[key] = val +} +func (s *globalState) updateState(key string, fn func(any) any) 
{ + s.data[key] = fn(s.data[key]) +} +func (s *globalState) getState(key string) any { + return s.data[key] +} + +func (s *globalState) provideGlobalState(ctx context.Context) context.Context { + + var setState SetStateFunc = s.setState + var updateState UpdateStateFunc = s.updateState + var getState GetStateFunc = s.getState + + ctx = context.WithValue(ctx, ctxKeySetState, setState) + ctx = context.WithValue(ctx, ctxKeyUpdateState, updateState) + ctx = context.WithValue(ctx, ctxKeyGetState, getState) + + return ctx +} + +func GetState[V any](ctx context.Context, key string) V { + fn := ctx.Value(ctxKeyGetState).(GetStateFunc) + + val, _ := fn(key).(V) + + return val +} + +func SetState[V any](ctx context.Context, key string, val V) { + fn := ctx.Value(ctxKeySetState).(SetStateFunc) + + fn(key, val) +} + +func UpdateState[V any](ctx context.Context, key string, fn func(V) V) { + updater := ctx.Value(ctxKeyUpdateState).(UpdateStateFunc) + + updater(key, func(val any) any { + value, ok := val.(V) + if !ok && val != nil { + // TODO: slog? + fmt.Println("[warning] val is different than V in UpdateState") + } + + return fn(value) + }) +} + +// - - - - - - // + +// Context extends the normal context.Context with some additional +// methods useful for the process of converting. 
+type Context interface { + context.Context + + AssembleAbsoluteURL(ctx Context, elem Element, rawURL string) string + + GetTagStrategy(tagName string) (tagStrategy, bool) + + RenderNodes(ctx Context, w Writer, nodes ...*html.Node) + RenderChildNodes(ctx Context, w Writer, n *html.Node) + + UnEscapeContent(content []byte) []byte + + WithValue(key any, val any) Context +} + +type converterContext struct { + context.Context + conv *Converter +} + +func newConverterContext(ctx context.Context, conv *Converter) Context { + return &converterContext{ + Context: ctx, + conv: conv, + } +} + +func (c *converterContext) AssembleAbsoluteURL(ctx Context, elem Element, rawURL string) string { + return assembleAbsoluteURL(ctx, elem, rawURL) +} + +func (c *converterContext) RenderNodes(ctx Context, w Writer, nodes ...*html.Node) { + c.conv.handleRenderNodes(ctx, w, nodes...) +} +func (c *converterContext) RenderChildNodes(ctx Context, w Writer, n *html.Node) { + c.conv.handleRenderNodes(ctx, w, dom.AllChildNodes(n)...) 
+} + +func (c *converterContext) GetTagStrategy(tagName string) (tagStrategy, bool) { + return c.conv.getTagStrategy(tagName) +} +func (c *converterContext) UnEscapeContent(content []byte) []byte { + return c.conv.unEscapeContent(content) +} + +func (c *converterContext) WithValue(key any, val any) Context { + return &converterContext{ + Context: context.WithValue(c.Context, key, val), + conv: c.conv, + } +} diff --git a/converter/ctx_test.go b/converter/ctx_test.go new file mode 100644 index 0000000..fd160f4 --- /dev/null +++ b/converter/ctx_test.go @@ -0,0 +1,54 @@ +package converter + +import ( + "context" + "testing" +) + +func TestState(t *testing.T) { + state := newGlobalState() + + ctx := context.Background() + ctx = state.provideGlobalState(ctx) + + val := GetState[int](ctx, "key") + if val != 0 { + t.Errorf("expected different value but got %d", val) + } + + SetState[int](ctx, "key", 10) + + UpdateState[int](ctx, "key", func(i int) int { + return i + 5 + }) + + val = GetState[int](ctx, "key") + if val != 15 { + t.Errorf("expected different value but got %d", val) + } +} + +func TestContext(t *testing.T) { + conv := NewConverter() + bgCtx := context.Background() + + ctx := newConverterContext(bgCtx, conv) + + ctx1 := ctx.WithValue("keyA", "a1") + if ctx1.Value("keyA") != "a1" { + t.Error("got different value") + } + + ctx2 := ctx.WithValue("keyA", "a2") + if ctx2.Value("keyA") != "a2" { + t.Error("got different value") + } + + ctx3 := ctx.WithValue("keyB", "b1") + if ctx3.Value("keyA") != nil { + t.Error("expected nil value") + } + if ctx3.Value("keyB") != "b1" { + t.Error("got different value") + } +} diff --git a/converter/escape.go b/converter/escape.go new file mode 100644 index 0000000..3247f86 --- /dev/null +++ b/converter/escape.go @@ -0,0 +1,82 @@ +package converter + +import ( + "unicode/utf8" + + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" +) + +const ( + actionKeep = iota + actionEscape = iota +) + +// IMPORTANT: Only internally we 
assume it is only byte +var placeholderByte byte = marker.BytesMarkerEscaping[0] + +func (conv *Converter) escapeContent(chars []byte) []byte { + + newChars := make([]byte, 0, len(chars)) + for index := 0; index < len(chars); index++ { + if chars[index] == '\u0000' { + // For security reasons, the Unicode character U+0000 must be replaced with the REPLACEMENT CHARACTER (U+FFFD). + newChars = append(newChars, []byte(string('\ufffd'))...) + continue + } + + r, _ := utf8.DecodeRune(chars[index:]) + + isMarkdownChar := conv.checkIsEscapedChar(r) + if isMarkdownChar { + newChars = append(newChars, placeholderByte, chars[index]) + } else { + newChars = append(newChars, chars[index]) + } + } + + return newChars +} + +func (conv *Converter) unEscapeContent(chars []byte) []byte { + checkElements := func(index int) int { + for _, handler := range conv.getUnEscapeHandlers() { + if skip := handler.Value(chars, index); skip != -1 { + return skip + } + } + + return -1 + } + + changes := make([]uint8, len(chars)) + for index := 0; index < len(chars); index++ { + + if chars[index] != placeholderByte { + continue + } + if index+1 >= len(chars) { + break + } + + skip := checkElements(index + 1) + if skip == -1 { + continue + } + changes[index] = actionEscape + index += skip - 1 + } + + newChars := make([]byte, 0, len(chars)) + for index, char := range chars { + if char != placeholderByte { + newChars = append(newChars, char) + continue + } + + // What to do with this placeholder? Should we escape or not? 
+ if changes[index] == actionEscape { + newChars = append(newChars, '\\') + } + } + return newChars +} diff --git a/converter/escape_test.go b/converter/escape_test.go new file mode 100644 index 0000000..5eef1e3 --- /dev/null +++ b/converter/escape_test.go @@ -0,0 +1,55 @@ +package converter + +import ( + "testing" +) + +func TestEscapeContent(t *testing.T) { + conv := NewConverter() + conv.Register.EscapedChar('>') + + input := []byte{'a', '>'} + + output := conv.escapeContent(input) + if len(output) != 3 { + t.Error("expected different length") + } + // Since '>' is a character used for quotes in markdown, + // there should be a marker before it. + if output[0] != 'a' || output[1] != placeholderByte || output[2] != '>' { + t.Error("expected different characters") + } +} + +func TestUnEscapeContent(t *testing.T) { + conv := NewConverter() + conv.Register.UnEscaper(func(chars []byte, index int) int { + if chars[index] != '>' { + return -1 + } + + // A bit too simplistic for demonstration purposes. + // Normally here would be content to check if the escaping is necessary... 
+ return 1 + }, PriorityStandard) + + input := []byte{placeholderByte, 'a', 'b'} + output := conv.unEscapeContent(input) + if len(output) != 2 { + t.Error("expected different length") + } + // No escaping needed + if output[0] != 'a' || output[1] != 'b' { + t.Error("expected different characters") + } + + input = []byte{placeholderByte, '>', 'a'} + output = conv.unEscapeContent(input) + if len(output) != 3 { + t.Error("expected different length") + } + // Escaping needed + if output[0] != '\\' || output[1] != '>' || output[2] != 'a' { + t.Error("expected different characters") + } +} diff --git a/converter/keep_remove.go b/converter/keep_remove.go new file mode 100644 index 0000000..2246221 --- /dev/null +++ b/converter/keep_remove.go @@ -0,0 +1,42 @@ +package converter + +type tagStrategy string + +const ( + // - - - - - removing - - - - - // + + // StrategyRemoveNode will remove that node in the _PreRender_ phase + // with a high priority. + StrategyRemoveNode tagStrategy = "StrategyRemoveNode" + + // - - - - - markdown - - - - - // + + // StrategyMarkdownLeaf will keep the children of this node as markdown. + // + // This is the default for unknown nodes — where there + // is no registered _Render_ handler. + StrategyMarkdownLeaf tagStrategy = "StrategyMarkdownLeaf" + + // StrategyMarkdownBlock will keep the children of this node as markdown + // AND will render newlines. + // + // This is the default for html nodes that have + // a) no registered _Render_ handler AND + // b) where `dom.NameIsBlockNode()` returns true. + StrategyMarkdownBlock tagStrategy = "StrategyMarkdownBlock" + + // - - - - - html - - - - - // + + // TODO: is this needed? 
+ // StrategyHTMLLeaf will render the node as HTML using `html.Render()` + // StrategyHTMLLeaf tagStrategy = "StrategyHTMLLeaf" + + // StrategyHTMLBlock will render the node as HTML using `html.Render()` + StrategyHTMLBlock tagStrategy = "StrategyHTMLBlock" + + // - - - - - html & markdown - - - - - // + + // StrategyHTMLBlockWithMarkdown will render the node as HTML + // and render the children as markdown. + StrategyHTMLBlockWithMarkdown tagStrategy = "StrategyHTMLBlockWithMarkdown" +) diff --git a/converter/plugin.go b/converter/plugin.go new file mode 100644 index 0000000..8e91f57 --- /dev/null +++ b/converter/plugin.go @@ -0,0 +1,22 @@ +package converter + +// Plugin can be used to extend functionality beyond what +// is offered by commonmark. +type Plugin interface { + // Init is called to initialize the plugin. It can be used to + // *validate* the arguments and *register* the rules. + Init(conv *Converter) error +} + +// WithPlugins can be used to add additional functionality to the converter. +func WithPlugins(plugins ...Plugin) converterOption { + return func(c *Converter) error { + for _, plugin := range plugins { + err := plugin.Init(c) + if err != nil { + return err + } + } + return nil + } +} diff --git a/converter/prioritized.go b/converter/prioritized.go new file mode 100644 index 0000000..b0d5196 --- /dev/null +++ b/converter/prioritized.go @@ -0,0 +1,33 @@ +package converter + +import "sort" + +const ( + // PriorityEarly means that the handler will be run **early** in the process. + // To run it even earlier you need to subtract from this number. + PriorityEarly = 100 + + // PriorityStandard is for handlers that don't need to be run in a particular order. + PriorityStandard = 500 + + // PriorityLate means that the handler will be run **late** in the process. + // To run it even later you need to add to this number. 
+ PriorityLate = 1000 +) + +type prioritizedValue[V any] struct { + Value V + Priority int +} + +type prioritizedSlice[V any] []prioritizedValue[V] + +func (s prioritizedSlice[V]) Sort() { + sort.Slice(s, func(i, j int) bool { + return s[i].Priority < s[j].Priority + }) +} + +func prioritized[V any](v V, priority int) prioritizedValue[V] { + return prioritizedValue[V]{v, priority} +} diff --git a/converter/prioritized_test.go b/converter/prioritized_test.go new file mode 100644 index 0000000..6be2c4b --- /dev/null +++ b/converter/prioritized_test.go @@ -0,0 +1,36 @@ +package converter + +import ( + "reflect" + "testing" +) + +func TestPrioritizedSlice(t *testing.T) { + + var values = prioritizedSlice[string]{ + prioritized("b", PriorityStandard), + prioritized("c", PriorityLate), + prioritized("a", PriorityEarly), + } + + values.Sort() + + var expected = prioritizedSlice[string]{ + { + Value: "a", + Priority: PriorityEarly, + }, + { + Value: "b", + Priority: PriorityStandard, + }, + { + Value: "c", + Priority: PriorityLate, + }, + } + + if !reflect.DeepEqual(values, expected) { + t.Errorf("expected %+v but got %+v", expected, values) + } +} diff --git a/converter/register.go b/converter/register.go new file mode 100644 index 0000000..c8606bf --- /dev/null +++ b/converter/register.go @@ -0,0 +1,171 @@ +package converter + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +type register struct { + conv *Converter +} + +// - - - - - - - - - - - - - Pre-Render - - - - - - - - - - - - - // + +type HandlePreRenderFunc func(ctx Context, doc *html.Node) + +func (r *register) PreRenderer(fn HandlePreRenderFunc, priority int) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + handler := prioritized(fn, priority) + r.conv.preRenderHandlers = append(r.conv.preRenderHandlers, handler) +} +func (conv *Converter) getPreRenderHandlers() prioritizedSlice[HandlePreRenderFunc] { + conv.m.RLock() + defer conv.m.RUnlock() + + handlers := 
make(prioritizedSlice[HandlePreRenderFunc], len(conv.preRenderHandlers)) + copy(handlers, conv.preRenderHandlers) + handlers.Sort() + + return handlers +} + +// - - - - - - - - - - - - - Render - - - - - - - - - - - - - // + +// Writer is an interface that only conforms to the Write* methods of bytes.Buffer +type Writer interface { + Write(p []byte) (n int, err error) + WriteByte(c byte) error + WriteRune(r rune) (n int, err error) + WriteString(s string) (n int, err error) +} + +type HandleRenderFunc func(ctx Context, w Writer, n *html.Node) RenderStatus + +func (r *register) Renderer(fn HandleRenderFunc, priority int) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + handler := prioritized(fn, priority) + r.conv.renderHandlers = append(r.conv.renderHandlers, handler) +} +func (conv *Converter) getRenderHandlers() prioritizedSlice[HandleRenderFunc] { + conv.m.RLock() + defer conv.m.RUnlock() + + handlers := make(prioritizedSlice[HandleRenderFunc], len(conv.renderHandlers)) + copy(handlers, conv.renderHandlers) + handlers.Sort() + + return handlers +} + +// - - - - - - - - - - - - - Post Render - - - - - - - - - - - - - // + +type HandlePostRenderFunc func(ctx Context, content []byte) []byte + +func (r *register) PostRenderer(fn HandlePostRenderFunc, priority int) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + handler := prioritized(fn, priority) + r.conv.postRenderHandlers = append(r.conv.postRenderHandlers, handler) +} +func (conv *Converter) getPostRenderHandlers() prioritizedSlice[HandlePostRenderFunc] { + conv.m.RLock() + defer conv.m.RUnlock() + + handlers := make(prioritizedSlice[HandlePostRenderFunc], len(conv.postRenderHandlers)) + copy(handlers, conv.postRenderHandlers) + handlers.Sort() + + return handlers +} + +// - - - - - - - - - - - - - Text - - - - - - - - - - - - - // + +type HandleTextTransformFunc func(ctx Context, content string) string + +func (r *register) TextTransformer(fn HandleTextTransformFunc, priority int) { + r.conv.m.Lock() + defer 
r.conv.m.Unlock() + + handler := prioritized(fn, priority) + r.conv.textTransformHandlers = append(r.conv.textTransformHandlers, handler) +} +func (conv *Converter) getTextTransformHandlers() prioritizedSlice[HandleTextTransformFunc] { + conv.m.RLock() + defer conv.m.RUnlock() + + handlers := make(prioritizedSlice[HandleTextTransformFunc], len(conv.textTransformHandlers)) + copy(handlers, conv.textTransformHandlers) + handlers.Sort() + + return handlers +} + +// - - - - - - - - - - - - - Escaping - - - - - - - - - - - - - // + +func (r *register) EscapedChar(chars ...rune) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + for _, char := range chars { + r.conv.markdownChars[char] = struct{}{} + } +} +func (conv *Converter) checkIsEscapedChar(r rune) bool { + conv.m.RLock() + defer conv.m.RUnlock() + + _, ok := conv.markdownChars[r] + return ok +} + +type HandleUnEscapeFunc func(chars []byte, index int) int + +func (r *register) UnEscaper(fn HandleUnEscapeFunc, priority int) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + handler := prioritized(fn, priority) + r.conv.unEscapeHandlers = append(r.conv.unEscapeHandlers, handler) +} +func (conv *Converter) getUnEscapeHandlers() prioritizedSlice[HandleUnEscapeFunc] { + conv.m.RLock() + defer conv.m.RUnlock() + + handlers := make(prioritizedSlice[HandleUnEscapeFunc], len(conv.unEscapeHandlers)) + copy(handlers, conv.unEscapeHandlers) + handlers.Sort() + + return handlers +} + +// - - - - - - - - - - - - - Tag Strategy - - - - - - - - - - - - - // + +func (r *register) TagStrategy(tagName string, strategy tagStrategy) { + r.conv.m.Lock() + defer r.conv.m.Unlock() + + r.conv.tagStrategies[tagName] = strategy +} +func (conv *Converter) getTagStrategy(tagName string) (tagStrategy, bool) { + conv.m.RLock() + defer conv.m.RUnlock() + + strategy, ok := conv.tagStrategies[tagName] + return strategy, ok +} +func (conv *Converter) getTagStrategyWithFallback(tagName string) tagStrategy { + decision, ok := 
conv.getTagStrategy(tagName) + if ok { + return decision + } + + if dom.NameIsBlockNode(tagName) { + return StrategyMarkdownBlock + } + return StrategyMarkdownLeaf +} diff --git a/converter/render.go b/converter/render.go new file mode 100644 index 0000000..bf63583 --- /dev/null +++ b/converter/render.go @@ -0,0 +1,65 @@ +package converter + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func (conv *Converter) handleRenderNode(ctx Context, w Writer, node *html.Node) RenderStatus { + for _, handler := range conv.getRenderHandlers() { + status := handler.Value(ctx, w, node) + if status == RenderSuccess { + return status + } + } + + return conv.fallbackRender(ctx, w, node) +} +func (conv *Converter) handleRenderNodes(ctx Context, w Writer, nodes ...*html.Node) { + for _, node := range nodes { + conv.handleRenderNode(ctx, w, node) + } +} + +func (conv *Converter) fallbackRender(ctx Context, w Writer, node *html.Node) RenderStatus { + + name := dom.NodeName(node) + decision := conv.getTagStrategyWithFallback(name) + + if decision == StrategyHTMLBlockWithMarkdown { + w.WriteString("<") + w.WriteString(name) + // TODO: also render the attributes? + w.WriteString(">\n\n") + + conv.handleRenderNodes(ctx, w, dom.AllChildNodes(node)...) + + w.WriteString("\n\n") + return RenderSuccess + } + + if decision == StrategyHTMLBlock { + w.WriteRune('\n') + w.WriteRune('\n') + _ = html.Render(w, node) // TODO: what to do with error? + w.WriteRune('\n') + w.WriteRune('\n') + return RenderSuccess + } + + if decision == StrategyMarkdownBlock { + w.WriteRune('\n') + w.WriteRune('\n') + conv.handleRenderNodes(ctx, w, dom.AllChildNodes(node)...) + w.WriteRune('\n') + w.WriteRune('\n') + + return RenderSuccess + } else { + conv.handleRenderNodes(ctx, w, dom.AllChildNodes(node)...) 
+ + return RenderSuccess + } +} diff --git a/converter/status.go b/converter/status.go new file mode 100644 index 0000000..7ad6054 --- /dev/null +++ b/converter/status.go @@ -0,0 +1,8 @@ +package converter + +type RenderStatus int + +const ( + RenderTryNext RenderStatus = iota + RenderSuccess +) diff --git a/converter/url.go b/converter/url.go new file mode 100644 index 0000000..4aba5ee --- /dev/null +++ b/converter/url.go @@ -0,0 +1,108 @@ +package converter + +import ( + "fmt" + "net/url" + "strings" +) + +type Element string + +const ( + ElementLink Element = "ElementLink" + ElementImage Element = "ElementImage" +) + +var percentEncodingReplacer = strings.NewReplacer( + " ", "%20", + "[", "%5B", + "]", "%5D", + "(", "%28", + ")", "%29", + "<", "%3C", + ">", "%3E", +) + +func defaultAssembleAbsoluteURL(elem Element, rawURL string, domain string) string { + rawURL = strings.TrimSpace(rawURL) + + if rawURL == "#" { + // Golangs url.Parse does not seem to distinguish between + // no fragment and an empty fragment. + return rawURL + } + + // Increase the chance that the url will be parsed + rawURL = strings.ReplaceAll(rawURL, "\n", "%0A") + rawURL = strings.ReplaceAll(rawURL, "\t", "%09") + + u, err := url.Parse(rawURL) + if err != nil { + fmt.Printf("[invalid_url] err=%v url=%q \n", err, rawURL) + + // We can't do anything with this url because it is invalid + return percentEncodingReplacer.Replace(rawURL) + } + + if u.Scheme == "data" { + // This is a data uri (for example an inline base64 image) + return percentEncodingReplacer.Replace(rawURL) + } + + // The default Query().Encode() encodes the query parameters "sorted by key". + // Instead we want to keep the original order, but still encode the parameters. + u.RawQuery = ParseAndEncodeQuery(u.RawQuery) + + // For better compatibility (especially in regards to mailto links), + // instead of encoding a space with a "+" we use "%20" to prevent + // e.g. 
the email reading "Hi+Johannes" instead of "Hi Johannes" + u.RawQuery = strings.ReplaceAll(u.RawQuery, "+", "%20") + + if domain != "" { + if u.Scheme == "" { + u.Scheme = "http" + } + if u.Host == "" { + u.Host = domain + } + } + + return percentEncodingReplacer.Replace(u.String()) +} + +// - - - - // + +func decodeAndEncode(original string) string { + s, err := url.QueryUnescape(original) + if err != nil { + return original + } + + return url.QueryEscape(s) +} + +func ParseAndEncodeQuery(rawQuery string) string { + if rawQuery == "" { + return "" + } + + rawParts := strings.Split(rawQuery, "&") + encodedParts := make([]string, len(rawParts)) + + for i, part := range rawParts { + splitted := strings.SplitN(part, "=", 2) + + if len(splitted) == 1 { + // A: Just the key + encodedParts[i] = decodeAndEncode(splitted[0]) + } else if splitted[1] == "" { + // B: The key and the equal sign + encodedParts[i] = decodeAndEncode(splitted[0]) + "=" + } else { + // C: The key and the equal sign and the value + encodedParts[i] = decodeAndEncode(splitted[0]) + "=" + decodeAndEncode(splitted[1]) + } + } + + return strings.Join(encodedParts, "&") +} diff --git a/converter/url_test.go b/converter/url_test.go new file mode 100644 index 0000000..989aed5 --- /dev/null +++ b/converter/url_test.go @@ -0,0 +1,305 @@ +package converter + +import ( + "testing" +) + +func TestDefaultAssembleAbsoluteURL(t *testing.T) { + runs := []struct { + desc string + + element Element + input string + domain string + + expected string + }{ + { + desc: "with whitespaces around", + input: " example.com \n ", + domain: "", + + expected: "example.com", + }, + { + desc: "empty fragment", + + element: ElementLink, + input: "#", + domain: "", + + expected: "#", + }, + { + desc: "fragment", + + element: ElementLink, + input: "#heading", + domain: "", + + expected: "#heading", + }, + { + desc: "fragment with space", + + element: ElementLink, + input: "#my heading", + domain: "", + + expected: "#my%20heading", + 
}, + { + desc: "no domain", + + element: ElementLink, + input: "/page.html?key=val#hash", + domain: "", + + expected: "/page.html?key=val#hash", + }, + { + desc: "with domain", + + element: ElementLink, + input: "/page.html?key=val#hash", + domain: "test.com", + + expected: "http://test.com/page.html?key=val#hash", + }, + { + desc: "data uri", + + element: ElementLink, + input: "data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7", + domain: "test.com", + + expected: "data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7", + }, + { + desc: "data uri (with spaces)", + + element: ElementLink, + input: "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 56 56' width='56' height='56' %3E%3C/svg%3E", + domain: "test.com", + + expected: "data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2056%2056'%20width='56'%20height='56'%20%3E%3C/svg%3E", + }, + { + desc: "URI scheme", + + element: ElementLink, + input: "slack://open?team=abc", + domain: "test.com", + + expected: "slack://open?team=abc", + }, + + { + desc: "already with http", + + element: ElementLink, + input: "http://www.example.com", + domain: "test.com", + + expected: "http://www.example.com", + }, + { + desc: "already with https", + + element: ElementLink, + input: "https://www.example.com", + domain: "test.com", + + expected: "https://www.example.com", + }, + { + desc: "query parameters", + input: "https://www.example.com?a=1&c=2&b=3&x=&y", + domain: 
"test.com", + + // Note: If we were to use Query().Encode() the query parameters + // would be re-ordered as "?a=1&b=3&c=2". + // We want to keep the original order! + expected: "https://www.example.com?a=1&c=2&b=3&x=&y", + }, + + { + desc: "invalid url with space", + input: "https://Open Demo", + domain: "", + + expected: "https://Open%20Demo", + }, + { + desc: "invalid url with space and brackets", + input: "https://Open [foo](uri) Demo", + domain: "", + + expected: "https://Open%20%5Bfoo%5D%28uri%29%20Demo", + }, + + { + desc: "mailto", + + element: ElementLink, + input: "mailto:hi@example.com?subject=Mail&cc=someoneelse@example.com", + domain: "test.com", + + expected: "mailto:hi@example.com?subject=Mail&cc=someoneelse%40example.com", + }, + { + desc: "invalid url with newline in mailto", + + element: ElementLink, + input: "mailto:hi@example.com?body=Hello\nJohannes", + domain: "test.com", + + expected: "mailto:hi@example.com?body=Hello%0AJohannes", + }, + { + desc: "mailto with already encoded space", + + element: ElementLink, + input: "mailto:hi@example.com?subject=Hello%20Johannes", + domain: "test.com", + + expected: "mailto:hi@example.com?subject=Hello%20Johannes", + }, + { + desc: "mailto with raw space", + + element: ElementLink, + input: "mailto:hi@example.com?subject=Greetings to Johannes", + domain: "test.com", + + expected: "mailto:hi@example.com?subject=Greetings%20to%20Johannes", + }, + { + desc: "mailto with german 'ä' character", + + element: ElementLink, + input: "mailto:hi@example.com?subject=Sie können gern einen Screenshot anhängen", + domain: "test.com", + + // Note: While a space " " is allowed inside then href attribute, + // in markdown the space would cause the link to not be recognized. 
+ expected: "mailto:hi@example.com?subject=Sie%20k%C3%B6nnen%20gern%20einen%20Screenshot%20anh%C3%A4ngen", + }, + { + desc: "mailto with link", + + element: ElementLink, + input: "mailto:hi@example.com?body=Article: www.website.com/page.html", + domain: "test.com", + + expected: "mailto:hi@example.com?body=Article%3A%20www.website.com%2Fpage.html", + }, + { + desc: "brackets inside link #1", + + element: ElementLink, + input: "foo(and(bar)", + domain: "", + + expected: "foo%28and%28bar%29", + }, + { + desc: "brackets inside link #2", + + element: ElementLink, + input: "[foo](uri)", + domain: "", + + expected: "%5Bfoo%5D%28uri%29", + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + res := defaultAssembleAbsoluteURL(run.element, run.input, run.domain) + if res != run.expected { + t.Errorf("expected '%s' but got '%s'", run.expected, res) + } + }) + } +} + +func TestParseAndEncode(t *testing.T) { + runs := []struct { + desc string + + input string + + expected string + }{ + { + desc: "empty string", + input: "", + expected: "", + }, + { + desc: "one pair", + input: "a=1", + expected: "a=1", + }, + { + desc: "multiple pairs", + input: "a=1&b=2&c=3", + expected: "a=1&b=2&c=3", + }, + { + desc: "keep order of multiple pairs", + input: "a=1&c=2&b=3", + expected: "a=1&c=2&b=3", + }, + { + desc: "encode a space", + input: "a=hello world&b=hello", + expected: "a=hello+world&b=hello", + }, + + { + desc: "value with space is encoded with percent", + input: "key=%20", + expected: "key=+", + }, + { + desc: "key with space is encoded with percent", + input: "%20=value", + expected: "+=value", + }, + { + desc: "key with space is encoded with plus", + input: "key=+", + expected: "key=+", + }, + { + desc: "value with space is encoded with plus", + input: "+=value", + expected: "+=value", + }, + + { + desc: "continue on error at value", + // The error would be: + // invalid URL escape "%" + input: "a=1&b=%&c=hello world", + expected: 
"a=1&b=%&c=hello+world", + }, + { + desc: "continue on error at key", + // The error would be: + // invalid URL escape "%" + input: "a=1&%=2&c=hello world", + expected: "a=1&%=2&c=hello+world", + }, + } + + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + output := ParseAndEncodeQuery(run.input) + if output != run.expected { + t.Errorf("expected '%s' but got '%s'", run.expected, output) + } + }) + } +} diff --git a/examples/basics/main.go b/examples/basics/main.go new file mode 100644 index 0000000..8008476 --- /dev/null +++ b/examples/basics/main.go @@ -0,0 +1,19 @@ +package main + +import ( + "fmt" + "log" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" +) + +func main() { + input := `Bold Text` + + markdown, err := htmltomarkdown.ConvertString(input) + if err != nil { + log.Fatal(err) + } + fmt.Println(markdown) + // Output: **Bold Text** +} diff --git a/examples/options/main.go b/examples/options/main.go new file mode 100644 index 0000000..dc63b19 --- /dev/null +++ b/examples/options/main.go @@ -0,0 +1,29 @@ +package main + +import ( + "fmt" + "log" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func main() { + input := `Bold Text` + + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin( + commonmark.WithStrongDelimiter("__"), + // ...additional configurations for the plugin + ), + ), + ) + + markdown, err := conv.ConvertString(input) + if err != nil { + log.Fatal(err) + } + fmt.Println(markdown) + // Output: __Bold Text__ +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..7282582 --- /dev/null +++ b/go.mod @@ -0,0 +1,24 @@ +module github.com/JohannesKaufmann/html-to-markdown/v2 + +go 1.22.1 + +require golang.org/x/net v0.28.0 + +require ( + github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 + github.com/agnivade/levenshtein v1.1.1 + github.com/muesli/termenv 
v0.15.2 + github.com/sebdah/goldie/v2 v2.5.5 + github.com/yuin/goldmark v1.7.4 +) + +require ( + github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect + github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/rivo/uniseg v0.4.7 // indirect + github.com/sergi/go-diff v1.3.1 // indirect + golang.org/x/sys v0.23.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..3a6f449 --- /dev/null +++ b/go.sum @@ -0,0 +1,51 @@ +github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ= +github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk= +github.com/agnivade/levenshtein v1.1.1 h1:QY8M92nrzkmr798gCo3kmMyqXFzdQVpxLlGPRBij0P8= +github.com/agnivade/levenshtein v1.1.1/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo= +github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= +github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= +github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= +github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= +github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= +github.com/kr/pretty v0.1.0/go.mod 
h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= +github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= +github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY= +github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= +github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify 
v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/yuin/goldmark v1.7.4 h1:BDXOHExt+A7gwPCJgPIIq7ENvceR7we7rOS9TNoLZeg= +github.com/yuin/goldmark v1.7.4/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/internal/domutils/add_space.go b/internal/domutils/add_space.go new file mode 100644 index 0000000..f26c0d7 --- /dev/null +++ b/internal/domutils/add_space.go @@ -0,0 +1,66 @@ +package domutils + +import ( + "context" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func getFirstChildNode(startNode *html.Node, matchFn func(n *html.Node) bool) *html.Node { + node := startNode.FirstChild + for node != nil { + name := dom.NodeName(node) + if name == "span" { + // A span has no special meaning. So we just skip it... 
+ node = dom.GetNextNeighborNode(node) + } else if matchFn(node) { + return node + } else { + return nil + } + } + + return nil +} + +func getLastChildNode(startNode *html.Node, matchFn func(n *html.Node) bool) *html.Node { + node := startNode.LastChild + for node != nil { + name := dom.NodeName(node) + if name == "span" { + // A span has no special meaning. So we just skip it... + node = dom.GetPrevNeighborNode(node) + } else if matchFn(node) { + return node + } else { + return nil + } + } + + return nil +} +func AddSpace(ctx context.Context, doc *html.Node, isOuterNode, isInnerNode func(*html.Node) bool) { + node := doc + for node != nil { + if isOuterNode(node) { + firstChild := getFirstChildNode(node, isInnerNode) + if firstChild != nil { + prev := getPrevTextNode(node) + if prev != nil { + prev.Data = prev.Data + " " + } + } + + lastChild := getLastChildNode(node, isInnerNode) + if lastChild != nil { + next := getNextTextNode(node) + if next != nil { + next.Data = " " + next.Data + } + } + } + + node = dom.GetNextNeighborElement(node) + } +} diff --git a/internal/domutils/add_space_test.go b/internal/domutils/add_space_test.go new file mode 100644 index 0000000..72e574c --- /dev/null +++ b/internal/domutils/add_space_test.go @@ -0,0 +1,61 @@ +package domutils + +import ( + "context" + "testing" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "golang.org/x/net/html" +) + +func TestAddSpace(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "space needed before & after", + input: `beforeinline codeafter`, + expected: ` +├─body +│ ├─#text "before " +│ ├─strong +│ │ ├─code +│ │ │ ├─#text "inline code" +│ ├─#text " after" + `, + }, + { + desc: "no surrounding text", + input: `inline code`, + expected: ` +├─body +│ ├─strong +│ │ ├─code +│ │ │ ├─#text "inline code" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := 
tester.Parse(t, run.input, "") + + AddSpace(context.Background(), doc, func(n *html.Node) bool { + name := dom.NodeName(n) + if name == "strong" || name == "b" { + return true + } + if name == "em" || name == "i" { + return true + } + return false + }, func(n *html.Node) bool { + return dom.NodeName(n) == "code" + }) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} diff --git a/internal/domutils/adjacent.go b/internal/domutils/adjacent.go new file mode 100644 index 0000000..49faccb --- /dev/null +++ b/internal/domutils/adjacent.go @@ -0,0 +1,75 @@ +package domutils + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func collectAdjacentNodes(node *html.Node, matchFn func(n *html.Node) bool) []*html.Node { + var collectedNodes []*html.Node + + node = node.NextSibling + for node != nil { + name := dom.NodeName(node) + if name == "span" { + // A span has no special meaning. So we just skip it... + node = dom.GetNextNeighborNode(node) + } else if matchFn(node) { + collectedNodes = append(collectedNodes, node) + node = dom.GetNextNeighborNodeExcludingOwnChild(node) + } else { + // Return the collected nodes + return collectedNodes + } + } + + return collectedNodes +} + +func mergeChildren(destinationNode *html.Node, nodes ...*html.Node) { + for _, node := range nodes { + // We move all the children to the `destinationNode`. + children := dom.AllChildNodes(node) + for _, child := range children { + dom.RemoveNode(child) + destinationNode.AppendChild(child) + } + dom.RemoveNode(node) + } +} + +func MergeAdjacent(doc *html.Node, matchFn func(*html.Node) bool) { + node := doc + + for node != nil { + if matchFn(node) { + nextNodes := collectAdjacentNodes(node, matchFn) + + mergeChildren(node, nextNodes...) 
+ } + + node = dom.GetNextNeighborElement(node) + } +} + +// - - - - - - - - // + +func MergeAdjacentTextNodes(n *html.Node) { + if n == nil { + return + } + + var prev *html.Node + for c := n.FirstChild; c != nil; { + next := c.NextSibling + if c.Type == html.TextNode && prev != nil && prev.Type == html.TextNode { + // Combine adjacent text nodes + prev.Data += c.Data + n.RemoveChild(c) + } else { + MergeAdjacentTextNodes(c) + prev = c + } + c = next + } +} diff --git a/internal/domutils/adjacent_test.go b/internal/domutils/adjacent_test.go new file mode 100644 index 0000000..ea21b95 --- /dev/null +++ b/internal/domutils/adjacent_test.go @@ -0,0 +1,300 @@ +package domutils + +import ( + "testing" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "golang.org/x/net/html" +) + +func TestMergeAdjacent(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "don't change other tags", + input: `a b`, + expected: ` +├─body +│ ├─span +│ │ ├─#text "a" +│ ├─#text " " +│ ├─span +│ │ ├─#text "b" + `, + }, + { + desc: "don't change simple strong", + input: `a`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" + `, + }, + { + desc: "dont merge two adjacent strong tags with space between", + input: `a b`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ ├─#text " " +│ ├─strong +│ │ ├─#text "b" + `, + }, + { + desc: "merge two adjacent strong tags without space between", + input: `ab`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ │ ├─#text "b" + `, + }, + { + desc: "merge three adjacent strong tags without space between", + input: `abc`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ │ ├─#text "b" +│ │ ├─#text "c" + `, + }, + { + desc: "merge four adjacent strong tags without space between", + input: `abcd`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ │ ├─#text "b" +│ │ ├─#text "c" +│ │ ├─#text "d" + `, + }, + { + desc: "dont merge if there is 
tag content between", + input: `a

between

b`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ ├─p +│ │ ├─#text "between" +│ ├─strong +│ │ ├─#text "b" + `, + }, + { + desc: "dont merge if there is #text content between", + input: `a between b`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ ├─#text " between " +│ ├─strong +│ │ ├─#text "b" + `, + }, + { + desc: "dont merge if there is break between", + input: `a
b`, + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ ├─br +│ ├─strong +│ │ ├─#text "b" + `, + }, + { + desc: "merge three adjacent italic tags without space between", + input: `abc`, + expected: ` +├─body +│ ├─em +│ │ ├─#text "a" +│ │ ├─#text "b" +│ │ ├─#text "c" + `, + }, + + { + desc: "dont merge two nested strong tags with space between", + input: `
A
B`, + expected: ` +├─body +│ ├─div +│ │ ├─strong +│ │ │ ├─#text "A" +│ ├─#text " " +│ ├─strong +│ │ ├─#text "B" + + `, + }, + + { + desc: "(for now) dont merge nested strongs inside div", + input: `
A
B`, + expected: ` +├─body +│ ├─div +│ │ ├─strong +│ │ │ ├─#text "A" +│ ├─strong +│ │ ├─#text "B" + `, + }, + { + desc: "(for now) dont merge deeply nested strongs inside div", + input: `
A
b
`, + expected: ` +├─body +│ ├─div +│ │ ├─div +│ │ │ ├─div +│ │ │ │ ├─strong +│ │ │ │ │ ├─#text "A" +│ │ ├─div +│ │ │ ├─strong +│ │ │ │ ├─#text "b" + `, + }, + + { + desc: "dont merge two nested strong tags enclosed in a", + input: `
AB`, + expected: ` +├─body +│ ├─a (href="/") +│ │ ├─strong +│ │ │ ├─#text "A" +│ ├─strong +│ │ ├─#text "B" + `, + }, + + // - - - - - - - - - - - Span - - - - - - - - - - - // + { + desc: "merge next strong nested in span #1", + input: `

abother text

`, + expected: ` +├─body +│ ├─p +│ │ ├─strong +│ │ │ ├─#text "a" +│ │ │ ├─#text "b" +│ │ ├─span +│ │ ├─#text "other text" + `, + }, + { + desc: "merge next strong nested in span #2", + input: `

abother text

`, + expected: ` +├─body +│ ├─p +│ │ ├─strong +│ │ │ ├─#text "a" +│ │ │ ├─#text "b" +│ │ ├─span +│ │ │ ├─span +│ │ ├─#text "other text" + `, + }, + { + desc: "merge next strong nested in span #3", + input: `

abcother text

`, + expected: ` +├─body +│ ├─p +│ │ ├─strong +│ │ │ ├─#text "a" +│ │ │ ├─#text "b" +│ │ │ ├─#text "c" +│ │ ├─span +│ │ ├─span +│ │ │ ├─#text "other text" + `, + }, + { + desc: "dont merge other span tags", + input: `

aother text

`, + expected: ` +├─body +│ ├─p +│ │ ├─strong +│ │ │ ├─#text "a" +│ │ ├─span +│ │ │ ├─#text "other text" + `, + }, + { + desc: "dont merge span content if space between", + input: `

a b

`, + expected: ` +├─body +│ ├─p +│ │ ├─strong +│ │ │ ├─#text "a" +│ │ ├─span +│ │ │ ├─#text " " +│ │ │ ├─strong +│ │ │ │ ├─#text "b" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + MergeAdjacent(doc, func(n *html.Node) bool { + name := dom.NodeName(n) + return name == "strong" || name == "em" + }) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} + +func TestMergeAdjacentTextNodes(t *testing.T) { + div := &html.Node{ + Type: html.ElementNode, + Data: "div", + } + textOne := &html.Node{ + Type: html.TextNode, + Data: "one", + } + textTwo := &html.Node{ + Type: html.TextNode, + Data: "two", + } + textThree := &html.Node{ + Type: html.TextNode, + Data: "three", + } + div.AppendChild(textOne) + div.AppendChild(textTwo) + div.AppendChild(textThree) + + MergeAdjacentTextNodes(div) + + expected := ` +div +├─#text "onetwothree" + ` + tester.ExpectRepresentation(t, div, "output", expected) +} diff --git a/internal/domutils/alternatives.go b/internal/domutils/alternatives.go new file mode 100644 index 0000000..a3da350 --- /dev/null +++ b/internal/domutils/alternatives.go @@ -0,0 +1,114 @@ +package domutils + +import ( + "context" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +// TODO: make this configurable via the options??? +func getMarkdownStructure(name string) string { + switch name { + case "#document", "html", "head", "body", + "blockquote", "ul", "ol", "li": + // A container block can also contain other blocks. + return "container_block" + + // Note: "p" would also be part of "leaf_block" + case "hr", "pre", + "h1", "h2", "h3", "h4", "h5", "h6": + // Leaf blocks can contain inline content + // but NOT other blocks. + return "leaf_block" + + case "#text", "span", "code", + "b", "strong", "i", "em", + "a", "img", "br": + return "inline" + + case "div", "p": + // Since these are just placing newlines, + // we dont categorize them. 
+ return "" + + default: + return "" + } +} + +func headingAlternative(ctx context.Context, node *html.Node) { + node.Data = "strong" + + newChild := &html.Node{ + Type: html.ElementNode, + Data: "br", + } + node.Parent.InsertBefore(newChild, node.NextSibling) +} +func blockquoteAlternative(ctx context.Context, node *html.Node) { + newBefore := &html.Node{Type: html.TextNode, Data: ` "`} + node.Parent.InsertBefore(newBefore, node) + + node.Data = "span" + + newAfter := &html.Node{Type: html.TextNode, Data: `" `} + node.Parent.InsertBefore(newAfter, node.NextSibling) +} +func preAlternative(ctx context.Context, node *html.Node) { + node.Data = "code" +} +func hrAlternative(ctx context.Context, node *html.Node) { + dom.RemoveNode(node) +} + +// TODO: make this configurable via the options? +var alternatives = map[string]func(ctx context.Context, node *html.Node){ + "h1": headingAlternative, + "h2": headingAlternative, + "h3": headingAlternative, + "h4": headingAlternative, + "h5": headingAlternative, + "h6": headingAlternative, + "blockquote": blockquoteAlternative, + "pre": preAlternative, + "hr": hrAlternative, +} + +func LeafBlockAlternatives(ctx context.Context, doc *html.Node) { + var finder func(node *html.Node, isInsideLeafBlock bool, isInsideInline bool) + finder = func(node *html.Node, isInsideLeafBlock bool, isInsideInline bool) { + name := dom.NodeName(node) + + structure := getMarkdownStructure(name) + if (structure == "container_block" || structure == "leaf_block") && (isInsideLeafBlock || isInsideInline) { + // A block inside an inline OR a block inside a leaf-block + // is not valid markdown so cannot be rendered. + // + // For example, you cannot place a blockquote inside a heading. 
+ // + // Instead of this weird output (## Heading > My Quote) + // we try to find alternatives (## Heading "My Quote") + fn, ok := alternatives[name] + if ok { + fn(ctx, node) + } else { + node.Data = "span" + } + } + + // - - - - - - - - - - - - - - - - - - - - - - // + + if structure == "leaf_block" { + isInsideLeafBlock = true + } + if structure == "inline" { + isInsideInline = true + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + defer finder(child, isInsideLeafBlock, isInsideInline) + } + } + finder(doc, false, false) +} diff --git a/internal/domutils/alternatives_test.go b/internal/domutils/alternatives_test.go new file mode 100644 index 0000000..6184c6d --- /dev/null +++ b/internal/domutils/alternatives_test.go @@ -0,0 +1,82 @@ +package domutils + +import ( + "context" + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" +) + +func TestLeafBlockAlternatives(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "divider in heading", + input: `

Heading

`, + expected: ` +├─body +│ ├─h3 +│ │ ├─#text "Heading" + `, + }, + { + desc: "simple", + input: `

Heading

`, + expected: ` +├─body +│ ├─a (href="/page.html") +│ │ ├─strong +│ │ │ ├─#text "Heading" +│ │ ├─br + `, + }, + { + desc: "two headings", + input: `

Heading A

Heading B

`, + expected: ` +├─body +│ ├─a (href="/page.html") +│ │ ├─strong +│ │ │ ├─#text "Heading A" +│ │ ├─br +│ │ ├─strong +│ │ │ ├─#text "Heading B" +│ │ ├─br + `, + }, + { + desc: "two headings formatted", + input: ` + +

Heading A

+

Heading B

+
+ `, + expected: ` +├─body +│ ├─a (href="/page.html") +│ │ ├─#text "\n\t" +│ │ ├─strong +│ │ │ ├─#text "Heading A" +│ │ ├─br +│ │ ├─#text "\n\t" +│ │ ├─strong +│ │ │ ├─#text "Heading B" +│ │ ├─br +│ │ ├─#text "\n" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + LeafBlockAlternatives(context.TODO(), doc) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} diff --git a/internal/domutils/domutils.go b/internal/domutils/domutils.go new file mode 100644 index 0000000..e0766c7 --- /dev/null +++ b/internal/domutils/domutils.go @@ -0,0 +1,43 @@ +package domutils + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func getNextTextNode(startNode *html.Node) *html.Node { + node := dom.GetNextNeighborNodeExcludingOwnChild(startNode) + + for node != nil { + if node.Type == html.TextNode { + return node + } + + if dom.NodeName(node) == "span" { + // A span has no special meaning. So we just skip it... + node = dom.GetNextNeighborNode(node) + continue + } + + return nil + } + return nil +} +func getPrevTextNode(startNode *html.Node) *html.Node { + node := dom.GetPrevNeighborNodeExcludingOwnChild(startNode) + + for node != nil { + if node.Type == html.TextNode { + return node + } + + if dom.NodeName(node) == "span" { + // A span has no special meaning. So we just skip it... 
+ node = dom.GetPrevNeighborNode(node) + continue + } + + return nil + } + return nil +} diff --git a/internal/domutils/empty_code.go b/internal/domutils/empty_code.go new file mode 100644 index 0000000..d6d81d1 --- /dev/null +++ b/internal/domutils/empty_code.go @@ -0,0 +1,42 @@ +package domutils + +import ( + "context" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func hasTextChildNodes(startNode *html.Node) bool { + var found bool + + var finder func(*html.Node) + finder = func(node *html.Node) { + if node.Type == html.TextNode && node.Data != "" { + found = true + return + } + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(startNode) + + return found +} + +func RemoveEmptyCode(ctx context.Context, doc *html.Node) { + node := doc + for node != nil { + if dom.NodeName(node) == "code" && !hasTextChildNodes(node) { + next := dom.GetNextNeighborNodeExcludingOwnChild(node) + + dom.RemoveNode(node) + + node = next + continue + } + + node = dom.GetNextNeighborNode(node) + } +} diff --git a/internal/domutils/empty_code_test.go b/internal/domutils/empty_code_test.go new file mode 100644 index 0000000..d504d22 --- /dev/null +++ b/internal/domutils/empty_code_test.go @@ -0,0 +1,74 @@ +package domutils + +import ( + "context" + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" +) + +func TestRemoveEmptyCode(t *testing.T) { + runs := []struct { + desc string + input string + + expectedBefore string + expectedAfter string + }{ + { + desc: "", + input: `

before

middle
after

`, + expectedBefore: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─code +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "middle" +│ ├─#text "after" +│ ├─p + `, + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "middle" +│ ├─#text "after" +│ ├─p + `, + }, + { + desc: "two empty code nodes", + input: `

between

`, + expectedBefore: ` +├─body +│ ├─p +│ │ ├─code +│ ├─#text "between" +│ ├─p +│ │ ├─code + `, + expectedAfter: ` +├─body +│ ├─p +│ ├─#text "between" +│ ├─p + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + tester.ExpectRepresentation(t, doc, "before", run.expectedBefore) + + RemoveEmptyCode(context.TODO(), doc) + + tester.ExpectRepresentation(t, doc, "output", run.expectedAfter) + }) + } + +} diff --git a/internal/domutils/list_end_comment.go b/internal/domutils/list_end_comment.go new file mode 100644 index 0000000..e401736 --- /dev/null +++ b/internal/domutils/list_end_comment.go @@ -0,0 +1,74 @@ +package domutils + +import ( + "context" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +var ListEndCommentData = "THE END" + +func AddListEndComments(ctx context.Context, doc *html.Node) { + node := doc + for node != nil { + if nameIsList(node) && nextNameIsList(node) { + insertComment(node) + } + + node = dom.GetNextNeighborElement(node) + } +} + +func nameIsList(node *html.Node) bool { + name := dom.NodeName(node) + return name == "ul" || name == "ol" +} + +func insertComment(listNode *html.Node) { + comment := &html.Node{ + Type: html.CommentNode, + Data: ListEndCommentData, + } + listNode.Parent.InsertBefore(comment, listNode.NextSibling) +} + +func nextNameIsList(startNode *html.Node) bool { + node := dom.GetNextNeighborNodeExcludingOwnChild(startNode) + + for node != nil { + name := dom.NodeName(node) + if name == "ul" || name == "ol" { + return true + } + if name == "li" { + return false + } + if name == "#comment" && node.Data == ListEndCommentData { + return false + } + + // If there is any text between two lists + // they are automatically not connected anymore. + if node.Type == html.TextNode { + return false + } + + // - - - - // + + if name == "hr" { + // A divider already separates two lists... 
+ return false + } + + // TODO: RunContext.Render() + // -> get access to keepRemoveMap + + // TODO: look in the KeepRemoveMap? + // e.g. ul then script then ul + + node = dom.GetNextNeighborNode(node) + continue + } + return false +} diff --git a/internal/domutils/redundant.go b/internal/domutils/redundant.go new file mode 100644 index 0000000..d4ebae2 --- /dev/null +++ b/internal/domutils/redundant.go @@ -0,0 +1,29 @@ +package domutils + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func RemoveRedundant(doc *html.Node, matchFn func(*html.Node, *html.Node) bool) { + for _, node := range dom.AllNodes(doc) { + if hasSameTypeAncestor(node, matchFn) { + dom.UnwrapNode(node) + } + } + +} + +func hasSameTypeAncestor(n *html.Node, matchFn func(*html.Node, *html.Node) bool) bool { + if !matchFn(n, n) { + return false + } + + for p := n.Parent; p != nil; p = p.Parent { + if matchFn(n, p) { + return true + } + } + + return false +} diff --git a/internal/domutils/redundant_test.go b/internal/domutils/redundant_test.go new file mode 100644 index 0000000..357ebfe --- /dev/null +++ b/internal/domutils/redundant_test.go @@ -0,0 +1,115 @@ +package domutils + +import ( + "testing" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "golang.org/x/net/html" +) + +func TestRemoveRedundant(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "don't change other tags", + input: `a b`, + expected: ` +├─body +│ ├─span +│ │ ├─#text "a" +│ ├─#text " " +│ ├─span +│ │ ├─#text "b" + `, + }, + { + desc: "don't change simple strong", + input: `a`, + + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" + `, + }, + { + desc: "remove double strong", + input: `a`, + + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" + `, + }, + { + desc: "remove more complicated double strong", + input: `a b c`, + + expected: ` +├─body +│ ├─strong +│ │ ├─#text "a" +│ │ ├─#text " b " 
+│ │ ├─#text "c" + `, + }, + + { + desc: "leave italic inside bold", + input: `ABC`, + + expected: ` +├─body +│ ├─strong +│ │ ├─#text "A" +│ │ ├─em +│ │ │ ├─#text "B" +│ │ ├─#text "C" + `, + }, + { + desc: "dont leave other italic inside another italic", + input: `ABC`, + + expected: ` +├─body +│ ├─i +│ │ ├─#text "A" +│ │ ├─#text "B" +│ │ ├─#text "C" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + RemoveRedundant(doc, func(a, b *html.Node) bool { + isItalic := func(n *html.Node) bool { + name := dom.NodeName(n) + return name == "em" || name == "i" + } + isBold := func(n *html.Node) bool { + name := dom.NodeName(n) + return name == "strong" || name == "b" + } + + if isItalic(a) && isItalic(b) { + return true + } + if isBold(a) && isBold(b) { + return true + } + + return false + }) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} diff --git a/internal/domutils/span.go b/internal/domutils/span.go new file mode 100644 index 0000000..638d24f --- /dev/null +++ b/internal/domutils/span.go @@ -0,0 +1,50 @@ +package domutils + +import ( + "context" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func isFakeSpan(node *html.Node) bool { + name := dom.NodeName(node) + if name != "span" { + return false + } + + var containsBlockNode = false + + var finder func(*html.Node) + finder = func(node *html.Node) { + name := dom.NodeName(node) + if dom.NameIsBlockNode(name) { + containsBlockNode = true + return + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(node) + + return containsBlockNode +} + +// RenameFakeSpans renames all "span" nodes to "div" if +// any block element is found as a child. 
+func RenameFakeSpans(ctx context.Context, doc *html.Node) { + var finder func(node *html.Node) + finder = func(node *html.Node) { + if isFakeSpan(node) { + node.Data = "div" + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + + finder(doc) +} diff --git a/internal/domutils/span_test.go b/internal/domutils/span_test.go new file mode 100644 index 0000000..e54f19b --- /dev/null +++ b/internal/domutils/span_test.go @@ -0,0 +1,82 @@ +package domutils + +import ( + "context" + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" +) + +func TestRenameFakeSpans(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "don't change other tags", + input: `

a

b

`, + expected: ` +├─body +│ ├─p +│ │ ├─#text "a" +│ ├─#text " " +│ ├─p +│ │ ├─#text "b" + `, + }, + { + desc: "don't change simple span", + input: `a`, + + expected: ` +├─body +│ ├─span +│ │ ├─#text "a" + `, + }, + { + desc: "don't change span with inline element", + input: `link content`, + + expected: ` +├─body +│ ├─span +│ │ ├─a +│ │ │ ├─#text "link content" + `, + }, + { + desc: "change span with block element", + input: `

paragraph content

`, + + expected: ` +├─body +│ ├─div +│ │ ├─p +│ │ │ ├─#text "paragraph content" + `, + }, + { + desc: "change multiple spans with block element", + input: `

paragraph content

`, + + expected: ` +├─body +│ ├─div +│ │ ├─div +│ │ │ ├─p +│ │ │ │ ├─#text "paragraph content" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + RenameFakeSpans(context.TODO(), doc) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} diff --git a/internal/domutils/swap.go b/internal/domutils/swap.go new file mode 100644 index 0000000..b161707 --- /dev/null +++ b/internal/domutils/swap.go @@ -0,0 +1,51 @@ +package domutils + +import ( + "context" + "slices" + "strings" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func swapTagsOfNodes(node1, node2 *html.Node) { + if node1.Type != html.ElementNode || node2.Type != html.ElementNode { + panic("swap only works with element nodes") + } + + tempDataAtom := node1.DataAtom + tempData := node1.Data + tempAttr := node1.Attr + + node1.DataAtom = node2.DataAtom + node1.Data = node2.Data + node1.Attr = node2.Attr + + node2.DataAtom = tempDataAtom + node2.Data = tempData + node2.Attr = tempAttr +} + +func isEmptyText(node *html.Node) bool { + return node.Type == html.TextNode && strings.TrimSpace(node.Data) == "" +} +func SwapTags(ctx context.Context, doc *html.Node, isOuterNode, isInnerNode func(*html.Node) bool) { + var finder func(*html.Node) + finder = func(node *html.Node) { + if isOuterNode(node) { + childs := dom.AllChildNodes(node) + childs = slices.DeleteFunc(childs, isEmptyText) + + if len(childs) == 1 && isInnerNode(childs[0]) { + swapTagsOfNodes(node, childs[0]) + return + } + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(doc) +} diff --git a/internal/domutils/swap_test.go b/internal/domutils/swap_test.go new file mode 100644 index 0000000..ee531cd --- /dev/null +++ b/internal/domutils/swap_test.go @@ -0,0 +1,567 @@ +package domutils + +import ( + "context" + "testing" + + "github.com/JohannesKaufmann/dom" + 
"github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +func generateANodes() *html.Node { + div := &html.Node{ + Namespace: "", + Type: html.ElementNode, + DataAtom: atom.Div, + + Attr: []html.Attribute{}, + Data: "div", + } + link := &html.Node{ + Namespace: "", + Type: html.ElementNode, + DataAtom: atom.A, + + Attr: []html.Attribute{ + { + Namespace: "", + Key: "KeyA", + Val: "ValA", + }, + }, + + Data: "a", + } + text := &html.Node{ + Namespace: "", + Type: html.TextNode, + DataAtom: 0, + + Data: "ContentA", + } + link.AppendChild(text) + div.AppendChild(link) + + return div +} +func generateBNodes() *html.Node { + div := &html.Node{ + Namespace: "", + Type: html.ElementNode, + DataAtom: atom.Main, + + Attr: []html.Attribute{}, + Data: "main", + } + link := &html.Node{ + Namespace: "", + Type: html.ElementNode, + DataAtom: atom.B, + + Attr: []html.Attribute{ + { + Namespace: "", + Key: "KeyB", + Val: "ValB", + }, + }, + + Data: "b", + } + text := &html.Node{ + Namespace: "", + Type: html.TextNode, + DataAtom: 0, + + Data: "ContentB", + } + link.AppendChild(text) + div.AppendChild(link) + + return div +} +func TestSwapTagsOfNodes_Basics(t *testing.T) { + a := generateANodes() + b := generateBNodes() + + swapTagsOfNodes(a.FirstChild, b.FirstChild) + + // These fields should have changed: + if a.FirstChild.DataAtom != atom.B { + t.Error("expected different a atom") + } + if a.FirstChild.Data != "b" { + t.Error("expected different a data") + } + if len(a.FirstChild.Attr) != 1 { + t.Error("expected different a attributes length") + } + if a.FirstChild.Attr[0].Key != "KeyB" { + t.Error("expected different a attribute key") + } + if a.FirstChild.Attr[0].Val != "ValB" { + t.Error("expected different a attribute key") + } + + // The pointers should NOT have changed: + if a.FirstChild.Parent.Data != "div" { + t.Error("expected the same parent for a") + } + if a.FirstChild.FirstChild.Data != 
"ContentA" { + t.Error("expected the same text for a") + } + + // - - - // + + if b.FirstChild.DataAtom != atom.A { + t.Error("expected different b atom") + } + if b.FirstChild.Data != "a" { + t.Error("expected different b data") + } + if len(b.FirstChild.Attr) != 1 { + t.Error("expected different b attributes length") + } + if b.FirstChild.Attr[0].Key != "KeyA" { + t.Error("expected different b attribute key") + } + if b.FirstChild.Attr[0].Val != "ValA" { + t.Error("expected different b attribute key") + } + + // The pointers should NOT have changed: + if b.FirstChild.Parent.Data != "main" { + t.Error("expected the same parent for b") + } + if b.FirstChild.FirstChild.Data != "ContentB" { + t.Error("expected the same text for b") + } +} + +func TestSwapTags_HeadingLink(t *testing.T) { + runs := []struct { + desc string + + input string + startFrom string + + expected string + }{ + { + desc: "simple", + + input: `

Heading

`, + startFrom: "body", + + expected: ` +├─body +│ ├─h3 +│ │ ├─a (href="/page.html") +│ │ │ ├─#text "Heading" + `, + }, + { + desc: "simple with whitespace", + + input: ` + +

Heading

+
+ `, + startFrom: "body", + + expected: ` +├─body +│ ├─h3 +│ │ ├─#text "\n\t" +│ │ ├─a (href="/page.html") +│ │ │ ├─#text "Heading" +│ │ ├─#text "\n" + `, + }, + { + desc: "more content", + + input: ` + +

ReiseinspirationBeste Orte in Berlin

+
+ `, + startFrom: "body", + + expected: ` +├─body +│ ├─h3 +│ │ ├─#text "\n\t" +│ │ ├─a (href="/reisen") +│ │ │ ├─span +│ │ │ │ ├─#text "Reiseinspiration" +│ │ │ ├─span +│ │ │ │ ├─#text "Beste Orte in Berlin" +│ │ ├─#text "\n" + `, + }, + { + desc: "not possible", + + input: ` + +

Heading

+

Some other content

+
+ `, + startFrom: "body", + + expected: ` +├─body +│ ├─a (href="/page.html") +│ │ ├─#text "\n\t" +│ │ ├─h3 +│ │ │ ├─#text "Heading" +│ │ ├─#text "\n\t" +│ │ ├─p +│ │ │ ├─#text "Some other content" +│ │ ├─#text "\n" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, run.startFrom) + + isLink := func(n *html.Node) bool { + return dom.NodeName(n) == "a" + } + isHeading := func(n *html.Node) bool { + name := dom.NodeName(n) + + if name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" { + return true + } + return false + } + SwapTags(context.TODO(), doc, isLink, isHeading) + + tester.ExpectRepresentation(t, doc, "output", run.expected) + + }) + } +} + +func TestSwapTags_PreCode(t *testing.T) { + runs := []struct { + desc string + input string + + expectedBefore string + expectedAfter string + }{ + + // - - - - - Pre - - - - - // + { + desc: "div with pre: keep", + input: "
content
", + + expectedBefore: ` +├─body +│ ├─div +│ │ ├─pre +│ │ │ ├─#text "content" + `, + expectedAfter: ` +├─body +│ ├─div +│ │ ├─pre +│ │ │ ├─#text "content" + `, + }, + { + desc: "p with pre: keep", + input: "

content

", + + // The
 is a block node, so cannot be in a paragraph.
+			expectedBefore: `
+├─body
+│ ├─p
+│ ├─pre
+│ │ ├─#text "content"
+│ ├─p
+			`,
+			expectedAfter: `
+├─body
+│ ├─p
+│ ├─pre
+│ │ ├─#text "content"
+│ ├─p
+			`,
+		},
+		// - - - - - Code - - - - - //
+		{
+			desc:  "div with code: keep",
+			input: "
content
", + + expectedBefore: ` +├─body +│ ├─div +│ │ ├─code +│ │ │ ├─#text "content" + `, + expectedAfter: ` +├─body +│ ├─div +│ │ ├─code +│ │ │ ├─#text "content" + `, + }, + { + desc: "p with code: keep", + input: "

content

", + + expectedBefore: ` +├─body +│ ├─p +│ │ ├─code +│ │ │ ├─#text "content" + `, + expectedAfter: ` +├─body +│ ├─p +│ │ ├─code +│ │ │ ├─#text "content" + `, + }, + + // - - - - - Nested in correct order - - - - - // + { + desc: "keep correct code block", + input: `
content
`, + + expectedBefore: ` +├─body +│ ├─div +│ │ ├─pre +│ │ │ ├─code +│ │ │ │ ├─#text "content" + `, + expectedAfter: ` +├─body +│ ├─div +│ │ ├─pre +│ │ │ ├─code +│ │ │ │ ├─#text "content" + `, + }, + // - - - - - Nested in wrong order - - - - - // + { + desc: "swap wrong code block", + input: `
content
`, + + expectedBefore: ` +├─body +│ ├─div +│ │ ├─code +│ │ │ ├─pre +│ │ │ │ ├─#text "content" + `, + expectedAfter: ` +├─body +│ ├─div +│ │ ├─pre +│ │ │ ├─code +│ │ │ │ ├─#text "content" + `, + }, + { + desc: "html parsing already causes swap", + input: `

content

`, + + // Notice how the html parsing already looks different... + expectedBefore: ` +├─body +│ ├─p +│ │ ├─code +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "content" +│ ├─p + `, + expectedAfter: ` +├─body +│ ├─p +│ │ ├─code +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "content" +│ ├─p + `, + }, + + { + desc: "different ast then expected", + input: `

beforea

b
cafter

`, + + expectedBefore: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─code +│ │ │ ├─#text "a" +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "b" +│ ├─code +│ │ ├─#text "c" +│ ├─#text "after" +│ ├─p + `, + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─code +│ │ │ ├─#text "a" +│ ├─pre +│ │ ├─code +│ │ │ ├─#text "b" +│ ├─code +│ │ ├─#text "c" +│ ├─#text "after" +│ ├─p + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + tester.ExpectRepresentation(t, doc, "before", run.expectedBefore) + + isCode := func(n *html.Node) bool { + return dom.NodeName(n) == "code" + } + isPre := func(n *html.Node) bool { + return dom.NodeName(n) == "pre" + } + SwapTags(context.TODO(), doc, isCode, isPre) + + tester.ExpectRepresentation(t, doc, "output", run.expectedAfter) + }) + } +} + +func TestSwapTags_StrongLinks(t *testing.T) { + runs := []struct { + desc string + input string + + expectedAfter string + }{ + { + desc: "swap strong and link", + input: `

beforemiddleafter

`, + + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─a (href="/") +│ │ │ ├─strong +│ │ │ │ ├─#text "middle" +│ │ ├─#text "after" + `, + }, + { + desc: "empty span", + input: `

beforewith empty spanafter

`, + + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─strong +│ │ │ ├─span +│ │ │ ├─a (href="/") +│ │ │ │ ├─#text "with empty span" +│ │ │ ├─span +│ │ ├─#text "after" + `, + }, + { + desc: "span with spaces", + input: `

before with empty span after

`, + + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─strong +│ │ │ ├─span +│ │ │ │ ├─#text " " +│ │ │ ├─a (href="/") +│ │ │ │ ├─#text "with empty span" +│ │ │ ├─span +│ │ │ │ ├─#text " " +│ │ ├─#text "after" + `, + }, + { + desc: "spans nested", + input: `

before with empty span after

`, + + expectedAfter: ` +├─body +│ ├─p +│ │ ├─#text "before" +│ │ ├─strong +│ │ │ ├─span +│ │ │ │ ├─span +│ │ │ │ │ ├─#text " " +│ │ │ │ ├─#text " " +│ │ │ ├─a (href="/") +│ │ │ │ ├─#text "with empty span" +│ │ │ ├─span +│ │ │ │ ├─span +│ │ │ │ │ ├─#text " " +│ │ │ │ ├─#text " " +│ │ ├─#text "after" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + isBoldOrItalic := func(node *html.Node) bool { + name := dom.NodeName(node) + if name == "strong" || name == "b" { + return true + } + if name == "em" || name == "i" { + return true + } + + return false + } + + isLink := func(node *html.Node) bool { + return dom.NodeName(node) == "a" + } + + // Remove all unnessesary span tags + // for _, node := range dom.GetAllNodes(doc) { + // name := dom.NodeName(node) + // if name == "span" { + // dom.UnwrapNode(node) + // } + // } + + // collapse.Collapse(doc) + + SwapTags(context.TODO(), doc, isBoldOrItalic, isLink) + + tester.ExpectRepresentation(t, doc, "output", run.expectedAfter) + }) + } +} diff --git a/internal/escape/elem_backslash.go b/internal/escape/elem_backslash.go new file mode 100644 index 0000000..a5aba2c --- /dev/null +++ b/internal/escape/elem_backslash.go @@ -0,0 +1,9 @@ +package escape + +func IsBackslash(chars []byte, index int) int { + if chars[index] != '\\' { + return -1 + } + + return 1 +} diff --git a/internal/escape/elem_code.go b/internal/escape/elem_code.go new file mode 100644 index 0000000..65adaaa --- /dev/null +++ b/internal/escape/elem_code.go @@ -0,0 +1,45 @@ +package escape + +func IsFencedCode(chars []byte, index int) int { + if chars[index] != '`' && chars[index] != '~' { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == ' ' || chars[i] == placeholderByte { + continue + } + if chars[i] == '\n' { + break + } + + return -1 + } + + count := 1 + i := index + 1 + for ; i < len(chars); i++ { + if chars[i] == placeholderByte { + continue + } + if chars[i] 
== '`' || chars[i] == '~' { + count++ + continue + } + + break + } + if count < 3 { + return -1 + } + + return i - index +} + +func IsInlineCode(chars []byte, index int) int { + if chars[index] != '`' { + return -1 + } + + return 1 +} diff --git a/internal/escape/elem_code_test.go b/internal/escape/elem_code_test.go new file mode 100644 index 0000000..ae89695 --- /dev/null +++ b/internal/escape/elem_code_test.go @@ -0,0 +1,129 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsFencedCode(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "only two", + chars: []byte{placeholderByte, '`', placeholderByte, '`', 'a'}, + expected: []int{-1, -1, -1, -1, -1}, + }, + { + name: "other chars before", + chars: []byte{'a', ' ', placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', 'a'}, + expected: []int{-1, -1, -1, -1, -1, -1, -1, -1, -1}, + }, + { + name: "just beginning", + chars: []byte{placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', 'a'}, + expected: []int{-1, 5, -1, -1, -1, -1, -1}, + }, + { + name: "just beginning (with tilde)", + chars: []byte{placeholderByte, '~', placeholderByte, '~', placeholderByte, '~', 'a'}, + expected: []int{-1, 5, -1, -1, -1, -1, -1}, + }, + { + name: "just beginning (with space before)", + chars: []byte{' ', placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', 'a'}, + expected: []int{-1, -1, 5, -1, -1, -1, -1, -1}, + }, + { + name: "just beginning (with newline before)", + chars: []byte{'\n', placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', 'a'}, + expected: []int{-1, -1, 5, -1, -1, -1, -1, -1}, + }, + { + name: "simple", + chars: []byte{placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', '\n', 'a', '\n', placeholderByte, '`', placeholderByte, '`', placeholderByte, '`'}, + expected: []int{-1, 5, -1, -1, -1, 
-1, -1, -1, -1, -1, 5, -1, -1, -1, -1}, + }, + { + name: "only one end delimiter", + chars: []byte{placeholderByte, '`', placeholderByte, '`', placeholderByte, '`', '\n', 'a', '\n', placeholderByte, '`'}, + expected: []int{-1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + }, + } + for _, run := range runs { + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsFencedCode(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v (l:%d) but got %+v (l:%d)", run.expected, len(run.expected), actual, len(actual)) + } + + }) + } +} + +func TestIsInlineCode(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + // { + // name: "one delimiter inside text, no end delimiter", + // chars: []byte{'a', '`', 'b', ' ', '\n', '\n', 'a'}, + // expected: []int{-1, -1, -1}, + // }, + { + name: "simple", + chars: []byte{'`', 'a', '`'}, + // expected: []int{1, -1, -1}, + expected: []int{1, -1, 1}, + }, + // { + // name: "without content", + // chars: []byte{'`', '`'}, + // expected: []int{1, -1}, + // }, + // { + // name: "code inside normal text", + // chars: []byte{'a', '`', 'b', '`', 'a'}, + // expected: []int{-1, 3, -1, -1, -1}, + // }, + + // TODO: also nested: ``some `code` text`` + + } + for _, run := range runs { + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsInlineCode(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + } +} diff --git a/internal/escape/elem_divider.go b/internal/escape/elem_divider.go new file mode 100644 index 0000000..25ec9af --- /dev/null +++ b/internal/escape/elem_divider.go @@ -0,0 +1,47 @@ +package escape + +func 
IsDivider(chars []byte, index int) int { + if chars[index] != '-' && chars[index] != '_' && chars[index] != '*' { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == '\n' { + break + } + if chars[i] == placeholderByte { + continue + } + if chars[i] == ' ' { + continue + } + + return -1 + } + + count := 1 + lastChar := len(chars) + for i := index + 1; i < len(chars); i++ { + if chars[i] == placeholderByte { + continue + } + if chars[i] == ' ' { + continue + } + if chars[i] == chars[index] { + count++ + continue + } + if chars[i] == '\n' { + lastChar = i + break + } + + return -1 + } + + if count >= 3 { + return lastChar - index + } + return -1 +} diff --git a/internal/escape/elem_divider_test.go b/internal/escape/elem_divider_test.go new file mode 100644 index 0000000..e02445e --- /dev/null +++ b/internal/escape/elem_divider_test.go @@ -0,0 +1,79 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsDivider(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "two dashes", + chars: []byte{'-', '-'}, + expected: []int{-1, -1}, + }, + { + name: "char afterwards", + chars: []byte{'-', '-', '-', ' ', 'a'}, + expected: []int{-1, -1, -1, -1, -1}, + }, + + { + name: "three dashes", + chars: []byte{'-', '-', '-'}, + expected: []int{3, -1, -1}, + }, + { + name: "five dashes", + chars: []byte{'-', '-', '-', '-', '-'}, + expected: []int{5, -1, -1, -1, -1}, + }, + { + name: "space after", + chars: []byte{'-', '-', '-', ' '}, + expected: []int{4, -1, -1, -1}, + }, + { + name: "newline after", + chars: []byte{'-', '-', '-', '\n'}, + expected: []int{3, -1, -1, -1}, + }, + + { + name: "newline and space before", + chars: []byte{'\n', ' ', '-', '-', '-'}, + expected: []int{-1, -1, 3, -1, -1}, + }, + { + name: "with placeholders", + chars: []byte{'\n', ' ', placeholderByte, '-', placeholderByte, '-', 
placeholderByte, '-'}, + expected: []int{-1, -1, -1, 5, -1, -1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsDivider(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} diff --git a/internal/escape/elem_header.go b/internal/escape/elem_header.go new file mode 100644 index 0000000..77fee06 --- /dev/null +++ b/internal/escape/elem_header.go @@ -0,0 +1,77 @@ +package escape + +func IsAtxHeader(chars []byte, index int) int { + if chars[index] != '#' { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == '\n' { + break + } + if chars[i] == placeholderByte { + continue + } + if chars[i] == ' ' { + continue + } + return -1 + } + + nPoundSigns := 1 + for i := index + 1; i < len(chars); i++ { + if chars[i] == '#' { + nPoundSigns++ + + if nPoundSigns > 6 { + return -1 + } + continue + } + + if chars[i] == placeholderByte { + continue + } + if chars[i] == ' ' || chars[i] == '\t' || chars[i] == '\n' || chars[i] == '\r' { + // TODO: fix calculation with placeholder (maybe own for loop construct?) + // Returns the count of # that we encountered + return i - index + } + + return -1 + } + return 1 +} + +func IsSetextHeader(chars []byte, index int) int { + if chars[index] != '=' && chars[index] != '-' { + return -1 + } + + var newlineCount int + for i := index - 1; i >= 0; i-- { + if chars[i] == placeholderByte || chars[i] == ' ' { + continue + } + + if chars[i] == '\n' { + newlineCount++ + continue + } + + if newlineCount == 0 { + // Without any newlines, this character is on the same line + // as the delimiter. So the delimiter is inside a normal text... + return -1 + } else if newlineCount == 1 { + // The heading content is on the line above the delimiter + // which qualifies for a setext heading... 
+ return 1 + } else { + return -1 + } + + } + + return -1 +} diff --git a/internal/escape/elem_header_test.go b/internal/escape/elem_header_test.go new file mode 100644 index 0000000..c66b1fb --- /dev/null +++ b/internal/escape/elem_header_test.go @@ -0,0 +1,157 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsAtxHeader(t *testing.T) { + + runs := []struct { + name string + chars []rune + + expected []int + }{ + { + name: "not needed", + chars: []rune{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "inside text", + chars: []rune{'a', placeholderRune, '#', ' ', 'a'}, + expected: []int{-1, -1, -1, -1, -1}, + }, + { + name: "inside text with space between", + chars: []rune{'a', ' ', placeholderRune, '#', ' ', 'a'}, + expected: []int{-1, -1, -1, -1, -1, -1}, + }, + { + name: "h1 at start of file", + chars: []rune{placeholderRune, '#', ' ', 'a', 'b'}, + expected: []int{-1, 1, -1, -1, -1}, + }, + { + name: "h1 at start of line", + chars: []rune{'\n', placeholderRune, '#', ' ', 'a', 'b'}, + expected: []int{-1, -1, 1, -1, -1, -1}, + }, + { + name: "h1 with space before", + chars: []rune{' ', placeholderRune, '#', ' ', 'a', 'b'}, + expected: []int{-1, -1, 1, -1, -1, -1}, + }, + { + name: "h2", + chars: []rune{placeholderRune, '#', placeholderRune, '#', ' ', 'a', 'b'}, + expected: []int{-1, 3, -1, -1, -1, -1, -1}, + }, + { + name: "h4", + chars: []rune{placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', ' ', 'a'}, + expected: []int{-1, 7, -1, -1, -1, -1, -1, -1, -1, -1}, + }, + { + name: "h6", + chars: []rune{placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', ' ', 'a'}, + expected: []int{-1, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + }, + { + name: "no h7", + chars: []rune{placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', placeholderRune, '#', 
placeholderRune, '#', ' ', 'a'}, + expected: []int{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + }, + { + name: "tag", + chars: []rune{placeholderRune, '#', 'a'}, + expected: []int{-1, -1, -1}, + }, + { + name: "also take empty heading", + chars: []rune{placeholderRune, '#', ' ', '\n', 'a'}, + expected: []int{-1, 1, -1, -1, -1}, + }, + { + name: "nothing afterwards", + chars: []rune{placeholderRune, '#'}, + expected: []int{-1, 1}, + }, + { + name: "tab before content", + chars: []rune{placeholderRune, '#', '\t', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + + var actual []int + for index, _ := range run.chars { + b := []byte(string(run.chars)) + output := IsAtxHeader(b, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} + +func TestIsSetextHeader(t *testing.T) { + + runs := []struct { + name string + chars []rune + + expected []int + }{ + { + name: "not needed", + chars: []rune{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "inside text", + chars: []rune{'a', placeholderRune, '=', 'b'}, + expected: []int{-1, -1, -1, -1}, + }, + { + name: "blank line before", + chars: []rune{'a', '\n', '\n', placeholderRune, '='}, + expected: []int{-1, -1, -1, -1, -1}, + }, + { + name: "with heading", + chars: []rune{'a', '\n', placeholderRune, '='}, + expected: []int{-1, -1, -1, 1}, + }, + { + name: "at start of file", + chars: []rune{placeholderRune, '='}, + expected: []int{-1, -1}, + }, + } + for _, run := range runs { + t.Run(run.name, func(t *testing.T) { + + var actual []int + for index, _ := range run.chars { + b := []byte(string(run.chars)) + output := IsSetextHeader(b, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + } +} 
diff --git a/internal/escape/elem_image.go b/internal/escape/elem_image.go new file mode 100644 index 0000000..e1d9b5d --- /dev/null +++ b/internal/escape/elem_image.go @@ -0,0 +1,36 @@ +package escape + +func IsImageOrLink(chars []byte, index int) int { + if chars[index] == '!' { + return isImageOrLinkStartExclamation(chars, index) + } + if chars[index] == '[' { + return isImageOrLinkStartBracket(chars, index) + } + + return -1 +} + +func isImageOrLinkStartExclamation(chars []byte, index int) int { + nextIndex := index + 1 + if nextIndex < len(chars) && chars[nextIndex] == '[' { + // It could be the start of an image + return 1 + } + + return -1 +} + +func isImageOrLinkStartBracket(chars []byte, index int) int { + for i := index + 1; i < len(chars); i++ { + if chars[i] == '\n' { + return -1 + } + + if chars[i] == ']' { + return 1 + } + } + + return -1 +} diff --git a/internal/escape/elem_image_test.go b/internal/escape/elem_image_test.go new file mode 100644 index 0000000..d3ffd38 --- /dev/null +++ b/internal/escape/elem_image_test.go @@ -0,0 +1,72 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsImageOrLink(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "image A", + chars: []byte{placeholderByte, '!', placeholderByte, '[', 'a', ']'}, + expected: []int{-1, -1, -1, 1, -1, -1}, + }, + { + name: "image B", + chars: []byte{'!', placeholderByte, '[', 'a', ']'}, + expected: []int{-1, -1, 1, -1, -1}, + }, + { + name: "image C", + chars: []byte{placeholderByte, '!', '[', 'a', ']'}, + expected: []int{-1, 1, 1, -1, -1}, + }, + { + name: "multiple starting brackets", + chars: []byte{'[', '[', '[', 'a', ']'}, + expected: []int{1, 1, 1, -1, -1}, + }, + { + name: "newline in content", + chars: []byte{'[', 'a', '\n', ']'}, + expected: []int{-1, -1, -1, -1}, + }, + { + name: "at end of file", + chars: 
[]byte{'[', 'a'}, + expected: []int{-1, -1}, + }, + { + name: "at end of file", + chars: []byte{'!'}, + expected: []int{-1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsImageOrLink(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} diff --git a/internal/escape/elem_italic_bold.go b/internal/escape/elem_italic_bold.go new file mode 100644 index 0000000..84381af --- /dev/null +++ b/internal/escape/elem_italic_bold.go @@ -0,0 +1,21 @@ +package escape + +import ( + "unicode" +) + +func IsItalicOrBold(chars []byte, index int) int { + if chars[index] != '*' && chars[index] != '_' { + return -1 + } + + next := getNextAsRune(chars, index) + + nextIsWhitespace := unicode.IsSpace(next) || next == 0 + if nextIsWhitespace { + // "not followed by Unicode whitespace" + return -1 + } + + return 1 +} diff --git a/internal/escape/elem_italic_bold_test.go b/internal/escape/elem_italic_bold_test.go new file mode 100644 index 0000000..4a75405 --- /dev/null +++ b/internal/escape/elem_italic_bold_test.go @@ -0,0 +1,204 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsItalicOrBold(t *testing.T) { + runs := []struct { + desc string + chars []rune + index int + + expected int + }{ + { + desc: "not needed", + chars: []rune("normal text"), + index: 0, + + expected: -1, + }, + + { + desc: "nothing before", + chars: []rune("*a"), + index: 0, + + expected: 1, + }, + { + desc: "newline before", + chars: []rune("\n *a"), + index: 2, + + expected: 1, + }, + { + desc: "text and space before", + chars: []rune("text *a"), + index: 5, + + expected: 1, + }, + + { + desc: "character directly before", + chars: []rune("a*a"), + index: 1, + + expected: 1, + }, + { + desc: "point before", + chars: []rune(".*a"), + index: 1, + + expected: 1, + }, + + // 
- - - - // + { + desc: "char after", + chars: []rune(" *a"), + index: 1, + expected: 1, + }, + { + desc: "point after", + chars: []rune(" *."), + index: 1, + expected: 1, + }, + { + desc: "nothing after", + chars: []rune(" *"), + index: 1, + expected: -1, + }, + { + desc: "space after", + chars: []rune(" * "), + index: 1, + expected: -1, + }, + { + desc: "newline after", + chars: []rune(" *\n"), + index: 1, + expected: -1, + }, + + // - - - - // + // { + // desc: "char before & point after", + // chars: []rune("a*."), + // index: 1, + // expected: -1, + // }, + { + desc: "space before & point after", + chars: []rune(" *."), + index: 1, + expected: 1, + }, + { + desc: "point before & point after", + chars: []rune(".*."), + index: 1, + expected: 1, + }, + + // - - - - // + { + desc: "exclamation mark as content", + chars: []rune("*!*"), + index: 0, + expected: 1, + }, + { + desc: "special char before", + chars: []rune("$*content*"), + index: 1, + expected: 1, + }, + { + desc: "$ before", + chars: []rune("0$*!*"), + index: 2, + expected: 1, + }, + // { + // desc: "\x00 after", + // chars: []rune("*\x00*"), + // index: 0, + // expected: 1, + // }, + // { + // desc: "some random input #1", + // chars: []rune("\xac\xac\xac*!0*"), + // index: 3, + // expected: 1, + // }, + // { + // desc: "random input #1", + // chars: []rune{'\xac', '*', 'a', '*'}, //"*!0*"), + // index: 2, + // expected: 1, + // }, + } + + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + b := []byte(string(run.chars)) + + t.Log("input:", string(b)) + match := IsItalicOrBold(b, run.index) + if match != run.expected { + t.Errorf("expected %d but got %d", run.expected, match) + } + }) + } + +} + +func TestIsItalicOrBold_All(t *testing.T) { + runs := []struct { + name string + chars []rune + + expected []int + }{ + { + name: "random input #1", + chars: []rune{'\xac', '*', '!', '0', '*'}, + expected: []int{-1, -1, 1, -1, -1, -1}, + }, + { + name: "random input #2", + chars: []rune{'\xac', 
'\xac', '\xac', '*', '!', '0', '*'}, + expected: []int{-1, -1, -1, -1, -1, -1, 1, -1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + + bytes := []byte(string(run.chars)) + + var actual []int + for index := range bytes { + output := IsItalicOrBold(bytes, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } + +} diff --git a/internal/escape/elem_list.go b/internal/escape/elem_list.go new file mode 100644 index 0000000..3b9d4cb --- /dev/null +++ b/internal/escape/elem_list.go @@ -0,0 +1,67 @@ +package escape + +import ( + "unicode" +) + +func IsUnorderedList(chars []byte, index int) int { + if chars[index] != '-' && chars[index] != '*' && chars[index] != '+' { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == '\n' { + break + } + if chars[i] == ' ' { + continue + } + if chars[i] == placeholderByte { + continue + } + return -1 + } + + next := getNext(chars, index) + if IsSpace(next) || next == 0 { + return 1 + } + + return -1 +} + +func IsOrderedList(chars []byte, index int) int { + if chars[index] != '.' 
&& chars[index] != ')' { + return -1 + } + + // Directly before the dot needs to be a digit + prev := getPrevAsRune(chars, index) + if !unicode.IsDigit(prev) { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == '\n' { + break + } + if chars[i] == ' ' { + continue + } + if chars[i] == placeholderByte { + continue + } + if IsDigit(chars[i]) { + continue + } + + return -1 + } + + next := getNext(chars, index) + if IsSpace(next) || next == 0 { + return 1 + } + + return -1 +} diff --git a/internal/escape/elem_list_test.go b/internal/escape/elem_list_test.go new file mode 100644 index 0000000..f5da904 --- /dev/null +++ b/internal/escape/elem_list_test.go @@ -0,0 +1,123 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsUnorderedList(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, -1}, + }, + { + name: "dash inside text", + chars: []byte{'a', '-', ' ', 'b'}, + expected: []int{-1, -1, -1, -1}, + }, + { + name: "dash and directly text", + chars: []byte{'-', 'a', 'b'}, + expected: []int{-1, -1, -1}, + }, + { + name: "two lists", + chars: []byte{placeholderByte, '-', ' ', 'a', '\n', placeholderByte, '-', ' ', 'b'}, + expected: []int{-1, 1, -1, -1, -1, -1, 1, -1, -1}, + }, + { + name: "space before list", + chars: []byte{' ', '-', ' ', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsUnorderedList(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} + +func TestIsOrderedList(t *testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "not needed", + chars: []byte{'a', 'b', 'c'}, + expected: []int{-1, -1, 
-1}, + }, + { + name: "simple list", + chars: []byte{'1', '.', ' ', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + { + name: "bigger list", + chars: []byte{'1', '2', '3', '.', ' ', 'a'}, + expected: []int{-1, -1, -1, 1, -1, -1}, + }, + { + name: "inside text", + chars: []byte{'a', '1', '.', ' ', 'a'}, + expected: []int{-1, -1, -1, -1, -1}, + }, + { + name: "space after dot missing", + chars: []byte{'1', '.', 'a'}, + expected: []int{-1, -1, -1}, + }, + { + name: "number before dot missing", + chars: []byte{'a', '.', 'b'}, + expected: []int{-1, -1, -1}, + }, + { + name: "allow space before dot", + chars: []byte{' ', '1', '.', ' ', 'a'}, + expected: []int{-1, -1, 1, -1, -1}, + }, + { + name: "two lists", + chars: []byte{placeholderByte, '1', '.', ' ', 'a', '\n', placeholderByte, '1', '.', ' ', 'b'}, + expected: []int{-1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsOrderedList(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} diff --git a/internal/escape/elem_quote.go b/internal/escape/elem_quote.go new file mode 100644 index 0000000..2df00cb --- /dev/null +++ b/internal/escape/elem_quote.go @@ -0,0 +1,23 @@ +package escape + +func IsBlockQuote(chars []byte, index int) int { + if chars[index] != '>' { + return -1 + } + + for i := index - 1; i >= 0; i-- { + if chars[i] == '\n' { + break + } + if chars[i] == placeholderByte { + continue + } + if chars[i] == ' ' { + continue + } + + return -1 + } + + return 1 +} diff --git a/internal/escape/elem_quote_test.go b/internal/escape/elem_quote_test.go new file mode 100644 index 0000000..63ece7b --- /dev/null +++ b/internal/escape/elem_quote_test.go @@ -0,0 +1,67 @@ +package escape + +import ( + "reflect" + "testing" +) + +func TestIsBlockquote(t 
*testing.T) { + runs := []struct { + name string + chars []byte + + expected []int + }{ + { + name: "allow simple quote", + chars: []byte{'>', ' ', 'a'}, + expected: []int{1, -1, -1}, + }, + { + name: "allow space before", + chars: []byte{' ', '>', ' ', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + { + name: "allow missing space after", + chars: []byte{'>', 'a'}, + expected: []int{1, -1}, + }, + { + name: "allow newline before", + chars: []byte{'\n', '>', ' ', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + { + name: "allow newline and space before", + chars: []byte{'\n', ' ', '>', ' ', 'a'}, + expected: []int{-1, -1, 1, -1, -1}, + }, + { + name: "allow placeholder before", + chars: []byte{placeholderByte, '>', ' ', 'a'}, + expected: []int{-1, 1, -1, -1}, + }, + { + name: "dont allow other chars before", + chars: []byte{'a', '>', ' ', 'a'}, + expected: []int{-1, -1, -1, -1}, + }, + } + for _, run := range runs { + + t.Run(run.name, func(t *testing.T) { + var actual []int + for index := range run.chars { + output := IsBlockQuote(run.chars, index) + + actual = append(actual, output) + } + + if !reflect.DeepEqual(actual, run.expected) { + t.Errorf("expected %+v but got %+v", run.expected, actual) + } + }) + + } +} diff --git a/internal/escape/replacer.go b/internal/escape/replacer.go new file mode 100644 index 0000000..c7ef6bb --- /dev/null +++ b/internal/escape/replacer.go @@ -0,0 +1,8 @@ +package escape + +import "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + +var placeholderRune rune = marker.MarkerEscaping + +// IMPORTANT: Only internally we assume it is only byte +var placeholderByte byte = marker.BytesMarkerEscaping[0] diff --git a/internal/escape/util.go b/internal/escape/util.go new file mode 100644 index 0000000..013c6da --- /dev/null +++ b/internal/escape/util.go @@ -0,0 +1,70 @@ +package escape + +import "unicode/utf8" + +// TODO: move to markers package? 
+ +func IsSpace(b byte) bool { + switch b { + case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: + return true + } + return false +} + +func IsDigit(b byte) bool { + switch b { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return true + } + return false +} + +func getPrev(chars []byte, index int) byte { + for i := index - 1; i >= 0; i-- { + if chars[i] == placeholderByte { + continue + } + return chars[i] + } + return 0 +} + +func getNext(chars []byte, index int) byte { + for i := index + 1; i < len(chars); i++ { + if chars[i] == placeholderByte { + continue + } + return chars[i] + } + return 0 +} + +func getPrevAsRune(chars []byte, index int) rune { + for i := index - 1; i >= 0; i-- { + if chars[i] == placeholderByte { + continue + } + + r, _ := utf8.DecodeLastRune(chars[:i+1]) + + return r + } + return 0 +} +func getNextAsRune(source []byte, index int) rune { + for i := index + 1; i < len(source); i++ { + if source[i] == placeholderByte { + continue + } + + r, _ := utf8.DecodeRune(source[i:]) + return r + } + return 0 +} + +// TODO: make public? 
+func GetNextAsRune(source []byte, index int) rune { + return getNextAsRune(source, index) +} diff --git a/internal/escape/util_test.go b/internal/escape/util_test.go new file mode 100644 index 0000000..69a71cf --- /dev/null +++ b/internal/escape/util_test.go @@ -0,0 +1,208 @@ +package escape + +import ( + "bytes" + "testing" + "unicode" +) + +const ChineseRune = '字' + +func TestIsSpace(t *testing.T) { + + runs := []struct { + name string + input byte + expected bool + }{ + { + name: "empty", + input: 0, + expected: false, + }, + { + name: "normal char a", + input: 'a', + expected: false, + }, + { + name: "special character ä", + input: 'ä', + expected: false, + }, + { + name: "chinese character #1", + input: []byte(string(ChineseRune))[0], + expected: false, + }, + { + name: "chinese character #2", + input: []byte(string(ChineseRune))[1], + expected: false, + }, + { + name: "chinese character #3", + input: []byte(string(ChineseRune))[2], + expected: false, + }, + { + name: "space", + input: ' ', + expected: true, + }, + { + name: "tab", + input: ' ', + expected: true, + }, + } + for _, run := range runs { + t.Run(run.name, func(t *testing.T) { + t.Run("unicode.IsSpace", func(t *testing.T) { + output := unicode.IsSpace(rune(run.input)) + if output != run.expected { + t.Errorf("for %s expected %v but got %v", string(run.input), run.expected, output) + } + }) + t.Run("escape.IsSpace", func(t *testing.T) { + output := IsSpace(run.input) + if output != run.expected { + t.Errorf("for %s expected %v but got %v", string(run.input), run.expected, output) + } + }) + }) + } +} + +func TestRune(t *testing.T) { + + chars := []rune{ + ' ', + '\n', + '\t', + rune(6), // Acknowledge character + rune(7), // Bell character + + '!', + '"', + '#', + '$', + '%', + '&', + '\'', + '(', + ')', + '*', + '+', + ',', + '-', + '.', + '/', + ':', + ';', + '<', + '=', + '>', + '?', + '@', + '[', + '\\', + ']', + '^', + '_', + '`', + '{', + '|', + '}', + '~', + } + for _, char := range chars { 
+ t.Run(string(char), func(t *testing.T) { + length := len([]byte(string(char))) + + if length != 1 { + t.Errorf("got a length of %d", length) + } + + }) + } +} + +func TestGetPrev(t *testing.T) { + input := []byte{'a', placeholderByte, 'b', 'c'} + + if getPrev(input, 3) != 'b' { + t.Error("expected different output") + } + if getPrev(input, 2) != 'a' { + t.Error("expected different output") + } + if getPrev(input, 1) != 'a' { + t.Error("expected different output") + } + if getPrev(input, 0) != 0 { + t.Error("expected different output") + } +} + +func TestGetNext(t *testing.T) { + input := []byte{'a', placeholderByte, 'b', 'c'} + + if getNext(input, 0) != 'b' { + t.Error("expected different output") + } + if getNext(input, 1) != 'b' { + t.Error("expected different output") + } + if getNext(input, 2) != 'c' { + t.Error("expected different output") + } + if getNext(input, 3) != 0 { + t.Error("expected different output") + } +} + +func TestGetNextAsRune(t *testing.T) { + inputString := "a\a⌘\ab" + inputBytes := []byte{ + 97, // a + 7, // bell (our escape char) + 226, 140, 152, // mac sign + 7, // bell (our escape char) + 98, // b + } + + if !bytes.Equal([]byte(inputString), inputBytes) { + t.Error("the string and byte slice dont match") + } + + nextByte := getNext(inputBytes, 0) + if nextByte != 226 { + t.Error("expected different next byte") + } + + nextRune := getNextAsRune(inputBytes, 0) + if nextRune != '⌘' { + t.Error("expected different next rune") + } + + lastNextRune := getNextAsRune(inputBytes, 6) + if lastNextRune != 0 { + t.Error("expected different last next rune") + } + // - - - - // + + prevByte := getPrev(inputBytes, 6) + if prevByte != 152 { + t.Error("expected different prev byte") + } + + prevRune := getPrevAsRune(inputBytes, 6) + if prevRune != '⌘' { + t.Error("expected different prev rune") + } + firstPrevRune := getPrevAsRune(inputBytes, 0) + if firstPrevRune != 0 { + t.Error("expected different zero prev rune") + } +} diff --git 
a/internal/tester/dom.go b/internal/tester/dom.go new file mode 100644 index 0000000..4eaf1b5 --- /dev/null +++ b/internal/tester/dom.go @@ -0,0 +1,51 @@ +package tester + +import ( + "bytes" + "strings" + "testing" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func Parse(t *testing.T, rawHTML string, startFrom string) *html.Node { + if startFrom == "" { + startFrom = "body" + } + + rawHTML = strings.TrimSpace(rawHTML) + + doc, err := html.Parse(strings.NewReader(rawHTML)) + if err != nil { + t.Fatal(err) + } + + var b bytes.Buffer + err = html.Render(&b, doc) + if err != nil { + t.Error(err) + } + + var foundNode *html.Node + var finder func(node *html.Node) + finder = func(node *html.Node) { + if foundNode != nil { + return + } + if dom.NodeName(node) == startFrom { + foundNode = node + return + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(doc) + + if foundNode == nil { + t.Error("could not find node for 'startFrom'") + } + return foundNode +} diff --git a/internal/tester/dom_representation.go b/internal/tester/dom_representation.go new file mode 100644 index 0000000..886ab71 --- /dev/null +++ b/internal/tester/dom_representation.go @@ -0,0 +1,20 @@ +package tester + +import ( + "strings" + "testing" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func ExpectRepresentation(t *testing.T, doc *html.Node, name string, expectedHtml string) { + actualHtml := dom.RenderRepresentation(doc) + + actualHtml = strings.TrimSpace(actualHtml) + expectedHtml = strings.TrimSpace(expectedHtml) + + if actualHtml != expectedHtml { + t.Errorf("%s: expected \n%s\nbut got\n%s", name, expectedHtml, actualHtml) + } +} diff --git a/internal/tester/goldenfiles.go b/internal/tester/goldenfiles.go new file mode 100644 index 0000000..419e5ff --- /dev/null +++ b/internal/tester/goldenfiles.go @@ -0,0 +1,98 @@ +package tester + +import ( + "flag" + "fmt" + "os" + "path/filepath" + 
"strings" + "testing" + + "github.com/sebdah/goldie/v2" +) + +var enableRoundTrip = flag.Bool("round", false, "enable the round trip testing") + +const suffixInputFile = ".in.html" +const suffixOutputFile = ".out.md" + +func getInputFiles(pathOfFolder string) ([]string, error) { + files, err := os.ReadDir(pathOfFolder) + if err != nil { + return nil, fmt.Errorf("error while reading %q folder: %w", pathOfFolder, err) + } + + var names []string + for _, file := range files { + if file.IsDir() { + return nil, fmt.Errorf("did not expected a folder %q", file.Name()) + } + if strings.HasSuffix(file.Name(), suffixOutputFile) { + continue + } + if !strings.HasSuffix(file.Name(), suffixInputFile) { + return nil, fmt.Errorf("only expect in or out files but got %q", file.Name()) + } + + name := strings.TrimSuffix(file.Name(), suffixInputFile) + names = append(names, name) + } + + return names, nil +} + +func GoldenFiles(t *testing.T, convert ConvertFunc, roundTripConvert ConvertFunc) { + pathOfFolder := filepath.Join("./testdata", strings.TrimPrefix(t.Name(), "Test")) + runs, err := getInputFiles(pathOfFolder) + if err != nil { + t.Fatal(err) + } + if len(runs) == 0 { + t.Fatalf("there were no golden files found in %q", pathOfFolder) + } + + for _, run := range runs { + t.Run(run, func(t *testing.T) { + pathOfFile := filepath.Join(pathOfFolder, run+suffixInputFile) + input, err := os.ReadFile(pathOfFile) + if err != nil { + t.Fatal(err) + } + t.Logf("running golden file test for %q", pathOfFile) + + // - - - - - - - Golden File Test - - - - - - - // + output, err := convert(input) + if err != nil { + t.Fatal(err) + } + + g := goldie.New(t, + goldie.WithFixtureDir(pathOfFolder), + goldie.WithNameSuffix(suffixOutputFile), + // Simple, ColoredDiff, ClassicDiff + // goldie.WithDiffEngine(goldie.Simple), + goldie.WithDiffFn(func(actual, expected string) string { + return fmt.Sprintf("Expected: %q\nGot: %q", expected, actual) + }), + ) + g.Assert(t, run, []byte(output)) + + // - - 
- - - - - Round Trip Test - - - - - - - // + if *enableRoundTrip { + _, err := RoundTrip(run, input, roundTripConvert) + if err != nil { + t.Error(err) + + // - - - // + + // TODO: clear folder to override earlier runs + // TODO: enable writing using command line + // err2 := res.WriteToFiles("./.tmp/roundtrip") + // if err2 != nil { + // t.Error(err2) + // } + } + } + }) + } +} diff --git a/internal/tester/round_trip.go b/internal/tester/round_trip.go new file mode 100644 index 0000000..db4b99e --- /dev/null +++ b/internal/tester/round_trip.go @@ -0,0 +1,121 @@ +package tester + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "time" + + // "github.com/darmiel/gohtml" + "github.com/yuin/goldmark" + goldmarkHtml "github.com/yuin/goldmark/renderer/html" +) + +type ConvertFunc func(html []byte) (markdown []byte, err error) + +var goldmarkConverter = goldmark.New( + goldmark.WithRendererOptions( + // Also render "javascript:" links + goldmarkHtml.WithUnsafe(), + ), +) + +type Result struct { + Identifier string + + FirstDuration time.Duration + SecondDuration time.Duration + + OriginalHtml []byte + FirstMarkdown []byte + IntermediateHtml []byte + SecondMarkdown []byte +} + +func (r Result) GetStatus() string { + if bytes.Equal(r.FirstMarkdown, r.SecondMarkdown) { + return fmt.Sprintf("%s: ✅ in %s & %s", r.Identifier, r.FirstDuration, r.SecondDuration) + } + + return fmt.Sprintf("%s: ❌ in %s & %s", r.Identifier, r.FirstDuration, r.SecondDuration) +} +func (r Result) PrintStatus() { + var Reset = "\033[0m" + var Yellow = "\033[33m" + fmt.Printf("%s[Round Trip Test] %s \n%s", Yellow, r.GetStatus(), Reset) + +} +func (r Result) WriteToFiles(folderpath string) error { + err := os.MkdirAll(filepath.Join(folderpath, r.Identifier), os.ModePerm) + if err != nil { + return fmt.Errorf("error while creating folder %q: %w", filepath.Join(folderpath, r.Identifier), err) + } + + err = os.WriteFile(filepath.Join(folderpath, r.Identifier, "01.html"), r.OriginalHtml, 0644) + if 
err != nil { + return err + } + err = os.WriteFile(filepath.Join(folderpath, r.Identifier, "02.md"), r.FirstMarkdown, 0644) + if err != nil { + return err + } + err = os.WriteFile(filepath.Join(folderpath, r.Identifier, "03.html"), r.IntermediateHtml, 0644) + if err != nil { + return err + } + err = os.WriteFile(filepath.Join(folderpath, r.Identifier, "04.md"), r.SecondMarkdown, 0644) + if err != nil { + return err + } + + // originalHtmlPretty := gohtml.Format(string(r.OriginalHtml), true) + // err = ioutil.WriteFile(filepath.Join(folderpath, r.Identifier, "01_pretty.html"), []byte(originalHtmlPretty), 0644) + // if err != nil { + // return err + // } + + // intermediateHtmlPretty := gohtml.Format(string(r.IntermediateHtml), true) + // err = ioutil.WriteFile(filepath.Join(folderpath, r.Identifier, "03_pretty.html"), []byte(intermediateHtmlPretty), 0644) + // if err != nil { + // return err + // } + + return nil +} + +func RoundTrip(identifier string, originalHtml []byte, convert ConvertFunc) (*Result, error) { + var err error + res := &Result{ + Identifier: identifier, + OriginalHtml: originalHtml, + } + + firstStart := time.Now() + res.FirstMarkdown, err = convert(originalHtml) + res.FirstDuration = time.Since(firstStart) + if err != nil { + return res, fmt.Errorf("error in the first convert round: %w", err) + } + + var buf bytes.Buffer + err = goldmarkConverter.Convert(res.FirstMarkdown, &buf) + if err != nil { + return res, fmt.Errorf("error with goldmark: %w", err) + } + res.IntermediateHtml = buf.Bytes() + + secondStart := time.Now() + res.SecondMarkdown, err = convert(res.IntermediateHtml) + res.SecondDuration = time.Since(secondStart) + if err != nil { + return res, fmt.Errorf("error in the second convert round: %w", err) + } + + if bytes.Equal(res.FirstMarkdown, res.SecondMarkdown) { + // Hurray, the converter produced exactly the same result. Well done!!! 
+ return res, nil + } + + return res, fmt.Errorf("difference between the first and second markdown round") +} diff --git a/internal/textutils/codefence.go b/internal/textutils/codefence.go new file mode 100644 index 0000000..ef0f50d --- /dev/null +++ b/internal/textutils/codefence.go @@ -0,0 +1,61 @@ +package textutils + +import "strings" + +func CalculateCodeFenceOccurrences(fenceChar rune, content string) int { + var occurrences []int + + var charsTogether int + for _, char := range content { + // We encountered a fence character, now count how many + // are directly afterwards + if char == fenceChar { + charsTogether++ + } else if charsTogether != 0 { + occurrences = append(occurrences, charsTogether) + charsTogether = 0 + } + } + + // If the last element in the content was a fenceChar + if charsTogether != 0 { + occurrences = append(occurrences, charsTogether) + } + + return findMax(occurrences) +} + +// CalculateCodeFence can be passed the content of a code block and it returns +// how many fence characters (` or ~) should be used. 
+// +// This is useful if the html content includes the same fence characters +// for example ``` +// -> https://stackoverflow.com/a/49268657 +func CalculateCodeFence(fenceChar rune, content string) string { + repeat := CalculateCodeFenceOccurrences(fenceChar, content) + + // The outer fence block always has to have + // at least one character more than any content inside + repeat++ + + // You have to have at least three fence characters + // to be recognized as a code block + if repeat < 3 { + repeat = 3 + } + + return strings.Repeat(string(fenceChar), repeat) +} + +func findMax(a []int) (max int) { + for i, value := range a { + if i == 0 { + max = a[i] + } + + if value > max { + max = value + } + } + return max +} diff --git a/internal/textutils/codefence_test.go b/internal/textutils/codefence_test.go new file mode 100644 index 0000000..a56eeca --- /dev/null +++ b/internal/textutils/codefence_test.go @@ -0,0 +1,79 @@ +package textutils + +import ( + "strings" + "testing" +) + +func TestCalculateCodeFence(t *testing.T) { + var tests = []struct { + Name string + FenceChar rune + + Text string + Expect string + }{ + { + Name: "no occurrences with backtick", + FenceChar: '`', + Text: `normal ~~~ code block`, + Expect: "```", + }, + { + Name: "no occurrences with tilde", + FenceChar: '~', + Text: "normal ``` code block", + Expect: "~~~", + }, + { + Name: "one exact occurrence", + FenceChar: '`', + Text: "```", + Expect: "````", + }, + { + Name: "one occurrences with backtick", + FenceChar: '`', + Text: "normal ``` code block", + Expect: "````", + }, + { + Name: "one bigger occurrences with backtick", + FenceChar: '`', + Text: "normal ````` code block", + Expect: "``````", + }, + { + Name: "multiple occurrences with backtick", + FenceChar: '`', + Text: "normal ``` code `````` block", + Expect: "```````", + }, + { + Name: "multiple occurrences with tilde", + FenceChar: '~', + Text: "normal ~~~ code ~~~~~~~~~~~~ block", + Expect: "~~~~~~~~~~~~~", + }, + { + Name: 
"multiple occurrences on different lines with tilde", + FenceChar: '~', + Text: ` +normal + ~~~ +code ~~~~~~~~~~~~ block + `, + Expect: "~~~~~~~~~~~~~", + }, + } + + for _, test := range tests { + t.Run(test.Name, func(t *testing.T) { + output := CalculateCodeFence(test.FenceChar, test.Text) + + if output != test.Expect { + t.Errorf("expected '%s' (x%d) but got '%s' (x%d)", test.Expect, strings.Count(test.Expect, string(test.FenceChar)), output, strings.Count(output, string(test.FenceChar))) + } + }) + } +} diff --git a/internal/textutils/collapse_code.go b/internal/textutils/collapse_code.go new file mode 100644 index 0000000..47772e2 --- /dev/null +++ b/internal/textutils/collapse_code.go @@ -0,0 +1,31 @@ +package textutils + +import ( + "bytes" +) + +func CollapseInlineCodeContent(content []byte) []byte { + // TODO: what about other characters like the reset char? Maybe unicode.IsSpace? + content = bytes.ReplaceAll(content, []byte{'\n'}, []byte{' '}) + content = bytes.ReplaceAll(content, []byte{'\t'}, []byte{' '}) + + content = bytes.TrimSpace(content) + + newChars := make([]byte, 0, len(content)) + + var count int + for _, char := range content { + if char == ' ' { + count++ + } else { + count = 0 + } + + if count > 1 { + continue + } + newChars = append(newChars, char) + } + + return newChars +} diff --git a/internal/textutils/collapse_code_test.go b/internal/textutils/collapse_code_test.go new file mode 100644 index 0000000..9748924 --- /dev/null +++ b/internal/textutils/collapse_code_test.go @@ -0,0 +1,57 @@ +package textutils + +import "testing" + +func TestCollapseInlineCodeContent(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "empty", + input: "", + expected: "", + }, + { + desc: "not needed", + input: "a b", + expected: "a b", + }, + { + desc: "one newline", + input: "a\nb", + expected: "a b", + }, + { + desc: "multiple newlines", + input: "a\nb\n\nc", + expected: "a b c", + }, + { + desc: "also 
trim", + input: " a b ", + expected: "a b", + }, + { + desc: "realistic code content", + input: ` + + body { + color: yellow; + font-size: 16px; + } + + `, + expected: "body { color: yellow; font-size: 16px; }", + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + actual := CollapseInlineCodeContent([]byte(run.input)) + if string(actual) != run.expected { + t.Errorf("expected %q but got %q", run.expected, string(actual)) + } + }) + } +} diff --git a/internal/textutils/consecutive_newlines.go b/internal/textutils/consecutive_newlines.go new file mode 100644 index 0000000..e520253 --- /dev/null +++ b/internal/textutils/consecutive_newlines.go @@ -0,0 +1,90 @@ +package textutils + +import ( + "unicode/utf8" + + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" +) + +func TrimConsecutiveNewlines(source []byte) []byte { + // Some performance optimizations: + // - If no replacement was done, we return the original slice and dont allocate. + // - We batch appends + + var ret []byte + + startNormal := 0 + startMatch := -1 + + count := 0 + // for i, b := range source { + for i := 0; i < len(source); i++ { + r, size := utf8.DecodeRune(source[i:]) + _ = size + + isNewline := r == '\n' || r == marker.MarkerLineBreak + if isNewline { + count += 1 + } + + if startMatch == -1 && isNewline { + // Start of newlines + startMatch = i + i = i + size - 1 + continue + } else if startMatch != -1 && isNewline { + // Middle of newlines + i = i + size - 1 + continue + } else if startMatch != -1 { + // Character after the last newline character + + if count > 2 { + if ret == nil { + ret = make([]byte, 0, len(source)) + } + + ret = append(ret, source[startNormal:startMatch]...) 
+ ret = append(ret, '\n', '\n') + startNormal = i + } + + startMatch = -1 + count = 0 + } + } + + getStartEnd := func() (int, int, bool, bool) { + if startMatch == -1 && startNormal == 0 { + // a) no changes need to be done + return -1, -1, false, false + } + + if count <= 2 { + // b) Only the normal characters still need to be added + return startNormal, len(source), true, false + } + + // c) The match still needs to be replaced (and possible the previous normal characters be added) + return startNormal, startMatch, true, true + } + + start, end, isKeepNeeded, isReplaceNeeded := getStartEnd() + if isKeepNeeded { + if ret == nil { + ret = make([]byte, 0, len(source)) + } + + ret = append(ret, source[start:end]...) + if isReplaceNeeded { + ret = append(ret, '\n', '\n') + } + } + + if ret == nil { + // Huray, we did not do any allocations with make() + // and instead just return the original slice. + return source + } + return ret +} diff --git a/internal/textutils/consecutive_newlines_test.go b/internal/textutils/consecutive_newlines_test.go new file mode 100644 index 0000000..dda0993 --- /dev/null +++ b/internal/textutils/consecutive_newlines_test.go @@ -0,0 +1,170 @@ +package textutils + +import ( + "bytes" + "testing" +) + +func TestTrimConsecutiveNewlines(t *testing.T) { + runs := []struct { + desc string + input []byte + expected []byte + }{ + { + desc: "empty", + input: []byte(""), + expected: []byte(""), + }, + { + desc: "not needed", + input: []byte("normal text"), + expected: []byte("normal text"), + }, + { + desc: "also not needed", + input: []byte("normal\n\ntext"), + expected: []byte("normal\n\ntext"), + }, + + { + desc: "just two newlines", + input: []byte("\n\n"), + expected: []byte("\n\n"), + }, + { + desc: "just three newlines", + input: []byte("\n\n\n"), + expected: []byte("\n\n"), + }, + { + desc: "just four newlines", + input: []byte("\n\n\n\n"), + expected: []byte("\n\n"), + }, + + { + desc: "newlines before", + input: []byte("\n\n\ntext"), + 
expected: []byte("\n\ntext"), + }, + { + desc: "newlines after", + input: []byte("text\n\n\n"), + expected: []byte("text\n\n"), + }, + { + desc: "newlines before and after", + input: []byte("\n\n\ntext\n\n\n"), + expected: []byte("\n\ntext\n\n"), + }, + { + desc: "newlines between", + input: []byte("before\n\n\nafter"), + expected: []byte("before\n\nafter"), + }, + { + desc: "newlines between multiple times", + input: []byte("1\n\n\n2\n\n\n3"), + expected: []byte("1\n\n2\n\n3"), + }, + + { + desc: "not needed the first time", + input: []byte("abc\n\nabc\n\n\nabc"), + expected: []byte("abc\n\nabc\n\nabc"), + }, + { + desc: "not needed the second time", + input: []byte("abc\n\n\nabc\n\nabc"), + expected: []byte("abc\n\nabc\n\nabc"), + }, + + { + desc: "with special characters", + input: []byte("äöü\n\n\näöü"), + expected: []byte("äöü\n\näöü"), + }, + { + desc: "space at end", + input: []byte("a\n\n\nb "), + expected: []byte("a\n\nb "), + }, + { + desc: "one newline at end", + input: []byte("a\n\n\nb\n"), + expected: []byte("a\n\nb\n"), + }, + { + desc: "two newlines at end", + input: []byte("a\n\n\nb\n\n"), + expected: []byte("a\n\nb\n\n"), + }, + } + + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + output := TrimConsecutiveNewlines(run.input) + if !bytes.Equal(output, run.expected) { + t.Errorf("expected %q but got %q", string(run.expected), string(output)) + } + }) + } +} + +func TestTrimConsecutiveNewlines_Allocs(t *testing.T) { + const N = 1000 + + avg := testing.AllocsPerRun(N, func() { + input := []byte("abc") + output := TrimConsecutiveNewlines(input) + _ = output + }) + if avg != 0 { + t.Errorf("with no newlines there should be no allocations but got %f", avg) + } + + avg = testing.AllocsPerRun(N, func() { + input := []byte("abc\n\nabc") + output := TrimConsecutiveNewlines(input) + _ = output + }) + if avg != 0 { + t.Errorf("with only two newlines there should be no allocations but got %f", avg) + } + + avg = testing.AllocsPerRun(N, 
func() { + input := []byte("abc\n\n\nabc") + output := TrimConsecutiveNewlines(input) + _ = output + }) + if avg != 1 { + t.Errorf("with trhee newlines there should be 1 allocation but got %f", avg) + } +} + +const Repeat = 10 + +func BenchmarkTrimConsecutiveNewlines(b *testing.B) { + runs := []struct { + desc string + input []byte + }{ + { + desc: "not needed", + input: bytes.Repeat([]byte("normal\n\ntext"), Repeat), + }, + { + desc: "multiple times", + input: bytes.Repeat([]byte("1\n\n\n2\n\n\n3"), Repeat), + }, + } + + for _, run := range runs { + b.Run(run.desc, func(b *testing.B) { + for i := 0; i < b.N; i++ { + TrimConsecutiveNewlines(run.input) + } + }) + } +} diff --git a/internal/textutils/delimiter.go b/internal/textutils/delimiter.go new file mode 100644 index 0000000..8f24372 --- /dev/null +++ b/internal/textutils/delimiter.go @@ -0,0 +1,37 @@ +package textutils + +import ( + "bytes" +) + +// DelimiterForEveryLine puts the delimiter not just at the start and end of the string +// but if the text is divided on multiple lines, puts the delimiters on every line with content. +// +// Otherwise the bold/italic delimiters won't be recognized if it contains new line characters. 
+func DelimiterForEveryLine(text []byte, delimiter []byte) []byte { + var buf bytes.Buffer + + lines := bytes.Split(text, []byte("\n")) + for i, line := range lines { + leftExtra, trimmed, rightExtra := SurroundingSpaces(line) + + if trimmed == nil { + // For empty lines, we don't need a delimiter + buf.Write(leftExtra) + buf.Write(rightExtra) + } else { + buf.Write(leftExtra) + buf.Write(delimiter) + buf.Write(trimmed) + buf.Write(delimiter) + buf.Write(rightExtra) + } + + // To join the lines again, add a newlines character + if i < len(lines)-1 { + buf.WriteRune('\n') + } + } + + return buf.Bytes() +} diff --git a/internal/textutils/delimiter_test.go b/internal/textutils/delimiter_test.go new file mode 100644 index 0000000..6139c27 --- /dev/null +++ b/internal/textutils/delimiter_test.go @@ -0,0 +1,90 @@ +package textutils + +import "testing" + +func TestDelimiterForEveryLine(t *testing.T) { + tests := []struct { + name string + + text string + delimiter string + + want string + }{ + { + name: "put delimiter around text", + + text: "bold text", + delimiter: "**", + + want: "**bold text**", + }, + { + name: "keep whitespace outside (normal space)", + + text: " bold text ", + delimiter: "**", + + want: " **bold text** ", + }, + { + name: "keep whitespace outside (non-breaking space)", + + text: "\u00a0bold text\u00a0\u00a0", + delimiter: "**", + + want: "\u00a0**bold text**\u00a0\u00a0", + }, + { + name: "keep whitespace outside on every line (non-breaking space)", + + text: "bold\u00a0\ntext\u00a0", + delimiter: "**", + + want: "**bold**\u00a0\n**text**\u00a0", + }, + { + name: "put strong on every line", + + text: "line 1\nline 2", + delimiter: "**", + + want: "**line 1**\n**line 2**", + }, + { + name: "skip empty lines", + + text: "line 1\n\n\nline 2", + delimiter: "_", + + want: "_line 1_\n\n\n_line 2_", + }, + { + name: "with indentation", + + text: ` +line 1 + +line 2 +line 3 +`, + delimiter: "__", + + want: ` +__line 1__ + +__line 2__ +__line 3__ +`, + }, + 
} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if got := DelimiterForEveryLine([]byte(tt.text), []byte(tt.delimiter)); string(got) != tt.want { + t.Errorf("DelimiterForEveryLine() = \n'%v' but want \n'%v'", string(got), tt.want) + } + + }) + } +} diff --git a/internal/textutils/escape_multiline.go b/internal/textutils/escape_multiline.go new file mode 100644 index 0000000..6b48733 --- /dev/null +++ b/internal/textutils/escape_multiline.go @@ -0,0 +1,77 @@ +package textutils + +import ( + "bytes" + + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" +) + +var newline = []byte{'\n'} +var escape = []byte{'\\'} + +func EscapeMultiLine(content []byte) []byte { + content = bytes.TrimSpace(content) + content = TrimConsecutiveNewlines(content) + if len(content) == 0 { + return content + } + + parts := marker.SplitFunc(content, func(r rune) bool { + return r == '\n' || r == marker.MarkerLineBreak + }) + + for i := range parts { + parts[i] = bytes.TrimSpace(parts[i]) + if len(parts[i]) == 0 { + parts[i] = escape + } + } + content = bytes.Join(parts, newline) + + return content +} + +/* +// TODO: use this optimized function again after integrating the marker.MarkerLineBreak changes + +// EscapeMultiLine deals with multiline content inside a link or a heading. +func EscapeMultiLine(content []byte) []byte { + content = TrimConsecutiveNewlines(content) + + newContent := make([]byte, 0, len(content)) + + startNormal := 0 + lineHasContent := false + for index, char := range content { + isNewline := char == '\n' + isSpace := char == ' ' || char == ' ' + + isFirstNewline := isNewline && lineHasContent + isLastNewline := isNewline && !lineHasContent + + if isFirstNewline { + newContent = append(newContent, content[startNormal:index]...) 
+ newContent = append(newContent, '\n') + + startNormal = index + 1 + lineHasContent = false + + continue + } else if isLastNewline { + newContent = append(newContent, '\\') + newContent = append(newContent, '\n') + + startNormal = index + 1 + lineHasContent = false + } else if !isSpace { + lineHasContent = true + } else if isSpace && !lineHasContent { + startNormal = index + 1 + } + } + + newContent = append(newContent, content[startNormal:]...) + + return newContent +} +*/ diff --git a/internal/textutils/escape_multiline_test.go b/internal/textutils/escape_multiline_test.go new file mode 100644 index 0000000..3ace5e5 --- /dev/null +++ b/internal/textutils/escape_multiline_test.go @@ -0,0 +1,140 @@ +package textutils + +import ( + "bytes" + "strings" + "testing" +) + +func EscapeMultiLine_Old(content []byte) []byte { + content = bytes.TrimSpace(content) + content = TrimConsecutiveNewlines(content) + if len(content) == 0 { + return content + } + + parts := bytes.Split(content, newline) + for i := range parts { + parts[i] = bytes.TrimSpace(parts[i]) + if len(parts[i]) == 0 { + parts[i] = escape + } + } + content = bytes.Join(parts, newline) + + return content +} + +func TestEscapeMultiLine(t *testing.T) { + var tests = []struct { + Name string + Text string + Expected string + }{ + { + Name: "empty", + Text: "", + Expected: "", + }, + { + Name: "not needed", + Text: "some longer text that is on one line", + Expected: "some longer text that is on one line", + }, + + { + Name: "one newline", + Text: "A\nB", + Expected: "A\nB", + }, + { + Name: "two newlines", + Text: "A\n\nB", + Expected: "A\n\\\nB", + }, + { + + Name: "many newlines", + // Will be max two newlines characters + Text: "line 1\n\n\n\nline 2", + Expected: "line 1\n\\\nline 2", + }, + + { + Name: "multiple empty lines", + Text: `line1 +line2 + +line3 + + + + +line4`, + Expected: `line1 +line2 +\ +line3 +\ +line4`, + }, + + { + Name: "empty line with a space", + Text: "line 1\n \nline 2", + Expected: "line 
1\n\\\nline 2", + }, + + { + Name: "content has a space", + Text: "a\n\n b", + Expected: "a\n\\\nb", + }, + { + Name: "content is indented", + Text: "line 1\n line 2\n\tline 3", + Expected: "line 1\nline 2\nline 3", + }, + + // TODO: keep existing "\" characters? + } + + for _, test := range tests { + t.Run(test.Name, func(t *testing.T) { + t.Run("old", func(t *testing.T) { + output := EscapeMultiLine_Old([]byte(test.Text)) + + if string(output) != test.Expected { + t.Errorf("expected '%s' but got '%s'", test.Expected, string(output)) + } + }) + t.Run("new", func(t *testing.T) { + output := EscapeMultiLine([]byte(test.Text)) + + if string(output) != test.Expected { + t.Errorf("expected '%s' but got '%s'", test.Expected, string(output)) + } + }) + + }) + + } +} + +func BenchmarkEscapeMultiLine(b *testing.B) { + + b.Run("old", func(b *testing.B) { + input := []byte(strings.Repeat("line 1\n\n \nline 2", 100)) + + for i := 0; i < b.N; i++ { + _ = EscapeMultiLine_Old(input) + } + }) + b.Run("new", func(b *testing.B) { + input := []byte(strings.Repeat("line 1\n\n \nline 2", 100)) + + for i := 0; i < b.N; i++ { + _ = EscapeMultiLine(input) + } + }) +} diff --git a/internal/textutils/prefix_lines.go b/internal/textutils/prefix_lines.go new file mode 100644 index 0000000..0ef2762 --- /dev/null +++ b/internal/textutils/prefix_lines.go @@ -0,0 +1,16 @@ +package textutils + +func PrefixLines(source []byte, repl []byte) []byte { + newSlice := make([]byte, 0, len(source)) + + newSlice = append(newSlice, repl...) + for _, b := range source { + newSlice = append(newSlice, b) + if b == '\n' { + newSlice = append(newSlice, repl...) 
+ } + + } + + return newSlice +} diff --git a/internal/textutils/prefix_lines_test.go b/internal/textutils/prefix_lines_test.go new file mode 100644 index 0000000..4e55b83 --- /dev/null +++ b/internal/textutils/prefix_lines_test.go @@ -0,0 +1,61 @@ +package textutils + +import ( + "bytes" + "regexp" + "testing" +) + +var beginningR = regexp.MustCompile(`(?m)^`) + +func _oldPrefixLines(content string, repl string) string { + return beginningR.ReplaceAllString(content, repl) +} + +func TestPrefixLines(t *testing.T) { + runs := []struct { + desc string + input []byte + expected []byte + }{ + { + desc: "one line", + input: []byte("abc"), + expected: []byte("> abc"), + }, + { + desc: "two lines", + input: []byte("line 1\nline 2"), + expected: []byte("> line 1\n> line 2"), + }, + { + desc: "two newlines between", + input: []byte("line 1\n\nline 2"), + expected: []byte("> line 1\n> \n> line 2"), + }, + { + desc: "newline at end", + input: []byte("abc\n"), + expected: []byte("> abc\n> "), + }, + } + + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + t.Run("old", func(t *testing.T) { + output := _oldPrefixLines(string(run.input), "> ") + + if output != string(run.expected) { + t.Errorf("expected %q but got %q", string(run.expected), output) + } + }) + t.Run("new", func(t *testing.T) { + output := PrefixLines(run.input, []byte{'>', ' '}) + + if !bytes.Equal(output, run.expected) { + t.Errorf("expected %q but got %q", string(run.expected), string(output)) + } + }) + }) + } +} diff --git a/internal/textutils/quote.go b/internal/textutils/quote.go new file mode 100644 index 0000000..0e8c7eb --- /dev/null +++ b/internal/textutils/quote.go @@ -0,0 +1,39 @@ +package textutils + +import "bytes" + +const ( + DOUBLE_QUOTE = '"' + SINGLE_QUOTE = '\'' +) + +func SurroundBy(content []byte, chars []byte) []byte { + content = append(chars, content...) + content = append(content, chars...) 
+ return content +} +func SurroundByQuotes(content []byte) []byte { + if len(content) == 0 { + return nil + } + + containsDoubleQuote := bytes.ContainsRune(content, DOUBLE_QUOTE) + containsSingleQuote := bytes.ContainsRune(content, SINGLE_QUOTE) + + if containsDoubleQuote && containsSingleQuote { + // Escape all quotes + content = bytes.ReplaceAll(content, []byte(`"`), []byte(`\"`)) + + // Surround the content by double quotes + return SurroundBy(content, []byte(`"`)) + } + if containsDoubleQuote { + // Since it contains double quotes (but no single quotes) + // we can surround it by single quotes + return SurroundBy(content, []byte(`'`)) + } + + // It may contain single quotes, but definitely no double quotes, + // so we can safely surround it by double quotes. + return SurroundBy(content, []byte(`"`)) +} diff --git a/internal/textutils/quote_test.go b/internal/textutils/quote_test.go new file mode 100644 index 0000000..491b711 --- /dev/null +++ b/internal/textutils/quote_test.go @@ -0,0 +1,53 @@ +package textutils + +import ( + "reflect" + "testing" +) + +func TestSurroundByQuotes(t *testing.T) { + + tests := []struct { + name string + input []byte + want []byte + }{ + { + name: "empty content", + input: nil, + want: nil, + }, + { + name: "no content", + input: []byte(""), + want: nil, + }, + { + name: "contains no quotes", + input: []byte(`no quotes are here`), + want: []byte(`"no quotes are here"`), + }, + { + name: "contains double quotes", + input: []byte(`double "quotes" are here`), + want: []byte(`'double "quotes" are here'`), + }, + { + name: "contains single quotes", + input: []byte(`single 'quotes' are here`), + want: []byte(`"single 'quotes' are here"`), + }, + { + name: "contains both quotes", + input: []byte(`double " AND single ' quotes '" are here`), + want: []byte(`"double \" AND single ' quotes '\" are here"`), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := SurroundByQuotes(tt.input); 
!reflect.DeepEqual(got, tt.want) { + t.Errorf("SurroundByQuotes() = %v, want %v", string(got), string(tt.want)) + } + }) + } +} diff --git a/internal/textutils/surrounding_spaces.go b/internal/textutils/surrounding_spaces.go new file mode 100644 index 0000000..4f73623 --- /dev/null +++ b/internal/textutils/surrounding_spaces.go @@ -0,0 +1,60 @@ +/* + +The logic to handle whitespace around delimiters was initially developed +in the fork from "anyproto" by Roman Khafizianov and Mikhail. + +The changes were then merged upstream by Johannes Kaufmann. + +https://github.com/anyproto/html-to-markdown +https://github.com/JohannesKaufmann/html-to-markdown + +----------- + +MIT License + +Copyright (c) 2018 Johannes Kaufmann +Copyright (c) 2020 Roman Khafizianov +Copyright (c) 2023 Mikhail + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +package textutils + +import ( + "bytes" + "unicode" + + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" +) + +func SurroundingSpaces(content []byte) ([]byte, []byte, []byte) { + rightTrimmed := bytes.TrimRightFunc(content, func(r rune) bool { + return unicode.IsSpace(r) || r == marker.MarkerLineBreak + }) + rightExtra := content[len(rightTrimmed):] + + trimmed := bytes.TrimLeftFunc(rightTrimmed, func(r rune) bool { + return unicode.IsSpace(r) || r == marker.MarkerLineBreak + }) + leftExtra := content[0 : len(rightTrimmed)-len(trimmed)] + + return leftExtra, trimmed, rightExtra +} diff --git a/internal/textutils/surrounding_spaces_test.go b/internal/textutils/surrounding_spaces_test.go new file mode 100644 index 0000000..2fc81c8 --- /dev/null +++ b/internal/textutils/surrounding_spaces_test.go @@ -0,0 +1,73 @@ +package textutils + +import ( + "bytes" + "testing" +) + +func TestSurroundingSpaces(t *testing.T) { + testCases := []struct { + desc string + input []byte + + expectedLeft []byte + expectedTrimmed []byte + expectedRight []byte + }{ + { + desc: "empty string", + input: []byte(""), + + expectedLeft: []byte(""), + expectedTrimmed: []byte(""), + expectedRight: []byte(""), + }, + { + desc: "one space", + input: []byte(" "), + + expectedLeft: []byte(""), + expectedTrimmed: []byte(""), + expectedRight: []byte(" "), + }, + { + desc: "simple string", + input: []byte("some text"), + + expectedLeft: []byte(""), + expectedTrimmed: []byte("some text"), + expectedRight: []byte(""), + }, + { + desc: "spaces around", + input: []byte(" text "), + + expectedLeft: []byte(" "), + expectedTrimmed: []byte("text"), + expectedRight: []byte(" "), + }, + { + desc: "newlines around", + input: []byte("\n\n text \n\n"), + + expectedLeft: []byte("\n\n "), + expectedTrimmed: []byte("text"), + expectedRight: []byte(" \n\n"), + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + leftExtra, trimmed, rightExtra := SurroundingSpaces(tC.input) + + 
if !bytes.Equal(leftExtra, tC.expectedLeft) { + t.Errorf("expected %q but got %q for the left extra", string(tC.expectedLeft), string(leftExtra)) + } + if !bytes.Equal(trimmed, tC.expectedTrimmed) { + t.Errorf("expected %q but got %q for the trimmed text", string(tC.expectedTrimmed), string(trimmed)) + } + if !bytes.Equal(rightExtra, tC.expectedRight) { + t.Errorf("expected %q but got %q for the right extra", string(tC.expectedRight), string(rightExtra)) + } + }) + } +} diff --git a/marker/marker.go b/marker/marker.go new file mode 100644 index 0000000..f85b7ea --- /dev/null +++ b/marker/marker.go @@ -0,0 +1,81 @@ +package marker + +import ( + "bytes" + "unicode" + "unicode/utf8" +) + +const ( + // For simplificity we are using a rune that is one byte wide. A character + // that is not used widely (apart from cli's) is the bell character (7). + MarkerEscaping rune = '\a' + + // - - - - // + + // Marker1 rune = '\uF000' // 61440 + MarkerLineBreak rune = '\uF001' // 61441 + MarkerCodeBlockNewline rune = '\uF002' // 61442 +) + +var ( + BytesMarkerEscaping = []byte{7} + + // BytesMarker1 = []byte{239, 128, 128} + BytesMarkerLineBreak = []byte{239, 128, 129} + BytesTWICEMarkerLineBreak = []byte{239, 128, 129, 239, 128, 129} + BytesMarkerCodeBlockNewline = []byte{239, 128, 130} +) + +func init() { + checkRuneAndByteSlice(MarkerEscaping, BytesMarkerEscaping) + checkRuneAndByteSlice(MarkerLineBreak, BytesMarkerLineBreak) + checkRuneAndByteSlice(MarkerCodeBlockNewline, BytesMarkerCodeBlockNewline) +} + +func checkRuneAndByteSlice(r rune, b []byte) { + if !bytes.Equal([]byte(string(r)), b) { + panic("the rune and byte slice dont represent the same character") + } +} + +func GetMarker(p []byte, i int) (marker rune, size int) { + r, size := utf8.DecodeRune(p[i:]) + + switch r { + case MarkerLineBreak, MarkerCodeBlockNewline: + return r, size + + default: + return 0, 0 + } +} + +func IsSpace(r rune) bool { + return unicode.IsSpace(r) || r == MarkerLineBreak +} + +// func 
IsNewline(r rune) bool { +// return r == '\n' || r == '\r' || r == MarkerLineBreak +// } + +// TODO: should this be in another package? +func SplitFunc(str []byte, fn func(rune) bool) [][]byte { + var substrs [][]byte + for { + i := bytes.IndexFunc(str, fn) + if i == -1 { + if len(str) > 0 { + substrs = append(substrs, str) + } + break + } + + _, size := utf8.DecodeRune(str[i:]) + // substrs = append(substrs, str[:i], str[i:i+1]) + substrs = append(substrs, str[:i]) + str = str[i+size:] + } + + return substrs +} diff --git a/marker/marker_test.go b/marker/marker_test.go new file mode 100644 index 0000000..e9c7be2 --- /dev/null +++ b/marker/marker_test.go @@ -0,0 +1,32 @@ +package marker + +import ( + "bytes" + "reflect" + "testing" +) + +func TestSplitFunc_Basics(t *testing.T) { + in := []byte("one-two-three") + + out1 := bytes.Split(in, []byte("-")) + out2 := SplitFunc(in, func(r rune) bool { + return r == '-' + }) + + if !reflect.DeepEqual(out1, out2) { + t.Error("the split functions generated different outputs") + } +} +func TestSplitFunc_SpecialChars(t *testing.T) { + in := []byte("[öü]ä[öü]ä[öü]") + + out1 := bytes.Split(in, []byte("ä")) + out2 := SplitFunc(in, func(r rune) bool { + return r == 'ä' + }) + + if !reflect.DeepEqual(out1, out2) { + t.Error("the split functions generated different outputs") + } +} diff --git a/plugin/commonmark/commonmark.go b/plugin/commonmark/commonmark.go new file mode 100644 index 0000000..8b7c190 --- /dev/null +++ b/plugin/commonmark/commonmark.go @@ -0,0 +1,185 @@ +package commonmark + +import ( + "bytes" + "strings" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/escape" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +type commonmark struct { + config +} + +type OptionFunc = func(config *config) + +// _ or * +// +// default: * +func 
WithEmDelimiter(delimiter string) OptionFunc { + return func(config *config) { + config.EmDelimiter = delimiter + } +} + +// ** or __ +// +// default: ** +func WithStrongDelimiter(delimiter string) OptionFunc { + return func(config *config) { + config.StrongDelimiter = delimiter + } +} + +// Any Thematic break +// +// default: "* * *" +func WithHorizontalRule(rule string) OptionFunc { + return func(config *config) { + config.HorizontalRule = rule + } +} + +// "-", "+", or "*" +// +// default: "-" +func WithBulletListMarker(marker string) OptionFunc { + return func(config *config) { + config.BulletListMarker = marker + } +} +func WithListEndComment(enabled bool) OptionFunc { + return func(config *config) { + config.DisableListEndComment = !enabled + } +} + +// ``` or ~~~ +// +// default: ``` +func WithCodeBlockFence(fence string) OptionFunc { + return func(config *config) { + config.CodeBlockFence = fence + } +} + +// "setext" or "atx" +// +// default: "atx" +// HeadingStyle string +func WithHeadingStyle(style headingStyle) OptionFunc { + return func(config *config) { + config.HeadingStyle = style + } +} + +// TODO: allow changing the link style once the render logic is implemented +// +// "inlined" or "referenced_index" or "referenced_short" +// +// default: inlined +// func WithLinkStyle(style linkStyle) OptionFunc { +// return func(config *config) { +// config.LinkStyle = style +// } +// } + +// NewCommonmarkPlugin registers the markdown syntax of commonmark. 
+func NewCommonmarkPlugin(opts ...OptionFunc) converter.Plugin { + + cfg := &config{} + for _, opt := range opts { + opt(cfg) + } + + cm := commonmark{ + config: fillInDefaultConfig(cfg), + } + + return &cm +} + +func (cm *commonmark) Init(conv *converter.Converter) error { + if err := validateConfig(&cm.config); err != nil { + return err + } + + // - - - - - - - - // + + conv.Register.PreRenderer(cm.handlePreRender, converter.PriorityStandard) + + // Note: Should run after "collapse" & also after "remove" + conv.Register.PreRenderer(func(ctx converter.Context, doc *html.Node) { + if cm.DisableListEndComment { + // Early return if the feature is unwanted + return + } + + domutils.AddListEndComments(ctx, doc) + }, converter.PriorityLate+100) + + conv.Register.EscapedChar( + '\\', + '*', '_', '-', '+', + '.', '>', '|', + '$', + '#', '=', + '[', ']', '(', ')', + '!', + '~', '`', '"', '\'', + ) + conv.Register.UnEscaper(escape.IsItalicOrBold, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsBlockQuote, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsAtxHeader, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsSetextHeader, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsDivider, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsOrderedList, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsUnorderedList, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsImageOrLink, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsFencedCode, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsInlineCode, converter.PriorityStandard) + conv.Register.UnEscaper(escape.IsBackslash, converter.PriorityStandard) + + conv.Register.Renderer(cm.handleRender, converter.PriorityStandard) + + conv.Register.TextTransformer(cm.handleTextTransform, converter.PriorityLate) + + conv.Register.PostRenderer(cm.handlePostRenderCodeBlockNewline, converter.PriorityLate) + 
conv.Register.PostRenderer(cm.handlePostRenderLineBreak, converter.PriorityStandard+10) + + return nil +} + +func (cm commonmark) handlePostRenderCodeBlockNewline(ctx converter.Context, content []byte) []byte { + return bytes.ReplaceAll( + content, + []byte(string(marker.BytesMarkerCodeBlockNewline)), + []byte("\n"), + ) +} +func (cm commonmark) handlePostRenderLineBreak(ctx converter.Context, content []byte) []byte { + return bytes.ReplaceAll( + content, + // Two line break markers should be replaced with a "hard line break". + marker.BytesTWICEMarkerLineBreak, + []byte(" \n"), + ) +} + +func (cm commonmark) handleTextTransform(ctx converter.Context, content string) string { + + if isEnabled, ok := ctx.Value("is_inside_link").(bool); ok && isEnabled { + content = strings.Replace(content, string(marker.MarkerEscaping)+`]`, `\]`, -1) + } + // if isEnabled, ok := ctx.Value("is_inside_heading").(bool); ok && isEnabled { + // // The "#" character would be completely removed, if at the _end_ + // // of the heading content. So always escape it inside headings. 
+ // content = strings.Replace(content, string(marker.MarkerEscaping)+`#`, `\#`, -1) + // } + + return content +} diff --git a/plugin/commonmark/commonmark_test.go b/plugin/commonmark/commonmark_test.go new file mode 100644 index 0000000..14ba736 --- /dev/null +++ b/plugin/commonmark/commonmark_test.go @@ -0,0 +1,288 @@ +package commonmark_test + +import ( + "bytes" + "testing" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func TestGoldenFiles(t *testing.T) { + goldenFileConvert := func(htmlInput []byte) ([]byte, error) { + conv := converter.NewConverter( + converter.WithPlugins(commonmark.NewCommonmarkPlugin()), + ) + + // This makes the testcases easier to read + conv.Register.TagStrategy("#comment", converter.StrategyHTMLBlock) + + return conv.ConvertReader(bytes.NewReader(htmlInput)) + } + roundTripConvert := func(html []byte) (markdown []byte, err error) { + // For the golden files we are keeping #comment as a block + // but collapse treats it as an inline element (which it is). + // + // So this testcase would cause problems. + // "
before after
" + + md, err := htmltomarkdown.ConvertString(string(html)) + + return []byte(md), err + } + + tester.GoldenFiles(t, goldenFileConvert, roundTripConvert) +} + +func TestOptionFunc(t *testing.T) { + testCases := []struct { + desc string + input string + options []commonmark.OptionFunc + expected string + }{ + { + desc: "WithEmDelimiter", + options: []commonmark.OptionFunc{ + commonmark.WithEmDelimiter("_"), + }, + input: `italic`, + expected: `_italic_`, + }, + { + desc: "WithStrongDelimiter", + options: []commonmark.OptionFunc{ + commonmark.WithStrongDelimiter("__"), + }, + input: `bold`, + expected: `__bold__`, + }, + + { + desc: "WithHorizontalRule(***)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("***"), + }, + input: `
`, + expected: `***`, + }, + { + desc: "WithHorizontalRule(******)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("******"), + }, + input: `
`, + expected: `******`, + }, + { + desc: "WithHorizontalRule(---)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("---"), + }, + input: `
`, + expected: `---`, + }, + { + desc: "WithHorizontalRule(___)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("___"), + }, + input: `
`, + expected: `___`, + }, + + { + desc: "WithBulletListMarker(+)", + options: []commonmark.OptionFunc{ + commonmark.WithBulletListMarker("+"), + }, + input: `
  • list item
`, + expected: `+ list item`, + }, + { + desc: "WithBulletListMarker(*)", + options: []commonmark.OptionFunc{ + commonmark.WithBulletListMarker("*"), + }, + input: `
  • list a
  • list b
`, + expected: "* list a\n\n\n\n* list b", + }, + { + desc: "WithBulletListMarker(*)", + options: []commonmark.OptionFunc{ + commonmark.WithBulletListMarker("*"), + commonmark.WithListEndComment(false), + }, + input: `
  • list a
  • list b
`, + expected: "* list a\n\n* list b", + }, + + { + desc: "WithCodeBlockFence", + options: []commonmark.OptionFunc{ + commonmark.WithCodeBlockFence("~~~"), + }, + input: `
hello world
`, + expected: "~~~\nhello world\n~~~", + }, + + { + desc: "WithHeadingStyle(atx)", + options: []commonmark.OptionFunc{ + commonmark.WithHeadingStyle("atx"), + }, + input: `

important
heading

`, + expected: "# important heading", + }, + { + desc: "WithHeadingStyle(setext)", + options: []commonmark.OptionFunc{ + commonmark.WithHeadingStyle("setext"), + }, + input: `

important
heading

`, + expected: "important\n\\\nheading\n=========", + }, + + // TODO: handle other link styles + // { + // desc: "WithLinkStyle(LinkInlined)", + // options: []commonmark.OptionFunc{ + // commonmark.WithLinkStyle(commonmark.LinkInlined), + // }, + // input: `link`, + // expected: "[link](/about)", + // }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin( + tC.options..., + ), + ), + ) + + output, err := conv.ConvertString(tC.input) + if err != nil { + t.Error(err) + } + + if output != tC.expected { + t.Errorf("expected %q but got %q", tC.expected, output) + } + }) + } +} + +func TestOptionFunc_ValdationError(t *testing.T) { + testCases := []struct { + desc string + options []commonmark.OptionFunc + expectedError string + }{ + { + desc: "WithEmDelimiter(__)", + options: []commonmark.OptionFunc{ + commonmark.WithEmDelimiter("__"), + }, + expectedError: `invalid value for EmDelimiter:"__" must be exactly 1 character of "*" or "_"`, + }, + { + desc: "WithEmDelimiter(**)", + options: []commonmark.OptionFunc{ + commonmark.WithEmDelimiter("**"), + }, + expectedError: `invalid value for EmDelimiter:"**" must be exactly 1 character of "*" or "_"`, + }, + + { + desc: "WithStrongDelimiter(_)", + options: []commonmark.OptionFunc{ + commonmark.WithStrongDelimiter("_"), + }, + expectedError: `invalid value for StrongDelimiter:"_" must be exactly 2 characters of "**" or "__"`, + }, + { + desc: "WithStrongDelimiter(*)", + options: []commonmark.OptionFunc{ + commonmark.WithStrongDelimiter("*"), + }, + expectedError: `invalid value for StrongDelimiter:"*" must be exactly 2 characters of "**" or "__"`, + }, + + { + desc: "WithHorizontalRule(* *)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("* *"), + }, + expectedError: `invalid value for HorizontalRule:"* *" must be at least 3 characters of "*", "_" or "-"`, + }, + { + desc: 
"WithHorizontalRule(+++)", + options: []commonmark.OptionFunc{ + commonmark.WithHorizontalRule("+++"), + }, + expectedError: `invalid value for HorizontalRule:"+++" must be at least 3 characters of "*", "_" or "-"`, + }, + + { + desc: "WithBulletListMarker(_)", + options: []commonmark.OptionFunc{ + commonmark.WithBulletListMarker("_"), + }, + expectedError: `invalid value for BulletListMarker:"_" must be one of "-", "+" or "*"`, + }, + + { + desc: "WithCodeBlockFence(~~)", + options: []commonmark.OptionFunc{ + commonmark.WithCodeBlockFence("~~"), + }, + expectedError: "invalid value for CodeBlockFence:\"~~\" must be one of \"```\" or \"~~~\"", + }, + + { + desc: "WithHeadingStyle(ATX)", + options: []commonmark.OptionFunc{ + commonmark.WithHeadingStyle("ATX"), + }, + expectedError: `invalid value for HeadingStyle:"ATX" must be one of "atx" or "setext"`, + }, + { + desc: "WithHeadingStyle(misspelling settext)", + options: []commonmark.OptionFunc{ + commonmark.WithHeadingStyle("settext"), + }, + expectedError: `invalid value for HeadingStyle:"settext" must be one of "atx" or "setext"`, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin( + tC.options..., + ), + ), + ) + + _, err := conv.ConvertString("bold text") + if err == nil { + t.Fatal("expected an error but got nil") + } + + _, isValidateConfigError := err.(*commonmark.ValidateConfigError) + if !isValidateConfigError { + // t.Error("the error is not of type ValidateConfigError") + } + + actual := err.Error() + if actual != tC.expectedError { + t.Errorf("expected %q but got %q", tC.expectedError, actual) + } + }) + } +} diff --git a/plugin/commonmark/handle_pre_render.go b/plugin/commonmark/handle_pre_render.go new file mode 100644 index 0000000..99a65a9 --- /dev/null +++ b/plugin/commonmark/handle_pre_render.go @@ -0,0 +1,91 @@ +package commonmark + +import ( + "github.com/JohannesKaufmann/dom" 
+ "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils" + "golang.org/x/net/html" +) + +func nameIsBold(node *html.Node) bool { + name := dom.NodeName(node) + return name == "strong" || name == "b" +} +func nameIsItalic(node *html.Node) bool { + name := dom.NodeName(node) + return name == "em" || name == "i" +} + +func nameIsBoldOrItalic(node *html.Node) bool { + return nameIsBold(node) || nameIsItalic(node) +} +func nameIsBothBoldOrItalic(a, b *html.Node) bool { + if nameIsBold(a) && nameIsBold(b) { + return true + } + if nameIsItalic(a) && nameIsItalic(b) { + return true + } + + return false +} + +func nameIsPre(node *html.Node) bool { + name := dom.NodeName(node) + return name == "pre" +} +func nameIsInlineCode(node *html.Node) bool { + name := dom.NodeName(node) + return name == "code" || name == "var" || name == "samp" || name == "kbd" || name == "tt" +} + +func nameIsLink(node *html.Node) bool { + return dom.NodeName(node) == "a" +} + +func nameIsBothLink(a, b *html.Node) bool { + return dom.NodeName(a) == "a" && dom.NodeName(b) == "a" +} + +func nameIsHeading(node *html.Node) bool { + name := dom.NodeName(node) + + if name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" { + return true + } + return false +} + +// func nameIsBlockquote(node *html.Node) bool { +// return dom.NodeName(node) == "blockquote" +// } + +func (c *commonmark) handlePreRender(ctx converter.Context, doc *html.Node) { + domutils.RenameFakeSpans(ctx, doc) + + // domutils.SplitUp(ctx, doc, nameIsBoldOrItalic, nameIsLink, atom.Span) + + // domutils.SplitUp(ctx, doc, nameIsLink, nameIsHeading, atom.Div) + // domutils.SplitUp(ctx, doc, nameIsLink, nameIsBlockquote, atom.Div) + + // - - - Bold / Italic - - - // + domutils.RemoveRedundant(doc, nameIsBothBoldOrItalic) + domutils.MergeAdjacent(doc, nameIsBoldOrItalic) + + // domutils.MovePunctuation(ctx, doc, nameIsBoldOrItalic) + + 
// - - - Code - - - // + domutils.RemoveEmptyCode(ctx, doc) + domutils.SwapTags(ctx, doc, nameIsInlineCode, nameIsPre) + domutils.MergeAdjacent(doc, nameIsInlineCode) + + domutils.AddSpace(ctx, doc, nameIsBoldOrItalic, nameIsInlineCode) + + // - - - Link - - - // + domutils.RemoveRedundant(doc, nameIsBothLink) + domutils.SwapTags(ctx, doc, nameIsBoldOrItalic, nameIsLink) + + // - - - Headings - - - // + domutils.SwapTags(ctx, doc, nameIsLink, nameIsHeading) + domutils.LeafBlockAlternatives(ctx, doc) +} diff --git a/plugin/commonmark/handle_render.go b/plugin/commonmark/handle_render.go new file mode 100644 index 0000000..5633ec5 --- /dev/null +++ b/plugin/commonmark/handle_render.go @@ -0,0 +1,47 @@ +package commonmark + +import ( + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +func (c *commonmark) handleRender(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + name := dom.NodeName(n) + + switch name { + case "strong", "b", + "em", "i": + return c.renderBoldItalic(ctx, w, n) + case "hr": + return c.renderDivider(ctx, w, n) + case "br": + return c.renderBreak(ctx, w, n) + case "ul", "ol": + return c.renderListContainer(ctx, w, n) + + case "pre": + return c.renderBlockCode(ctx, w, n) + case "code", + "var", "samp", "kbd", "tt": + return c.renderInlineCode(ctx, w, n) + + case "blockquote": + return c.renderBlockquote(ctx, w, n) + + case "h1", "h2", "h3", "h4", "h5", "h6": + return c.renderHeading(ctx, w, n) + + case "img": + return c.renderImage(ctx, w, n) + + case "a": + return c.renderLink(ctx, w, n) + + case "#comment": + return c.renderComment(ctx, w, n) + } + + return converter.RenderTryNext + +} diff --git a/plugin/commonmark/options.go b/plugin/commonmark/options.go new file mode 100644 index 0000000..9d28b46 --- /dev/null +++ b/plugin/commonmark/options.go @@ -0,0 +1,118 @@ +package commonmark + +type linkStyle string + +const ( + // For example: 
+ // + // [view more](/about.html) + LinkInlined linkStyle = "inlined" + + LinkReferencedIndex linkStyle = "referenced_index" + LinkReferencedShort linkStyle = "referenced_short" +) + +type headingStyle string + +const ( + // HeadingATX is the heading style of prefixing the heading with "#" signs indicating the level. For example: + // + // ## Heading + HeadingATX headingStyle = "atx" + + // HeadingSetext is the heading style of putting "=" or "-" on the followed line. For example: + // + // Heading + // ------- + HeadingSetext headingStyle = "setext" +) + +// config to customize the output. You can change stuff like +// the character that is used for strong text. +type config struct { + // _ or * + // + // default: * + EmDelimiter string + + // ** or __ + // + // default: ** + StrongDelimiter string + + // Any Thematic break + // + // default: "* * *" + HorizontalRule string + + // "-", "+", or "*" + // + // default: "-" + BulletListMarker string + + DisableListEndComment bool + + // "indented" or "fenced" + // + // default: "indented" + // TODO: CodeBlockStyle string + + // ``` or ~~~ + // + // default: ``` + CodeBlockFence string + + // "setext" or "atx" + // + // default: "atx" + HeadingStyle headingStyle + + // TODO: LineBreakStyle string "hard" or "soft" + + // "inlined" or "referenced_index" or "referenced_short" + // + // default: inlined + LinkStyle linkStyle + + // ----// + // basic, disabled + // default: basic + // TODO: EscapeMode string + // TODO: AssembleAbsoluteURL +} + +func fillInDefaultConfig(cfg *config) config { + if cfg.EmDelimiter == "" { + // The new default is now "*" (instead of "_") as that works better inside words. 
+ cfg.EmDelimiter = "*" + } + if cfg.StrongDelimiter == "" { + cfg.StrongDelimiter = "**" + } + + if cfg.HorizontalRule == "" { + cfg.HorizontalRule = "* * *" + } + + if cfg.BulletListMarker == "" { + cfg.BulletListMarker = "-" + } + + // TODO: also check for spelling mistakes in "indented" + // if opt.CodeBlockStyle == "" { + // opt.CodeBlockStyle = "indented" + // } + if cfg.CodeBlockFence == "" { + cfg.CodeBlockFence = "```" + } + + if cfg.HeadingStyle == "" { + cfg.HeadingStyle = "atx" + } + + if cfg.LinkStyle == "" { + cfg.LinkStyle = LinkInlined + } + + return *cfg +} diff --git a/plugin/commonmark/render_blockquote.go b/plugin/commonmark/render_blockquote.go new file mode 100644 index 0000000..261a58f --- /dev/null +++ b/plugin/commonmark/render_blockquote.go @@ -0,0 +1,31 @@ +package commonmark + +import ( + "bytes" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "golang.org/x/net/html" +) + +func (c *commonmark) renderBlockquote(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + var buf bytes.Buffer + ctx.RenderChildNodes(ctx, &buf, n) + + content := buf.Bytes() + content = bytes.TrimSpace(content) + if content == nil { + return converter.RenderSuccess + } + + content = textutils.TrimConsecutiveNewlines(content) + content = textutils.PrefixLines(content, []byte{'>', ' '}) + + w.WriteRune('\n') + w.WriteRune('\n') + w.Write(content) + w.WriteRune('\n') + w.WriteRune('\n') + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_bold_italic.go b/plugin/commonmark/render_bold_italic.go new file mode 100644 index 0000000..50586fc --- /dev/null +++ b/plugin/commonmark/render_bold_italic.go @@ -0,0 +1,39 @@ +package commonmark + +import ( + "bytes" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + 
"golang.org/x/net/html" +) + +func (c commonmark) getDelimiter(n *html.Node) []byte { + name := dom.NodeName(n) + if name == "strong" || name == "b" { + return []byte(c.StrongDelimiter) + } else if name == "em" || name == "i" { + return []byte(c.EmDelimiter) + } else { + return nil + } +} +func (c commonmark) renderBoldItalic(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + var buf bytes.Buffer + ctx.RenderChildNodes(ctx, &buf, n) + + // Depending on the options & whether it is bold or italic there + // is going to be a different delimiter. + delimiter := c.getDelimiter(n) + content := buf.Bytes() + + // If there is a newline character between the start and end delimiter + // the delimiters won't be recognized. Either we remove all newline characters + // OR on _every_ line we put start & end delimiters. + content = textutils.DelimiterForEveryLine(content, delimiter) + + w.Write(content) + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_bold_italic_test.go b/plugin/commonmark/render_bold_italic_test.go new file mode 100644 index 0000000..d63cae0 --- /dev/null +++ b/plugin/commonmark/render_bold_italic_test.go @@ -0,0 +1,132 @@ +package commonmark_test + +import ( + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func TestNewCommonmarkPlugin_Italic(t *testing.T) { + const nonBreakingSpace = '\u00A0' + const zeroWidthSpace = '\u200b' + + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "simple", + input: `

Text

`, + expected: `*Text*`, + }, + { + desc: "normal text surrounded by italic", + input: `ItalicNormalItalic`, + expected: `*Italic*Normal*Italic*`, + }, + { + desc: "italic text surrounded by normal", + input: `NormalItalicNormal`, + expected: `Normal*Italic*Normal`, + }, + { + desc: "with spaces inside", + input: `

Text

`, + expected: `*Text*`, + }, + { + desc: "with delimiter inside", + input: `

*A*B*

`, + expected: `*\*A\*B\**`, + }, + { + desc: "adjacent", + input: `AB C`, + expected: `*AB* *C*`, + }, + { + desc: "adjacent and lots of spaces", + input: ` A B C `, + expected: `*A B* *C*`, + }, + { + desc: "nested", + input: `A B C`, + expected: `*A B C*`, + }, + { + desc: "nested and lots of spaces", + input: ` A B C `, + expected: `*A B C*`, + }, + { + desc: "mixed nested 1", + input: `A B C`, + expected: `*A **B** C*`, + }, + { + desc: "mixed nested 2", + input: `A B C`, + expected: `**A *B* C**`, + }, + { + desc: "mixed different italic", + input: `ABC`, + expected: `*ABC*`, + }, + + { + desc: "next to each other in other containers", + input: `
+ A +
B
+ C +
`, + expected: "*A*\n\n*B*\n\n*C*", + }, + + // - - - - // + { + desc: "empty italic #1", + input: `beforeafter`, + expected: `beforeafter`, + }, + { + desc: "empty italic #2", + input: `before after`, + expected: `before after`, + }, + { + desc: "empty italic #3", + input: `before after`, + expected: `before after`, + }, + { + desc: "italic with non-breaking-space", + input: `before` + string(nonBreakingSpace) + `after`, + expected: `before` + string(nonBreakingSpace) + `after`, + }, + { + desc: "italic with zero-width-space", + input: `before` + string(zeroWidthSpace) + `after`, + expected: `before*` + string(zeroWidthSpace) + `*after`, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins(commonmark.NewCommonmarkPlugin()), + ) + + out, err := conv.ConvertString(run.input) + if err != nil { + t.Error(err) + } + if out != run.expected { + t.Errorf("expected %q but got %q", run.expected, out) + } + }) + } +} diff --git a/plugin/commonmark/render_break.go b/plugin/commonmark/render_break.go new file mode 100644 index 0000000..e89c4b7 --- /dev/null +++ b/plugin/commonmark/render_break.go @@ -0,0 +1,13 @@ +package commonmark + +import ( + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +func (c *commonmark) renderBreak(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + w.Write(marker.BytesMarkerLineBreak) + w.Write(marker.BytesMarkerLineBreak) + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_code.go b/plugin/commonmark/render_code.go new file mode 100644 index 0000000..d5fae22 --- /dev/null +++ b/plugin/commonmark/render_code.go @@ -0,0 +1,149 @@ +package commonmark + +import ( + "bytes" + "strings" + "unicode/utf8" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + 
"github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +func (c *commonmark) renderInlineCode(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + // TODO: configure delimeter in options? + fenceChar := '`' + + codeContent, _ := getCodeWithoutTags(n) + + // TODO: debug flag? + if len(codeContent) == 0 { + // fmt.Println("expected an empty inline code to be already removed") + // panic("expected an empty inline code to be already removed") + } + // TODO: configurable function to decide if inline or block? + if bytes.Contains(codeContent, []byte("\n")) { + // fmt.Println("inline code contains newlines") + // return c.renderBlockCode(ctx, w, n, render) + } + + if bytes.TrimSpace(codeContent) == nil { + // No stripping occurs if the code span contains _only_ spaces: + w.WriteRune(fenceChar) + w.Write(codeContent) + w.WriteRune(fenceChar) + return converter.RenderSuccess + } + + // Newlines in the text aren't great, since this is inline code and not a code block. + // Newlines will be stripped anyway in the browser, but it won't be recognized as code + // from the markdown parser when there is more than one newline. 
+ codeContent = textutils.CollapseInlineCodeContent(codeContent) + + code := string(codeContent) + + maxCount := textutils.CalculateCodeFenceOccurrences(fenceChar, code) + maxCount++ + + fence := strings.Repeat(string(fenceChar), maxCount) + + // Code contains a backtick as first character + if strings.HasPrefix(code, "`") { + code = " " + code + } + // Code contains a backtick as last character + if strings.HasSuffix(code, "`") { + code = code + " " + } + + w.WriteString(fence) + w.WriteString(code) + w.WriteString(fence) + + return converter.RenderSuccess +} +func (c *commonmark) renderBlockCode(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + code, infoString := getCodeWithoutTags(n) + + if bytes.HasSuffix(code, []byte("\n")) { + code = code[:len(code)-1] + } + + fenceChar, _ := utf8.DecodeRuneInString(c.CodeBlockFence) + fence := textutils.CalculateCodeFence(fenceChar, string(code)) + + // We want to keep the original content inside the code block untouched. + // Because multiple newlines would be trimmed, we temporarily replace it with another character. 
+ code = bytes.ReplaceAll(code, []byte("\n"), marker.BytesMarkerCodeBlockNewline) + + w.WriteString("\n\n") + w.WriteString(fence) + w.WriteString(infoString) + w.WriteRune('\n') + w.Write(code) + w.WriteRune('\n') + w.WriteString(fence) + w.WriteString("\n\n") + + return converter.RenderSuccess +} + +func getCodeLanguage(n *html.Node) string { + class := dom.GetAttributeOr(n, "class", "") + + parts := strings.Split(class, " ") + for _, part := range parts { + if !strings.Contains(part, "language-") && !strings.Contains(part, "lang-") { + continue + } + + part = strings.Replace(part, "language-", "", 1) + part = strings.Replace(part, "lang-", "", 1) + + return part + } + + return "" +} +func getCodeWithoutTags(startNode *html.Node) ([]byte, string) { + var buf bytes.Buffer + var infoString string + + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && (n.Data == "code" || n.Data == "pre") { + + // TODO: what if multiple elements have an info string? + if infoString == "" { + infoString = getCodeLanguage(n) + } + } + + // - - - // + + if n.Type == html.ElementNode && (n.Data == "style" || n.Data == "script" || n.Data == "textarea") { + return + } + if n.Type == html.ElementNode && (n.Data == "br" || n.Data == "div") { + buf.WriteString("\n") + } + + if n.Type == html.TextNode { + // if strings.TrimSpace(n.Data) == "" && strings.Contains(n.Data, "\n") { + // buf.WriteString("\n") + // } + buf.WriteString(n.Data) + return + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + + f(startNode) + + return buf.Bytes(), infoString +} diff --git a/plugin/commonmark/render_comment.go b/plugin/commonmark/render_comment.go new file mode 100644 index 0000000..a714112 --- /dev/null +++ b/plugin/commonmark/render_comment.go @@ -0,0 +1,25 @@ +package commonmark + +import ( + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils" + 
"golang.org/x/net/html" +) + +func (c *commonmark) renderComment(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + + if n.Data == domutils.ListEndCommentData { + // We definetely want to render the list end comments + // that were just added + w.WriteRune('\n') + w.WriteRune('\n') + _ = html.Render(w, n) + w.WriteRune('\n') + w.WriteRune('\n') + return converter.RenderSuccess + + } + + // Fallback to the normal settings for comments + return converter.RenderTryNext +} diff --git a/plugin/commonmark/render_divider.go b/plugin/commonmark/render_divider.go new file mode 100644 index 0000000..5fd6c59 --- /dev/null +++ b/plugin/commonmark/render_divider.go @@ -0,0 +1,15 @@ +package commonmark + +import ( + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +func (c *commonmark) renderDivider(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + + w.WriteString("\n\n") + w.WriteString(c.HorizontalRule) + w.WriteString("\n\n") + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_heading.go b/plugin/commonmark/render_heading.go new file mode 100644 index 0000000..fc978b8 --- /dev/null +++ b/plugin/commonmark/render_heading.go @@ -0,0 +1,148 @@ +package commonmark + +import ( + "bytes" + "regexp" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +// TODO: remove regex +var multipleSpacesR = regexp.MustCompile(` +`) + +func (r *commonmark) setextUnderline(level int, width int) []byte { + line := "-" + if level == 1 { + line = "=" + } + + return bytes.Repeat([]byte(line), width) +} +func (r *commonmark) atxPrefix(level int) []byte { + return bytes.Repeat([]byte("#"), level) +} + +func getHeadingLevel(name string) int { + switch name { + case 
"h1": + return 1 + case "h2": + return 2 + case "h3": + return 3 + case "h4": + return 4 + case "h5": + return 5 + case "h6": + return 6 + default: + return 6 + } +} +func runeCount(chars []rune) (count int) { + for _, char := range chars { + if char == marker.MarkerEscaping { + continue + } + count++ + } + return +} +func getUnderlineWidth(content []byte, minVal int) int { + var width int + + parts := bytes.Split(content, []byte("\n")) + for _, part := range parts { + // Count how wide the line should be, + // while using RuneCount to correctly count ä, ö, ... + // + // TODO: optimize function w := utf8.RuneCount(part) + w := runeCount([]rune(string(part))) + if w > width { + width = w + } + } + + // Technically the minimum value is only one character, + // but one dash could easily trigger a heading. + if width < minVal { + return minVal + } + + return width +} + +func escapePoundSignAtEnd(s []byte) []byte { + // -1 # + // -2 placeholder + // -3 maybe \ + + if s[len(s)-1] != '#' { + // We dont have a # at the end, + // so there is no work to do... + return s + } + if len(s) >= 3 && s[len(s)-3] == '\\' { + // It is already escaped, + // so there is no work to do... + return s + } + + // Because we have a # at the end, + // we should manually force the escaping + // by overriding the placeholder. 
+ s[len(s)-2] = '\\' + + return s +} + +func (c *commonmark) renderHeading(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + // ctx = context.WithValue(ctx, "is_inside_heading", true) + + level := getHeadingLevel(dom.NodeName(n)) + + var buf bytes.Buffer + ctx.RenderChildNodes(ctx, &buf, n) + content := buf.Bytes() + + if bytes.TrimFunc(content, marker.IsSpace) == nil { + return converter.RenderSuccess + } + + if c.HeadingStyle == HeadingSetext && level < 3 { + content = textutils.EscapeMultiLine(content) + + width := getUnderlineWidth(content, 3) + underline := c.setextUnderline(level, width) + + w.WriteString("\n\n") + w.Write(content) + w.WriteRune('\n') + w.Write(underline) + w.WriteString("\n\n") + } else { + content = bytes.ReplaceAll(content, marker.BytesMarkerLineBreak, []byte(" ")) + content = bytes.ReplaceAll(content, []byte("\n"), []byte(" ")) + content = bytes.ReplaceAll(content, []byte("\r"), []byte(" ")) + // Replace multiple spaces by one space. 
+ content = multipleSpacesR.ReplaceAll(content, []byte(" ")) + + content = bytes.TrimSpace(content) + + // A # sign at the end would be removed otherwise + content = escapePoundSignAtEnd(content) + + w.WriteString("\n\n") + w.Write(c.atxPrefix(level)) + w.WriteRune(' ') + w.Write(content) + w.WriteString("\n\n") + } + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_image.go b/plugin/commonmark/render_image.go new file mode 100644 index 0000000..481db95 --- /dev/null +++ b/plugin/commonmark/render_image.go @@ -0,0 +1,63 @@ +package commonmark + +import ( + "bytes" + "strings" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "golang.org/x/net/html" +) + +func escapeAlt(altString string) string { + alt := []byte(altString) + + var buf bytes.Buffer + for i := range alt { + if alt[i] == '[' || alt[i] == ']' { + prevIndex := i - 1 + if prevIndex < 0 || alt[prevIndex] != '\\' { + buf.WriteRune('\\') + } + } + buf.WriteByte(alt[i]) + } + + return buf.String() +} + +func (c *commonmark) renderImage(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + src := dom.GetAttributeOr(n, "src", "") + src = strings.TrimSpace(src) + if src == "" { + return converter.RenderTryNext + } + + src = ctx.AssembleAbsoluteURL(ctx, converter.ElementImage, src) + + title := dom.GetAttributeOr(n, "title", "") + title = strings.ReplaceAll(title, "\n", " ") + + alt := dom.GetAttributeOr(n, "alt", "") + alt = strings.ReplaceAll(alt, "\n", " ") + + // The alt description will be placed between two square brackets `[alt]` + // so make sure that those characters are escaped. 
+ alt = escapeAlt(alt) + + w.WriteRune('!') + w.WriteRune('[') + w.WriteString(alt) + w.WriteRune(']') + w.WriteRune('(') + w.WriteString(src) + if title != "" { + // The destination and title must be seperated by a space + w.WriteRune(' ') + w.Write(textutils.SurroundByQuotes([]byte(title))) + } + w.WriteRune(')') + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/render_link.go b/plugin/commonmark/render_link.go new file mode 100644 index 0000000..7b65321 --- /dev/null +++ b/plugin/commonmark/render_link.go @@ -0,0 +1,97 @@ +package commonmark + +import ( + "bytes" + "strings" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +// link in commonmark contains +// - the link text (the visible text) +// - a link destination (the URI that is the link destination) +// - an optional link title +type link struct { + *html.Node + + before []byte + content []byte + after []byte + + href string + title string +} + +func (c *commonmark) renderLinkInlined(w converter.Writer, l *link) converter.RenderStatus { + + w.Write(l.before) + w.WriteRune('[') + w.Write(l.content) + w.WriteRune(']') + w.WriteRune('(') + w.WriteString(l.href) + if l.title != "" { + // The destination and title must be seperated by a space + w.WriteRune(' ') + w.Write(textutils.SurroundByQuotes([]byte(l.title))) + } + w.WriteRune(')') + w.Write(l.after) + + return converter.RenderSuccess +} + +func (c *commonmark) renderLink(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + ctx = ctx.WithValue("is_inside_link", true) + + href := dom.GetAttributeOr(n, "href", "") + + href = strings.TrimSpace(href) + href = ctx.AssembleAbsoluteURL(ctx, converter.ElementLink, href) + + title := dom.GetAttributeOr(n, "title", "") + title = strings.ReplaceAll(title, 
"\n", " ") + + l := &link{ + Node: n, + href: href, + title: title, + } + + var buf bytes.Buffer + ctx.RenderChildNodes(ctx, &buf, n) + content := buf.Bytes() + + if bytes.TrimFunc(content, marker.IsSpace) == nil { + // Fallback to the title + content = []byte(l.title) + } + if bytes.TrimSpace(content) == nil { + return converter.RenderSuccess + } + + if l.href == "" { + // A link without href is valid, like e.g. [text]() + // But a title would make it invalid. + l.title = "" + } + + leftExtra, trimmed, rightExtra := textutils.SurroundingSpaces(content) + + trimmed = textutils.EscapeMultiLine(trimmed) + + l.before = leftExtra + l.content = trimmed + l.after = rightExtra + + switch c.LinkStyle { + case LinkInlined: + return c.renderLinkInlined(w, l) + default: + return converter.RenderTryNext + } +} diff --git a/plugin/commonmark/render_list.go b/plugin/commonmark/render_list.go new file mode 100644 index 0000000..d737960 --- /dev/null +++ b/plugin/commonmark/render_list.go @@ -0,0 +1,106 @@ +package commonmark + +import ( + "bytes" + "fmt" + "strconv" + "unicode/utf8" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" +) + +func getStartAt(node *html.Node) int { + startVal := dom.GetAttributeOr(node, "start", "1") + startAt, err := strconv.Atoi(startVal) + if err != nil { + startAt = 1 + } + + return startAt +} + +func (c commonmark) getPrefixFunc(n *html.Node, sliceLength int) func(int) string { + startAt := getStartAt(n) + + return func(sliceIndex int) string { + if n.Data == "ul" { + return c.BulletListMarker + " " + } + + currentIndex := startAt + sliceIndex + lastIndex := startAt + sliceLength - 1 + maxLength := utf8.RuneCountInString(strconv.Itoa(lastIndex)) + + // Pad the numbers so that all prefix numbers in the list take up the same space + // `%02d.` -> 
"01. " + format := `%0` + strconv.Itoa(maxLength) + `d. ` + return fmt.Sprintf(format, currentIndex) + } +} + +func renderMultiLineListItem(w converter.Writer, content []byte, indentCount int) { + lines := marker.SplitFunc(content, func(r rune) bool { + return r == '\n' || r == marker.MarkerLineBreak + }) + + for i := range lines { + if i != 0 { + // The first line is already indented through the prefix, + // all other lines need the correct amount of spaces. + w.Write(bytes.Repeat([]byte(" "), indentCount)) + } + w.Write(lines[i]) + + if i < len(lines)-1 { + w.WriteRune('\n') + } + } +} +func (c commonmark) renderListContainer(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + children := dom.AllChildNodes(n) + items := make([][]byte, 0, len(children)) + + for _, child := range children { + var buf bytes.Buffer + ctx.RenderNodes(ctx, &buf, child) + + content := buf.Bytes() + content = bytes.TrimSpace(content) + if content == nil { + continue + } + + items = append(items, content) + } + + if len(items) == 0 { + return converter.RenderSuccess + } + + getPrefix := c.getPrefixFunc(n, len(items)) + indentCount := utf8.RuneCountInString(getPrefix(0)) + + w.WriteString("\n\n") + for i, item := range items { + w.WriteString(getPrefix(i)) + + item = textutils.TrimConsecutiveNewlines(item) + // item = escape.UnEscaper(item) + item = ctx.UnEscapeContent(item) + + // An item might have different lines that each + // must be indented with the correct count of spaces. + renderMultiLineListItem(w, item, indentCount) + + if i < len(items)-1 { + w.WriteRune('\n') + } + } + w.WriteString("\n\n") + + return converter.RenderSuccess +} diff --git a/plugin/commonmark/testdata/.gitattributes b/plugin/commonmark/testdata/.gitattributes new file mode 100644 index 0000000..a8d2daa --- /dev/null +++ b/plugin/commonmark/testdata/.gitattributes @@ -0,0 +1,4 @@ + +# Leave the files untouched. 
Otherwise they might be +# changed when cloning the repo on Windows... +* -text diff --git a/plugin/commonmark/testdata/GoldenFiles/blockquote.in.html b/plugin/commonmark/testdata/GoldenFiles/blockquote.in.html new file mode 100644 index 0000000..81d6ac2 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/blockquote.in.html @@ -0,0 +1,64 @@ + +
+First Line +Second Line +Third Line +
+ + +
+
+
+
+ +
+
+

Start Line

+


+ +


+

End Line

+
+ + + +
+

Paragraph 1

+

Paragraph 2

+

Paragraph 3

+
+ + + +
+

before

+
+

nested

+
+

after

+
+ + + +
+

Heading

+ +
    +
  1. List Item 1
  2. +
  3. List Item 2
  4. +
+ +

A code block:

+
code block content
+
+ + + + +

Not a > blockquote

+ +

+> not a blockquote +

diff --git a/plugin/commonmark/testdata/GoldenFiles/blockquote.out.md b/plugin/commonmark/testdata/GoldenFiles/blockquote.out.md new file mode 100644 index 0000000..bffdddc --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/blockquote.out.md @@ -0,0 +1,46 @@ + + +> First Line Second Line Third Line + + + +> Start Line +> +> End Line + + + +> Paragraph 1 +> +> Paragraph 2 +> +> Paragraph 3 + + + +> before +> +> > nested +> +> after + + + +> ## Heading +> +> 1. List Item 1 +> 2. List Item 2 +> +> A code block: +> +> ``` +> code block content +> ``` + + + +Not a > blockquote + +> not a blockquote \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/bold.in.html b/plugin/commonmark/testdata/GoldenFiles/bold.in.html new file mode 100644 index 0000000..b00372c --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/bold.in.html @@ -0,0 +1,152 @@ + + + + +

some bold and bold text

+ + +

some bold and bold text

+ + +

someboldandboldtext

+ + +
+ + + +

some text

+

some text

+

some text

+ +

sometext

+

some text

+

some text

+ + + + +

normalboldnormal

+ +

boldnormalbold

+ + + + +

very bold text

+ +

very bold text

+ + + + + +

+ hello + +


+ + hello +

+ + + +
+ + bold onebold two +
+ + +

+ one + + two + +

+ +
+ +

ab

+

ab

+

abc

+
+

a b

+
+ + +

+ + Von Max Mustermann, + + + Berlin + +

+ + + +

+ bold and italic +

+ +

+ italic and bold +

+ + + + + +
+

beforemiddleafter

+
+

before.middleafter

+

beforemiddle.after

+

before.middle.after

+
+

before .middle after

+

before middle. after

+

before .middle. after

+
+

before?!!middle?!!after

+
+

before-middle-after

+

before-middle-after

+
+

check it out.

+

check it out?

+

check it out!!!

+ +

!just after

+

just before!

+ +
+ +

heading

!italic!

heading

+ + see here:
blockquote
+ + see here:

paragraph

+ + one.two + + one.two + +
+ + +
before

!paragraph!

after
+
+ diff --git a/plugin/commonmark/testdata/GoldenFiles/bold.out.md b/plugin/commonmark/testdata/GoldenFiles/bold.out.md new file mode 100644 index 0000000..6220d1b --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/bold.out.md @@ -0,0 +1,159 @@ + + + + +some **bold** and **bold** text + + + +some **bold** and **bold** text + + + +some**bold**and**bold**text + +* * * + + + +some text + +some text + +some text + +sometext + +some text + +some text + + + +normal**bold**normal + +**bold**normal**bold** + + + +**very bold text** + +**very bold text** + + + +***hello*** + +* * * + +***hello*** + + + + + +**bold onebold two** + +***one*** ***two*** + + + +**ab** + +**ab** + +**abc** + +* * * + +**a** **b** + +**Von Max Mustermann,** **Berlin** + + + +***bold and italic*** + +***italic and bold*** + + + +before*middle*after + +* * * + +before*.middle*after + +before*middle.*after + +before*.middle.*after + +* * * + +before *.middle* after + +before *middle.* after + +before *.middle.* after + +* * * + +before*?!!middle?!!*after + +* * * + +before-*middle*-after + +before*-middle-*after + +* * * + +check it out*.* + +check it out*?* + +check it out*!!!* + +*!*just after + +just before*!* + +* * * + +#### heading + +*!italic!* + +#### heading + +**see here:** + +> blockquote + +**see here:** + +paragraph + +[*one.*](/)[two](/) [*one.*](/)[two](/) + +* * * + + + +before + +*!paragraph!* + +after \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/code.in.html b/plugin/commonmark/testdata/GoldenFiles/code.in.html new file mode 100644 index 0000000..c35364d --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/code.in.html @@ -0,0 +1,287 @@ + + + +
inline code
+ +
variable
+ +
sample output
+ +
keyboard input
+ +
teletype text
+ + + +
+ + + +
When x = 3, that means x + 2 = 5
+ +
A simple equation: x = y + 2
+ + + + + + +
before A middle B after
+
beforeAmiddleBafter
+ + +
before A B after
+
beforeABafter
+ + +
ABCDE
+ + + + + +
beforeinline codeafter
+
beforeinline codeafter
+ +
beforeainline codebafter
+
beforeainline codebafter
+
beforeinline codeafter
+
before inline code after
+ +
beforeinline code and inline codeafter
+
beforeinline code and inline codeafter
+ + +
+ + +
before inline code after
+
before inline code after
+ +
before inline code after
+
before inline code after
+ + +
+ + +
before <pre> after
+ + + + + +
before <img> after
+
before after
+
before A middle B after
+ + + +

+The <img> tag is used to embed an image.
+
+The  tag is used to embed an image.
+
+ + + +

+    
    +
  • List Item One
  • +
  • List Item Two
  • +
  • List Item Three
  • +
+
+ + + + + +
An inline code that is empty except spaces:
+
beforeafter
+
before after
+
before after
+ +
before after
+
before after
+
before after
+ +
before after
+
before after
+
before after
+ + +
beforeafter
+
before after
+
before after
+ + +
+
 
+
  
+

+  
+
+ + +
Beginning of code
+ 
+  
+  
+
+
+End of code
+ +
Start of many newlines
+
+
+
+
+
+
+End of many newlines
+ + + +
+ + + +
inline code
+
inline code
+
inline code
+
inline code
+
inline code
+ + +
+ + + +
An inline code that contains backticks:
+
with ` backtick
+
with `` backticks
+
a ``` b ```` c ` d
+
`starting & ending with a backtick`
+ + +
+ + +
An inline code that just contains backticks:
+
before``after
+
before `` after
+
before `` after
+ +
before `` after
+
before `` after
+
before `` after
+ +
before `` after
+
before `` after
+
before `` after
+ + +
+ + + +
```
+ +
~~~
+ +

+Some ```
+totally `````` normal
+` code
+
+ +

+Some ~~~
+totally ~~~~~~ normal
+~ code
+
+ + + + + +
before just code after
+
before
just pre
after
+ +
before
code inside pre
after
+
before
pre inside code
after
+ + +
+ + +
before +// just code +// another line + after
+ +
before
+// just pre
+// another line
+
after
+ +
before
+// code inside pre
+// another line
+
after
+ +
before

+// pre inside code
+// another line
+
after
+ + + + + +
content
+
content
+ + + + + +
Line 0
+    Line 1 AB C
+    Line 2 AB C
+Line 3
+ +
+ +

+    Line 1 AB C
+    Line 2 AB C
+
+ +
+ +

+    Line 1 AB C
+    Line 2 AB C
+
+
+ + + diff --git a/plugin/commonmark/testdata/GoldenFiles/code.out.md b/plugin/commonmark/testdata/GoldenFiles/code.out.md new file mode 100644 index 0000000..6549f61 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/code.out.md @@ -0,0 +1,355 @@ + + + + +`inline code` + +`variable` + +`sample output` + +`keyboard input` + +`teletype text` + +* * * + + + +When `x = 3`, that means `x + 2 = 5` + +A simple equation: `x` = `y` + 2 + + + + + +before `A` middle `B` after + +before`A`middle`B`after + + + +before `A` `B` after + +before`AB`after + +`ABCDE` + + + +before **`inline code`** after + +before *`inline code`* after + +before**a`inline code`b**after + +before**a`inline code`b**after + +before **`inline code`** after + +before **`inline code`** after + +before *`inline code` and `inline code`* after + +before *`inline code` and `inline code`* after + +* * * + +before **`inline code`** after + +before *`inline code`* after + +before **`inline code`** after + +before *`inline code`* after + +* * * + +before **`
`** after
+
+
+
+
+
+before `` after
+
+before after
+
+before `A middle B` after
+
+
+
+```
+
+The  tag is used to embed an image.
+
+The  tag is used to embed an image.
+```
+
+
+
+```
+
+    
+        List Item One
+        List Item Two
+        List Item Three
+    
+```
+
+
+
+
+
+An inline code that is empty except spaces:
+
+beforeafter
+
+before after
+
+before after
+
+before` `after
+
+before ` ` after
+
+before ` ` after
+
+before`  `after
+
+before `  ` after
+
+before `  ` after
+
+beforeafter
+
+before after
+
+before after
+
+```
+
+```
+
+```
+ 
+```
+
+```
+  
+```
+
+```
+
+  
+```
+
+```
+Beginning of code
+ 
+  
+  
+
+
+End of code
+```
+
+```
+Start of many newlines
+
+
+
+
+
+
+End of many newlines
+```
+
+* * *
+
+
+
+`inline code`
+
+`inline code`
+
+`inline code`
+
+`inline code`
+
+`inline code`
+
+* * *
+
+
+
+An inline code that contains backticks:
+
+``with ` backtick``
+
+```with `` backticks```
+
+`````a ``` b ```` c ` d`````
+
+`` `starting & ending with a backtick` ``
+
+* * *
+
+An inline code that just contains backticks:
+
+before``` `` ```after
+
+before``` `` ```after
+
+before``` `` ```after
+
+before ``` `` ``` after
+
+before ``` `` ``` after
+
+before ``` `` ``` after
+
+before ``` `` ``` after
+
+before ``` `` ``` after
+
+before ``` `` ``` after
+
+* * *
+
+
+
+````
+```
+````
+
+```
+~~~
+```
+
+```````
+
+Some ```
+totally `````` normal
+` code
+```````
+
+```
+
+Some ~~~
+totally ~~~~~~ normal
+~ code
+```
+
+
+
+before `just code` after
+
+before
+
+```
+just pre
+```
+
+after
+
+before
+
+```
+code inside pre
+```
+
+after
+
+before
+
+```
+pre inside code
+```
+
+after
+
+* * *
+
+before `// just code // another line` after
+
+before
+
+```
+// just pre
+// another line
+```
+
+after
+
+before
+
+```
+// code inside pre
+// another line
+```
+
+after
+
+before
+
+```
+
+// pre inside code
+// another line
+```
+
+after
+
+
+
+```one
+content
+```
+
+```two
+content
+```
+
+
+
+```
+Line 0
+    Line 1 AB C
+    Line 2 AB C
+Line 3
+```
+
+* * *
+
+```
+
+    Line 1 AB C
+    Line 2 AB C
+```
+
+* * *
+
+```
+
+    Line 1 AB C
+    Line 2 AB C
+
+```
\ No newline at end of file
diff --git a/plugin/commonmark/testdata/GoldenFiles/heading.in.html b/plugin/commonmark/testdata/GoldenFiles/heading.in.html
new file mode 100644
index 0000000..f4ff001
--- /dev/null
+++ b/plugin/commonmark/testdata/GoldenFiles/heading.in.html
@@ -0,0 +1,149 @@
+
+
+
+
+

Heading 1

+

Heading 2

+

Heading 3

+

Heading 4

+
Heading 5
+
Heading 6
+Heading 7 + + + + +

+

+

+

a

+

a

+

a

+


+ + + +

heading with spaces

+

heading with spaces and tabs

+ + +

+ + heading + with + newlines + +

+ +

heading

with
breaks

+



heading with breaks

+ + + + + + + +

#hashtag

+

# Heading

+ + +

#

+

#

+ + +

# Heading #

+

Heading #

+

Heading ##

+ +

Heading \#

+ + +
+ + +

These should not be recognized as headings:

+

not title
===

+

not title
=

+ +

not title
---

+

not title
-

+ +

#not title

+

# not title

+

## not title

+ + + + + + + + +

important h2 heading

+ + + + +
+ +
+ +

Heading 2

+
+
+ +
+ +
+ +

Heading 2

+
Heading 5
+
+
+ +
+ +
+ +

Heading 2

+

+ Description Line 1
+ Description Line 2
+ Description Line 3
+

+
Some quote
+
+
+ +
+ + + + +


More posts from around the site:

+ + + +
+ + + +
+ +
+ +

Heading

+
+
+
+
+ diff --git a/plugin/commonmark/testdata/GoldenFiles/heading.out.md b/plugin/commonmark/testdata/GoldenFiles/heading.out.md new file mode 100644 index 0000000..c190577 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/heading.out.md @@ -0,0 +1,133 @@ + + + + +# Heading 1 + +## Heading 2 + +### Heading 3 + +#### Heading 4 + +##### Heading 5 + +###### Heading 6 + +Heading 7 + + + +# a + +# a + +# a + + + +## heading with spaces + +## heading with spaces and tabs + +## heading with newlines + +## heading with breaks + +## heading with breaks + + + +# #hashtag + +# # Heading + + + +# \# + +# \# + + + +# # Heading \# + +# Heading \# + +# Heading #\# + + + +# Heading \\# + +* * * + +These should not be recognized as headings: + +not title +\=== + +not title +\= + +not title +\--- + +not title +\- + +#not title + +\# not title + +\## not title + + + + + +## **important** `h2` *heading* + + + +* * * + +> ## [Heading 2](/page.html) + +* * * + +> [**Heading 2** +> \ +> **Heading 5**](/page.html) + +* * * + +> [**Heading 2** +> \ +> Description Line 1 +> \ +> Description Line 2 +> \ +> Description Line 3 +> \ +> "Some quote"](/page.html) + +* * * + + + +#### More posts from around the site: + +* * * + + + +### **Heading** \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/image.in.html b/plugin/commonmark/testdata/GoldenFiles/image.in.html new file mode 100644 index 0000000..a7f614e --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/image.in.html @@ -0,0 +1,118 @@ + + + +

+

+ + +

+

+ + + +

alt text

+

+

alt text

+ + + + +

  the  alt  attribute

+

the alt "attribute"

+

the alt 'attribute'

+

the
+alt
+attribute

+

the [alt] attribute

+

the (alt) attribute

+

the ](alt) attribute

+ + +
+ + +

+

+

+

+

+

+

+ + + + + +

+ +

+ + + + + +

+ Such Icon + Email Icon +

+ +

+ Such Icon + Email Icon +

+ + +
+ + +

+ + + image alt text + + +
+ + + + image alt text + +

+ + + + + + + + alt text + + + +
+ + +
+
+ + + alt text + +
+
+ caption text +
+
+ diff --git a/plugin/commonmark/testdata/GoldenFiles/image.out.md b/plugin/commonmark/testdata/GoldenFiles/image.out.md new file mode 100644 index 0000000..6a533ba --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/image.out.md @@ -0,0 +1,95 @@ + + + + + + +![](/relative_url) + +![](www.example.com/absolute_url) + + + +![alt text](/url) + +![](/url "title text") + +![alt text](/url "title text") + + + +![ the alt attribute ](/url) + +![the alt "attribute"](/url) + +![the alt 'attribute'](/url) + +![the alt attribute](/url) + +![the \[alt\] attribute](/url) + +![the (alt) attribute](/url) + +![the \](alt) attribute](/url) + +* * * + +![](/url " the title attribute ") + +![](/url 'the title "attribute"') + +![](/url "the title 'attribute'") + +![](/url "the title attribute") + +![](/url "the [title] attribute") + +![](/url "the (title) attribute") + +![](/url "the )(title) attribute") + + + + + +![](data:image/gif;base64,abcdefghij) + +![](data:image/svg+xml;utf8,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20width='1080'%20height='956'%3E%3C/svg%3E) + + + + + +[*![Such Icon](/search.svg)*]() [*![Email Icon](/email.svg)*]() + +[*![Such Icon](/search.svg)*]() [*![Email Icon](/email.svg)*]() + +* * * + + + +[![image alt text](/image.jpg "image title text")](/page.html "link title text") + + + +[![image alt text](/src)]() + + + +![alt text](/image.jpg "title text") + +* * * + +![alt text](/image.jpg "title text") + +caption text \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/link.in.html b/plugin/commonmark/testdata/GoldenFiles/link.in.html new file mode 100644 index 0000000..15287b8 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/link.in.html @@ -0,0 +1,267 @@ + + + +

no href

+

no href

+

no href

+ +
+ + +

+

+

+

+ +

+


+ + +

+ + +
+ + +

relative link

+

absolute link

+

query params

+

fragment heading

+

fragment

+

+ Wir freuen uns über eine + Mail! +

+ + + + +

broken link

+

broken link

+ + +

with whitespace around

+ +

with space inside

+ + + + + + +

content

+

content

+ + +

content

+

content

+

content

+

content

+ + + + + + + + + + + +
+

a(b)[c]

+

a]

+
+ + +
+

a(b)[c]

+ +

[a]

+

[a

+

a]

+ +

(a)

+

(a

+

a)

+
+ + + +

AB

+

A B

+ +

beforeAmiddleBafter

+

before A middle B after

+

before A middle B after

+ + + + + + +

before content after

+

before content after

+

before content after

+ + +
+ + + +

bold and italic text

+

bold and italic text

+ + + + +

Start Line

+


+ +


+

End Line

+
+ + + +


+

newlines around the link content

+


+
+ + + + + + +

+ + + + + + + + + + + + +

before a inside strong after

+

beforea inside strongafter

+ +

before strong inside a after

+

beforestrong inside aafter

+ + +
+

before middle after

+

before middle after

+

beforemiddleafter

+ +

before middle after

+

before middle after

+

before middle after

+ +

beforewith empty spanafter

+

before with empty span after

+

before with empty span after

+ +
+ +

beforea bafter

+

beforeabafter

+

beforea b cafter

+
+ +
+ +

beforea inside italicafter

+

beforeitalic inside aafter

+ +

beforea inside bafter

+

beforeb inside aafter

+ +

beforealready boldafter

+ +
+ +

beforemiddleafter

+

beforeinside bold & italicafter

+

beforeinside bold & italicabafter

+

beforeinside bold & italicafter

+

beforeabcdeafter

+ +
+ +

beforeitaliclinkstrongafter

+ + + + + + + + + +
+

before

+ another link +

after

+
+
+ diff --git a/plugin/commonmark/testdata/GoldenFiles/link.out.md b/plugin/commonmark/testdata/GoldenFiles/link.out.md new file mode 100644 index 0000000..072c830 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/link.out.md @@ -0,0 +1,242 @@ + + + + +[no href]() + +[no href]() + +[no href]() + +* * * + + + + + +[link title](/no_content "link title") + +* * * + +[relative link](/page.html) + +[absolute link](http://simple.org/) + +[query params](/page?b=1&a=2) + +[fragment heading](#heading) + +[fragment](#) + +Wir freuen uns über eine [Mail](mailto:hi@example.com?body=Hello%0AJohannes)! + + + +[broken link](/page) + +[broken link](/page%0A%0A.html) + +[with whitespace around](example.com) + +[with space inside](http://Open%20Demo) + + + + + +[content](/ "link title") + +[content](/ " link title ") + + + +[content](/ " link title ") + +[content](/ '"link title"') + +[content](/ "'link title'") + +[content](/ '"link title"') + + + + + +- [a(b)\[c\]](/page.html) +- [a\]](/page.html) + + + + + +[a(b)\[c\]](/page.html) + +[a\]](/page.html) + + + +a(b)\[c] + +\[a] + +[a + +a] + +(a) + +(a + +a) + + + +[A](/)[B](/) + +[A](/) [B](/) + +before[A](/)middle[B](/)after + +before [A](/) middle [B](/) after + +before [A](/) middle [B](/) after + + + + + +before [content](/) after + +before [content](/) after + +before [content](/) after + +* * * + + + +[**bold** and *italic* text](/) + +**bold [and *italic*](/) text** + + + +[Start Line +\ +End Line](/) + + + +[newlines around the link content](/) + + + +- [first text + \ + second text](/) + + + +[![](/image.jpg)](/page.html) + + + +[first text +\ +![](/image.jpg) +\ +second text](/page.html) + + + +[**Heading A** +\ +**Heading B**](/page.html) + + + +[title](/ "title") + + + +before [**a inside strong**](/) after + +before[**a inside strong**](/)after + +before [**strong inside a**](/) after + +before[**strong inside a**](/)after + +before [**middle**](/) after + +before [**middle**](/) after + 
+before[**middle**](/)after + +before [**middle**](/) after + +before [**middle**](/) after + +before [**middle**](/) after + +before**[with empty span](/)**after + +before **[with empty span](/)** after + +before **[with empty span](/)** after + +* * * + +before**[a](/) b**after + +before**[a](/)b**after + +before**[a](/) b [c](/)**after + +* * * + +before[*a inside italic*](/)after + +before[*italic inside a*](/)after + +before[**a inside b**](/)after + +before[**b inside a**](/)after + +before[**already bold**](/)after + +* * * + +before**[middle](/)**after + +before**[*inside bold & italic*](/)**after + +before***[inside bold & italic](/)a*b**after + +before**[inside bold & italic](/)**after + +before**a*b[c](/)d*e**after + +* * * + +before***italic*[link](/)strong**after + + + +[before +\ +another link +\ +after](/a) \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/list.in.html b/plugin/commonmark/testdata/GoldenFiles/list.in.html new file mode 100644 index 0000000..c65903e --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/list.in.html @@ -0,0 +1,191 @@ +
+

A paragraph

+
    +
  • 1
  • +
  • +
  • +

    2

    +
  • +
  • +
      +
    • 3.1
    • +
    • 3.2
    • +
    +
  • +
  • + 4 Before +
      +
    • 4.1
    • +
    • +

      4.2

      +
    • +
    +
  • +
  • +
      +
    • 5.1
    • +
    +

    5 After

    +
  • +
  • + + 6 Before
    + 6 also Before +
    +
      +
    • 6A.1
    • +
    + 6 Between +
      +
    • 6B.1
    • +
    +

    6 After

    +

    6 also After

    +
  • +
  • 7
  • +
+
+ + +
+ + +
+

And also other lists...

+ +
    +
  • First
  • +
  • +

    Someone once said:

    +
    My famous quote
    + - someone +
  • +
+
    +
  1. Nine
  2. +
  3. Ten
  4. +
  5. +
      +
    1. Eleven.A
    2. +
    3. Eleven.B
    4. +
    +
  6. +
  7. +

    Someone once said:

    +
    My famous quote
    + - someone +
  8. +
  9. Thirteen
  10. +
+ +
  • List Item without Container
  • +
    + + +
    + + + +
      +
    1. one
    2. +
    3. two
    4. +
    + + +
    + + +
    + +
      +
    1. a
    2. +
    3. b
    4. +
    + + +
      +
    1. a
    2. +
    3. b
    4. +
    +
    + + +
    + + +
    +
      +
    • Before + text after
    • +
    • Before + text after
    • +
    +
    + + +
    + + + + + +
    + + +
    +
    • List 1
    +
    • List 2
    +
      +
      • List 3
      + +
      +
      +
      • List 4
      +

      text between

      +
      • List 5
      +

      +
      • List 6
      +


      +
      • List 7
      +
      +
      + +
      + +
        +
      • +
        • List 1
        +
        • List 2
        +
        • List 3
        +
      • +
      +
      + + +
        +
      • +

        Start Line

        +


        + +


        +

        End Line

        +
      • +
      + + + + + +

      1.

      +

      -

      +

      +

      +

      *

      + +
      + +

      1. not a list

      +

      - not a list

      +

      + not a list

      +

      * not a list

      diff --git a/plugin/commonmark/testdata/GoldenFiles/list.out.md b/plugin/commonmark/testdata/GoldenFiles/list.out.md new file mode 100644 index 0000000..ef9d811 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/list.out.md @@ -0,0 +1,153 @@ +A paragraph + +- 1 +- 2 +- - 3.1 + - 3.2 +- 4 Before + + - 4.1 + - 4.2 +- - 5.1 + + 5 After +- 6 Before + + 6 also Before + + - 6A.1 + + 6 Between + + - 6B.1 + + 6 After + + 6 also After +- 7 + +* * * + +And also other lists... + +- First +- Someone once said: + + > My famous quote + + \- someone + + + +09. Nine +10. Ten +11. 111. Eleven.A + 112. Eleven.B +12. Someone once said: + + > My famous quote + + \- someone +13. Thirteen + +List Item without Container + +* * * + + + +1. one +2. two + +* * * + + + +8. a +9. b + + + + + +09. a +10. b + +* * * + +- Before text after +- Before [text](/page) after + +* * * + +- A double `**` [can open strong emphasis](/page) + +* * * + +- List 1 + + + +- List 2 + + + + + +- List 3 + + + +- List 4 + +text between + +- List 5 + + + +- List 6 + + + +- List 7 + +* * * + +- - List 1 + + + + - List 2 + + + + - List 3 + + + +- Start Line + + End Line + + + +1\. + +\- + +\+ + +\* + +* * * + +1\. not a list + +\- not a list + +\+ not a list + +\* not a list \ No newline at end of file diff --git a/plugin/commonmark/testdata/GoldenFiles/metadata.in.html b/plugin/commonmark/testdata/GoldenFiles/metadata.in.html new file mode 100644 index 0000000..ac5b64e --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/metadata.in.html @@ -0,0 +1,55 @@ + + + + + + Page Title + + +

      Heading A

      + + + + +

      Heading B

      + +
      + +

      \a \* \\

      + +

      + .<name> + .< name >. + <name> +

      +

      + 2 > 1
      + 1 < 2
      + + A & B
      + A & B
      + &ouml; +

      + +

      + *not emphasized*
      + <br/> not a tag
      + [not a link](/foo)
      + `not code`
      + 1. not a list
      + * not a list
      + # not a heading
      + [foo]: /url "not a reference"
      + &ouml; not a character entity +

      + + +

      + Start Line +


      + +


      + End Line +

      + + diff --git a/plugin/commonmark/testdata/GoldenFiles/metadata.out.md b/plugin/commonmark/testdata/GoldenFiles/metadata.out.md new file mode 100644 index 0000000..8d20176 --- /dev/null +++ b/plugin/commonmark/testdata/GoldenFiles/metadata.out.md @@ -0,0 +1,29 @@ +#### Heading A + +#### Heading B + +* * * + +\\a \\* \\\\ + +.<name> .< name >. <name> + +2 > 1 +1 < 2 +A & B +A & B +&ouml; + +\*not emphasized* +<br/> not a tag +\[not a link](/foo) +\`not code\` +1\. not a list +\* not a list +\# not a heading +\[foo]: /url "not a reference" +&ouml; not a character entity + +Start Line + +End Line \ No newline at end of file diff --git a/plugin/commonmark/validation.go b/plugin/commonmark/validation.go new file mode 100644 index 0000000..2b766a9 --- /dev/null +++ b/plugin/commonmark/validation.go @@ -0,0 +1,99 @@ +package commonmark + +import ( + "fmt" + "strings" +) + +func contains(values []string, searchVal string) bool { + for _, val := range values { + if val == searchVal { + return true + } + } + return false +} + +type ValidateConfigError struct { + Key string + Value string + + // By default is "Key:Value" but can be + // overriden to e.g. 
"--key=value" + KeyWithValue string + + patternDescription string +} + +func (e *ValidateConfigError) setDefaultKeyWithValue() { + e.KeyWithValue = fmt.Sprintf("%s:%q", e.Key, e.Value) +} +func (e *ValidateConfigError) Error() string { + if e.KeyWithValue == "" { + e.setDefaultKeyWithValue() + } + + return fmt.Sprintf("invalid value for %s must be %s", e.KeyWithValue, e.patternDescription) +} + +func validateConfig(cfg *config) error { + if strings.Count(cfg.EmDelimiter, "_") != 1 && strings.Count(cfg.EmDelimiter, "*") != 1 { + return &ValidateConfigError{ + Key: "EmDelimiter", + Value: cfg.EmDelimiter, + patternDescription: `exactly 1 character of "*" or "_"`, + } + } + if strings.Count(cfg.StrongDelimiter, "_") != 2 && strings.Count(cfg.StrongDelimiter, "*") != 2 { + return &ValidateConfigError{ + Key: "StrongDelimiter", + Value: cfg.StrongDelimiter, + patternDescription: `exactly 2 characters of "**" or "__"`, + } + } + + if strings.Count(cfg.HorizontalRule, "*") < 3 && + strings.Count(cfg.HorizontalRule, "_") < 3 && + strings.Count(cfg.HorizontalRule, "-") < 3 { + return &ValidateConfigError{ + Key: "HorizontalRule", + Value: cfg.HorizontalRule, + patternDescription: `at least 3 characters of "*", "_" or "-"`, + } + } + + if !contains([]string{"-", "+", "*"}, cfg.BulletListMarker) { + return &ValidateConfigError{ + Key: "BulletListMarker", + Value: cfg.BulletListMarker, + patternDescription: `one of "-", "+" or "*"`, + } + } + + if !contains([]string{"```", "~~~"}, cfg.CodeBlockFence) { + return &ValidateConfigError{ + Key: "CodeBlockFence", + Value: cfg.CodeBlockFence, + patternDescription: "one of \"```\" or \"~~~\"", + } + } + + if !contains([]string{"atx", "setext"}, string(cfg.HeadingStyle)) { + return &ValidateConfigError{ + Key: "HeadingStyle", + Value: string(cfg.HeadingStyle), + patternDescription: `one of "atx" or "setext"`, + } + } + + possibleLinkStyles := []string{string(LinkInlined), string(LinkReferencedIndex), string(LinkReferencedShort)} + if 
!contains(possibleLinkStyles, string(cfg.LinkStyle)) { + return &ValidateConfigError{ + Key: "LinkStyle", + Value: string(cfg.LinkStyle), + patternDescription: `one of "inlined", "referenced_index" or "referenced_short"`, + } + } + + return nil +} diff --git a/plugin/commonmark/validation_test.go b/plugin/commonmark/validation_test.go new file mode 100644 index 0000000..1d1f12b --- /dev/null +++ b/plugin/commonmark/validation_test.go @@ -0,0 +1,88 @@ +package commonmark + +import ( + "fmt" + "testing" +) + +func TestValidateConfig_Empty(t *testing.T) { + cfg := fillInDefaultConfig(&config{}) + if cfg.HeadingStyle != "atx" { + t.Error("the config value was not filled with the default value") + } + + err := validateConfig(&cfg) + if err != nil { + t.Errorf("expected no error but got %+v", err) + } +} +func TestValidateConfig_Success(t *testing.T) { + cfg := fillInDefaultConfig(&config{ + HeadingStyle: "setext", + }) + if cfg.HeadingStyle != "setext" { + t.Error("the config value was overridden") + } + + err := validateConfig(&cfg) + if err != nil { + t.Errorf("expected no error but got %+v", err) + } +} +func TestValidateConfig_RandomValue(t *testing.T) { + cfg := fillInDefaultConfig(&config{ + HeadingStyle: "random", + }) + + err := validateConfig(&cfg) + if err == nil { + t.Error("expected an error") + } + e, ok := err.(*ValidateConfigError) + if !ok { + t.Error("expected an error of type ValidateConfigError") + } + if e.Key != "HeadingStyle" { + t.Errorf("expected a different value for 'key' but got %q", e.Key) + } + if e.Value != "random" { + t.Errorf("expected a different value for 'actual' but got %q", e.Value) + } + + formatted := err.Error() + if formatted != "invalid value for HeadingStyle:\"random\" must be one of \"atx\" or \"setext\"" { + t.Errorf("expected a different formatted message but got %q", formatted) + } +} + +func TestValidateConfig_KeyWithValue(t *testing.T) { + cfg := fillInDefaultConfig(&config{ + StrongDelimiter: "*", + }) + + err := 
validateConfig(&cfg) + if err == nil { + t.Error("expected an error") + } + e, ok := err.(*ValidateConfigError) + if !ok { + t.Fatal("expected an error of type ValidateConfigError") + } + + // The default error message for the golang api + formatted1 := err.Error() + expected1 := `invalid value for StrongDelimiter:"*" must be exactly 2 characters of "**" or "__"` + if formatted1 != expected1 { + t.Errorf("expected a different formatted message but got %q", formatted1) + } + + // The error message for the cli + if e.Key == "StrongDelimiter" { + e.KeyWithValue = fmt.Sprintf("--%s=%q", "strong_delimiter", e.Value) + } + formatted2 := err.Error() + expected2 := `invalid value for --strong_delimiter="*" must be exactly 2 characters of "**" or "__"` + if formatted2 != expected2 { + t.Errorf("expected a different formatted message but got %q", formatted2) + } +} diff --git a/plugin/strikethrough/strikethrough.go b/plugin/strikethrough/strikethrough.go new file mode 100644 index 0000000..bf0a8a1 --- /dev/null +++ b/plugin/strikethrough/strikethrough.go @@ -0,0 +1,103 @@ +package strikethrough + +import ( + "bytes" + "unicode" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/escape" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils" + "golang.org/x/net/html" +) + +type option func(p *strikethroughPlugin) + +func WithDelimiter(delimiter string) option { + return func(p *strikethroughPlugin) { + p.delimiter = delimiter + } +} + +type strikethroughPlugin struct { + delimiter string +} + +// Strikethrough converts ``, ``, and `` elements +func NewStrikethroughPlugin(opts ...option) converter.Plugin { + plugin := &strikethroughPlugin{} + for _, opt := range opts { + opt(plugin) + } + + if plugin.delimiter == "" { + plugin.delimiter = "~~" + } + + return plugin +} + +func (s 
*strikethroughPlugin) Init(conv *converter.Converter) error { + conv.Register.PreRenderer(s.handlePreRender, converter.PriorityStandard) + + conv.Register.EscapedChar('~') + conv.Register.UnEscaper(s.handleUnEscapers, converter.PriorityStandard) + + conv.Register.Renderer(s.handleRender, converter.PriorityStandard) + + return nil +} + +func (s *strikethroughPlugin) handlePreRender(ctx converter.Context, doc *html.Node) { + domutils.RemoveRedundant(doc, nameIsBothStrikethough) + domutils.MergeAdjacent(doc, nameIsStrikethough) +} + +func (s *strikethroughPlugin) handleUnEscapers(chars []byte, index int) int { + if chars[index] != '~' { + return -1 + } + + next := escape.GetNextAsRune(chars, index) + + nextIsWhitespace := unicode.IsSpace(next) || next == 0 + if nextIsWhitespace { + // "not followed by Unicode whitespace" + return -1 + } + + return 1 +} + +func nameIsStrikethough(node *html.Node) bool { + name := dom.NodeName(node) + + return name == "del" || name == "s" || name == "strike" +} +func nameIsBothStrikethough(a *html.Node, b *html.Node) bool { + return nameIsStrikethough(a) && nameIsStrikethough(b) +} + +func (s strikethroughPlugin) handleRender(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + if nameIsStrikethough(n) { + return s.renderStrikethrough(ctx, w, n) + } + + return converter.RenderTryNext +} +func (s strikethroughPlugin) renderStrikethrough(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + var buf bytes.Buffer + ctx.RenderChildNodes(ctx, &buf, n) + + content := buf.Bytes() + + // If there is a newline character between the start and end delimiter + // the delimiters won't be recognized. Either we remove all newline characters + // OR on _every_ line we put start & end delimiters. 
+ content = textutils.DelimiterForEveryLine(content, []byte(s.delimiter)) + + w.Write(content) + + return converter.RenderSuccess +} diff --git a/plugin/strikethrough/strikethrough_test.go b/plugin/strikethrough/strikethrough_test.go new file mode 100644 index 0000000..1698383 --- /dev/null +++ b/plugin/strikethrough/strikethrough_test.go @@ -0,0 +1,95 @@ +package strikethrough_test + +import ( + "bytes" + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/strikethrough" +) + +func TestNewStrikethroughPlugin(t *testing.T) { + runs := []struct { + desc string + input string + expected string + }{ + { + desc: "simple", + input: `

      Text

      `, + expected: `~~Text~~`, + }, + { + desc: "with spaces inside", + input: `

      Text

      `, + expected: `~~Text~~`, + }, + { + desc: "with spaces inside", + input: `

      ~~A~~B~~

      `, + expected: `~~\~\~A\~\~B\~\~~~`, + }, + { + desc: "nested", + input: `

      A B C

      `, + expected: `~~A B C~~`, + }, + { + desc: "adjacent", + input: `

      AB C

      `, + expected: `~~AB~~ ~~C~~`, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins(strikethrough.NewStrikethroughPlugin()), + ) + + out, err := conv.ConvertString(run.input) + if err != nil { + t.Error(err) + } + if out != run.expected { + t.Errorf("expected %q but got %q", run.expected, out) + } + }) + } +} +func TestWithDelimiter(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + strikethrough.NewStrikethroughPlugin( + strikethrough.WithDelimiter("=="), + ), + ), + ) + + input := `

      Text

      ` + expected := `==Text==` + + out, err := conv.ConvertString(input) + if err != nil { + t.Error(err) + } + if out != expected { + t.Errorf("expected %q but got %q", expected, out) + } +} + +func TestGoldenFiles(t *testing.T) { + goldenFileConvert := func(htmlInput []byte) ([]byte, error) { + conv := converter.NewConverter( + converter.WithPlugins( + commonmark.NewCommonmarkPlugin(), + strikethrough.NewStrikethroughPlugin(), + ), + ) + + return conv.ConvertReader(bytes.NewReader(htmlInput)) + } + + tester.GoldenFiles(t, goldenFileConvert, goldenFileConvert) +} diff --git a/plugin/strikethrough/testdata/.gitattributes b/plugin/strikethrough/testdata/.gitattributes new file mode 100644 index 0000000..a8d2daa --- /dev/null +++ b/plugin/strikethrough/testdata/.gitattributes @@ -0,0 +1,4 @@ + +# Leave the files untouched. Otherwise they might be +# changed when cloning the repo on Windows... +* -text diff --git a/plugin/strikethrough/testdata/GoldenFiles/strikethrough.in.html b/plugin/strikethrough/testdata/GoldenFiles/strikethrough.in.html new file mode 100644 index 0000000..67cf8c0 --- /dev/null +++ b/plugin/strikethrough/testdata/GoldenFiles/strikethrough.in.html @@ -0,0 +1,4 @@ +strikethrough content + +

      ~

      +

      *

      diff --git a/plugin/strikethrough/testdata/GoldenFiles/strikethrough.out.md b/plugin/strikethrough/testdata/GoldenFiles/strikethrough.out.md new file mode 100644 index 0000000..2815e2c --- /dev/null +++ b/plugin/strikethrough/testdata/GoldenFiles/strikethrough.out.md @@ -0,0 +1,5 @@ +~~strikethrough content~~ + +~ + +\* \ No newline at end of file