_extensions/format-md.py

# Convert *.md files to *.qmd files and pre-process them
# Randomize footnote identifiers in multiple Quarto files to avoid conflicts
# Convert reference-style links to inline links
# Remove line breaks within a straight Chinese quote

# Copyright: © 2024 Tom Ben
# License: MIT License

import re
import glob
import os
import random
import string


def get_md_files():
    # Get all *.md files
    return [f for f in glob.glob("contents/[0-9]*.md")]


def randomize_footnote_identifiers(qmd_content):
    # Find all existing footnote identifiers (numbers)
    existing_ids = set(re.findall(r'\[\^(\d+)\]', qmd_content))

    # Generate a unique random identifier for each existing footnote
    unique_ids = {}
    for id in existing_ids:
        # Generate a random string of 5 characters
        new_id = ''.join(random.choices(
            string.ascii_letters + string.digits, k=5))
        while new_id in unique_ids.values():
            new_id = ''.join(random.choices(
                string.ascii_letters + string.digits, k=5))
        unique_ids[id] = new_id

    # Replace all footnote references and definitions with new identifiers
    for old_id, new_id in unique_ids.items():
        qmd_content = re.sub(rf'\[\^{old_id}\]', f'[^{new_id}]', qmd_content)
        qmd_content = re.sub(rf'\[\^{old_id}\]:', f'[^{new_id}]:', qmd_content)

    return qmd_content


def convert_reference_to_inline(qmd_content):
    # Extract reference links
    reference_links = {}
    reference_pattern = re.compile(r'\n\[(\d+)\]:\s*(.*)')
    for match in reference_pattern.findall(qmd_content):
        reference_links[match[0]] = match[1]

    # Remove the reference link definitions from the qmd_content
    qmd_content = reference_pattern.sub('', qmd_content)

    # Replace reference-style link usages with inline links
    def replace_link(match):
        text = match.group(1)
        key = match.group(2)
        url = reference_links.get(key, '')
        return f'[{text}]({url})'

    usage_pattern = re.compile(r'\[(.*?)\]\[(\d+)\]')
    qmd_content = usage_pattern.sub(replace_link, qmd_content)

    return qmd_content


def remove_linebreaks_in_quotes(text):
    # Regular expression pattern to find blocks within single Chinese quotes
    pattern = r'「[^」]*?」'

    # Function to replace newlines in the found quoted text
    def replace_newlines(m):
        # Remove all newlines within the quote block
        return m.group(0).replace('\n', '')

    # Use re.sub to replace the newline characters in each match
    cleaned_text = re.sub(pattern, replace_newlines, text)

    return cleaned_text


def process_file(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Remove links with `[@]` and a space before it
    content = re.sub(r"\s*\[@\].*?[\]\)]", "", content)
    # Remove square brackets enclosing the caption
    content = re.sub(r"^\[(.*)\}\]$", r"\n  :\1}", content, flags=re.MULTILINE)
    # Merge multiple adjacent citations into one
    content = re.sub(r"\][\(\[].*?;\s*\[", "; ", content)
    # Replace '{{\<...\>}}' with '{{<...>}}'
    content = re.sub(r"\{\{\\<(.*)\\>}}", r"{{<\1>}}", content)
    # Remove comment blocks to avoid errors of Python filter
    content = re.sub(r"^```{=comment}.*?^```$", "",
                     content, flags=re.DOTALL | re.MULTILINE)

    # Randomize footnote identifiers
    content = randomize_footnote_identifiers(content)
    # Convert reference-style links to inline links
    content = convert_reference_to_inline(content)
    # Remove line breaks in quotes
    content = remove_linebreaks_in_quotes(content)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)


def main():
    md_files = get_md_files()
    # Convert *.md files to *.qmd files
    qmd_files = [f.replace(".md", ".qmd") for f in md_files]

    for md_file, qmd_file in zip(md_files, qmd_files):
        process_file(md_file, qmd_file)

    os.chdir('contents')
    qmd_files = glob.glob('*.qmd')

    for qmd_file in qmd_files:
        process_file(qmd_file, qmd_file)


if __name__ == "__main__":
    main()