-
Notifications
You must be signed in to change notification settings - Fork 0
/
format-md.py
125 lines (93 loc) · 4.05 KB
/
format-md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Convert *.md files to *.qmd files and pre-process them
# Randomize footnote identifiers in multiple Quarto files to avoid conflicts
# Convert reference-style links to inline links
# Remove line breaks inside Chinese corner-bracket quotes (「…」)
# Copyright: © 2024 Tom Ben
# License: MIT License
import re
import glob
import os
import random
import string
def get_md_files(pattern="contents/[0-9]*.md"):
    """Return the Markdown source files to convert, in sorted order.

    Args:
        pattern: Glob pattern selecting the files. The default matches
            ``*.md`` files in ``contents/`` whose names start with a digit.

    Returns:
        A sorted list of matching paths; empty when nothing matches.
    """
    # glob.glob() already returns a list, but in arbitrary order. Sorting
    # makes the processing order reproducible across runs and platforms.
    return sorted(glob.glob(pattern))
def randomize_footnote_identifiers(qmd_content):
    """Replace numeric footnote labels with random 5-character labels.

    Every ``[^<digits>]`` occurrence (references and definitions alike) is
    rewritten so that the same old number always maps to the same new label
    and different numbers map to different labels. Non-numeric labels such
    as ``[^note]`` are left untouched.

    Args:
        qmd_content: Document text to rewrite.

    Returns:
        The text with numeric footnote labels replaced.
    """
    alphabet = string.ascii_letters + string.digits
    # Labels already present in the document (numeric or not); a freshly
    # generated label must not collide with any of them.
    taken = set(re.findall(r'\[\^([^\]\s]+)\]', qmd_content))
    new_labels = {}

    def relabel(match):
        old_label = match.group(1)
        if old_label not in new_labels:
            candidate = ''.join(random.choices(alphabet, k=5))
            while candidate in taken:
                candidate = ''.join(random.choices(alphabet, k=5))
            taken.add(candidate)
            new_labels[old_label] = candidate
        return f'[^{new_labels[old_label]}]'

    # A single pass rewrites references and definitions together: a
    # definition is just the same ``[^n]`` token followed by a colon.
    return re.sub(r'\[\^(\d+)\]', relabel, qmd_content)
def convert_reference_to_inline(qmd_content):
    """Convert numbered reference-style links to inline links.

    Definitions of the form ``[<digits>]: <url>`` that start on a new line
    are collected and removed from the text; each usage ``[text][<digits>]``
    whose number has a definition becomes ``[text](<url>)``.

    Args:
        qmd_content: Document text to rewrite.

    Returns:
        The text with definitions removed and resolved usages inlined.
    """
    definition_pattern = re.compile(r'\n\[(\d+)\]:\s*(.*)')
    usage_pattern = re.compile(r'\[(.*?)\]\[(\d+)\]')

    # findall() yields (key, url) pairs; if a key is defined twice the
    # later definition wins.
    reference_links = dict(definition_pattern.findall(qmd_content))

    # Remove the reference link definitions from the content
    qmd_content = definition_pattern.sub('', qmd_content)

    def replace_link(match):
        url = reference_links.get(match.group(2))
        if url is None:
            # No such definition: this is not a resolvable link (it may be
            # e.g. an index expression like ``arr[i][0]``), so keep the
            # original text instead of emitting an empty ``[text]()`` link.
            return match.group(0)
        return f'[{match.group(1)}]({url})'

    return usage_pattern.sub(replace_link, qmd_content)
def remove_linebreaks_in_quotes(text):
    """Join the lines inside Chinese corner-bracket quotes (「…」).

    Args:
        text: Document text to clean.

    Returns:
        The text with line breaks removed from every ``「…」`` span. Text
        outside such spans, including an opening ``「`` that is never
        closed, is returned unchanged.
    """
    def join_lines(match):
        # Drop carriage returns as well as newlines so CRLF text does not
        # keep a stray '\r' in the middle of the quote.
        return match.group(0).replace('\r', '').replace('\n', '')

    # The negated character class already stops at the first closing quote.
    return re.sub(r'「[^」]*」', join_lines, text)
def process_file(input_file, output_file):
    """Read a file, apply all pre-processing steps, and write the result.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path to write the processed text to. May be the same
            path as ``input_file``, in which case the file is rewritten.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Remove links with `[@]` and a space before it
    content = re.sub(r"\s*\[@\].*?[\]\)]", "", content)
    # Remove square brackets enclosing the caption
    content = re.sub(r"^\[(.*)\}\]$", r"\n :\1}", content, flags=re.MULTILINE)
    # Merge multiple adjacent citations into one
    content = re.sub(r"\][\(\[].*?;\s*\[", "; ", content)
    # Replace '{{\<...\>}}' with '{{<...>}}'. The group is non-greedy so
    # that each shortcode on a line is un-escaped separately; a greedy
    # match would span from the first '{{\<' to the last '\>}}'.
    content = re.sub(r"\{\{\\<(.*?)\\>}}", r"{{<\1>}}", content)
    # Remove comment blocks to avoid errors of Python filter
    content = re.sub(r"^```{=comment}.*?^```$", "",
                     content, flags=re.DOTALL | re.MULTILINE)

    # Randomize footnote identifiers
    content = randomize_footnote_identifiers(content)
    # Convert reference-style links to inline links
    content = convert_reference_to_inline(content)
    # Remove line breaks in quotes
    content = remove_linebreaks_in_quotes(content)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)
def main():
    """Convert the selected ``*.md`` files to ``*.qmd`` and process them.

    First writes a processed ``.qmd`` next to every file returned by
    ``get_md_files()``. Then changes the working directory to ``contents``
    and processes every ``*.qmd`` file there in place.
    """
    # Convert *.md files to *.qmd files
    for md_file in get_md_files():
        # Swap only the extension; str.replace(".md", ".qmd") would also
        # rewrite any other ".md" that occurs earlier in the path.
        qmd_file = os.path.splitext(md_file)[0] + ".qmd"
        process_file(md_file, qmd_file)

    # NOTE: this changes the process-wide working directory and leaves it
    # changed after main() returns.
    os.chdir('contents')
    for qmd_file in glob.glob('*.qmd'):
        process_file(qmd_file, qmd_file)


if __name__ == "__main__":
    main()