-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort_by_date.py
80 lines (65 loc) · 2.91 KB
/
sort_by_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Parsing and ordering all paper entries based on the extracted file content
import pandas as pd
import calendar
import re
# Load the whole Markdown paper list into memory as a single string
with open("add_paper_here.md", "r", encoding="utf-8") as source_file:
    sample_input = source_file.read()
def parse_date_with_defaults(date_str):
    """Parse a free-form date string, filling in defaults for missing parts.

    The string is first handed to ``pd.to_datetime`` with ``errors='coerce'``.
    That call reports failure by returning ``NaT`` rather than raising, so the
    fallbacks below are triggered by checking the result, not by catching
    ``ValueError``.

    Fallbacks for text pandas cannot parse directly:

    1. A 20xx year plus an English month name or abbreviation resolves to the
       last day of that month.
    2. A 20xx year alone resolves to December 31 of that year.
    3. Anything else resolves to ``pd.NaT``.

    Args:
        date_str: The date text to interpret. Non-string inputs are passed
            straight through to ``pd.to_datetime`` and its result is returned
            unchanged.

    Returns:
        A ``pd.Timestamp`` when a date could be determined, otherwise
        ``pd.NaT`` (or whatever ``pd.to_datetime`` returned for non-string
        input).
    """
    parsed = pd.to_datetime(date_str, errors='coerce')
    # The textual fallbacks only make sense for strings that pandas rejected.
    if not isinstance(date_str, str) or not pd.isna(parsed):
        return parsed

    # Without a recognisable year there is nothing to anchor a default on.
    year_match = re.search(r"\b(20\d{2})\b", date_str)
    if year_match is None:
        return pd.NaT
    year = int(year_match.group(1))

    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )
    abbreviations = [name[:3] for name in month_names]
    # Full names are listed before abbreviations so the longest form wins.
    month_pattern = (
        r"\b(" + "|".join(month_names) + "|sept|" + "|".join(abbreviations) + r")\b"
    )
    month_match = re.search(month_pattern, date_str.lower())
    if month_match is None:
        # Year only: place the entry at the very end of that year.
        return pd.Timestamp(year=year, month=12, day=31)

    # Every accepted spelling starts with the three-letter abbreviation.
    month = abbreviations.index(month_match.group(1)[:3]) + 1
    last_day = calendar.monthrange(year, month)[1]
    return pd.Timestamp(year=year, month=month, day=last_day)
# Regex for one paper entry in the Markdown list. The nine capture groups are,
# in order: title, link, authors, institutions, date, publisher, env (text
# inside the square brackets), keywords and TLDR.
# re.DOTALL lets "." cross newlines, and every field is matched lazily up to the
# next labelled bullet. The TLDR runs to the first newline or to the end of the
# input, so the last entry is still captured when the file has no trailing
# newline.
new_format_pattern = re.compile(
    r"- \[(.*?)\]\((.*?)\)\s+"
    r"- (.*?)\s+"
    r"- 🏛️ Institutions: (.*?)\s+"
    r"- 📅 Date: (.*?)\s+"
    r"- 📑 Publisher: (.*?)\s+"
    r"- 💻 Env: \[(.*?)\]\s+"
    r"- 🔑 Key: (.*?)\s+"
    r"- 📖 TLDR: (.*?)(?:\n|\Z)",
    re.DOTALL
)
# Turn every regex match into one row tuple, adding a parsed date for sorting
parsed_entries = []
for captured in new_format_pattern.findall(sample_input):
    (title, link, authors, institutions,
     date, publisher, env, keywords, tldr) = captured
    # Keywords are split on commas, trimmed, and re-joined with ", ";
    # no brackets are added here.
    keyword_items = [item.strip() for item in keywords.split(",")]
    entry = (
        title,
        link,
        authors,
        institutions,
        date,  # original date text, written back unchanged later
        parse_date_with_defaults(date),  # used only as the sort key
        publisher,
        f"[{env.strip()}]",  # the pattern captured the text inside [...]
        ", ".join(keyword_items),
        tldr,
    )
    parsed_entries.append(entry)
# Build the table of papers, keeping only the first occurrence of each title
papers_df = pd.DataFrame(parsed_entries, columns=[
    'Title', 'Link', 'Authors', 'Institutions', 'Original Date', 'Parsed Date', 'Publisher', 'Env', 'Keywords', 'TLDR'
]).drop_duplicates(subset='Title', keep='first')
# Newest first. A stable algorithm keeps papers that share a date in their
# existing file order, so re-running the script does not shuffle them
# (the default quicksort gives no such guarantee).
papers_df = papers_df.sort_values(by='Parsed Date', ascending=False, kind='mergesort')
# Render each sorted row back into the Markdown entry layout, using the
# original date text rather than the parsed timestamp
sorted_markdown = [
    "".join((
        f"- [{paper['Title']}]({paper['Link']})\n",
        f" - {paper['Authors']}\n",
        f" - 🏛️ Institutions: {paper['Institutions']}\n",
        f" - 📅 Date: {paper['Original Date']}\n",
        f" - 📑 Publisher: {paper['Publisher']}\n",
        f" - 💻 Env: {paper['Env']}\n",
        f" - 🔑 Key: {paper['Keywords']}\n",
        f" - 📖 TLDR: {paper['TLDR']}\n",
    ))
    for _, paper in papers_df.iterrows()
]
# Combine the rendered entries, separated by a blank line
final_output = "\n".join(sorted_markdown)
print(papers_df)
# The input file is overwritten in place. If nothing was parsed (for example
# because the entries do not follow the expected layout), writing would replace
# the file with an empty string, so the write is skipped instead.
if sorted_markdown:
    with open("add_paper_here.md", "w", encoding="utf-8") as file:
        file.write(final_output)
else:
    print("No paper entries were parsed; leaving add_paper_here.md unchanged.")