# load packages
import openai
import requests
import json
import time
from bs4 import BeautifulSoup
from pprint import pprint
######################################################################################
############################### fetch_details ########################################
######################################################################################
# define fetching function based on msg number
def fetch_details(msg_number, headers, timeout, retries):
    # Initialize an empty dictionary to hold the details
    details = {}
    # Add the message number to the dictionary
    details['id'] = msg_number
    # Generate the full URL of the detailed page
    url = f"https://www.mail-archive.com/nmusers@globomaxnm.com/msg{msg_number}.html"
    for attempt in range(retries):
        try:
            # Fetch the HTML content from the URL
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise errors for 4xx and 5xx responses
            page_content = response.text
            # Initialize a BeautifulSoup object and specify the parser
            soup = BeautifulSoup(page_content, 'html.parser')
            # Locate the thread listing for additional thread details
            thread_section = soup.select_one('div.tSliceList ul.icons')
            # The message is standalone if the thread list is missing or empty
            details['is_standalone'] = len(thread_section.find_all('li', recursive=False)) == 0 if thread_section else True
            # Extract message IDs within the thread
            details['thread_message_ids'] = []
            if not details['is_standalone']:
                thread_msgs = thread_section.select('a[href^="msg"]')
                details['thread_message_ids'] = [msg['href'].replace('msg', '').replace('.html', '') for msg in thread_msgs]
                # Append this message's own id to thread_message_ids
                details['thread_message_ids'].append(msg_number)
                # Sort message ids in ascending order
                details['thread_message_ids'] = sorted(details['thread_message_ids'], key=int)
            # Extract the date
            date_tag = soup.select_one('span.date a')
            if date_tag:
                details['date'] = date_tag.text.strip()
            # Extract the subject
            subject_tag = soup.select_one('span.subject span[itemprop="name"]')
            if subject_tag and subject_tag.text.strip():
                details['subject'] = subject_tag.text.strip()
            else:
                details['subject'] = "no title"
            # Extract the author
            author_tag = soup.select_one('span.sender span[itemprop="name"]')
            if author_tag:
                details['author'] = author_tag.text.strip()
            # Extract the message text
            message_tag = soup.select_one('div.msgBody')
            if message_tag:
                details['message'] = message_tag.text.strip()
            # If everything worked out, return the details
            return details
        except requests.RequestException as e:  # Catches any request-related exceptions
            if attempt < retries - 1:  # Not the last attempt
                time.sleep(5)  # Delay for 5 seconds before the next retry
                continue  # Retry on the next iteration of the loop
            else:
                print(f"Failed to fetch details for msg{msg_number} after {retries} attempts. Error: {str(e)}")
                details['message'] = "Scraping failed"
                details['is_standalone'] = False
                details['thread_message_ids'] = []
                details['date'] = "Scraping failed"
                details['subject'] = "Scraping failed"
                details['author'] = "Scraping failed"
                return details
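# Example usage (a minimal sketch; the message number and header values below
# are illustrative assumptions, not values taken from this repository):
#
#   details = fetch_details(
#       msg_number="123456",
#       headers={"User-Agent": "Mozilla/5.0"},
#       timeout=10,
#       retries=3,
#   )
#   print(details["subject"], details["date"])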
######################################################################################
############################### create_discussion ####################################
######################################################################################
# define function to create a discussion in a GitHub repo
def create_discussion(api_token, repository_id, category_id, date, author, title, body):
    # define url
    url = 'https://api.github.com/graphql'
    # define additional information (pre)
    add_inf_pre = f'**Date:** {date}\n**Author:** {author}\n\n'
    # define additional information (post)
    add_inf_post = '\n\n---\n\n*Please note: this discussion was automatically created via web scraping of the nmusers mail archive. If you have any questions, please contact the original author of this message. If you are the original author and want your message deleted, you can contact the maintainer at any time.*'
    # add additional information to body
    body = add_inf_pre + body + add_inf_post
    # define query; GraphQL variables are used so quotes and newlines in the
    # body or title cannot break the query string
    query = '''mutation($repositoryId: ID!, $categoryId: ID!, $body: String!, $title: String!) {
      createDiscussion(input: {repositoryId: $repositoryId, categoryId: $categoryId, body: $body, title: $title}) {
        discussion { id }
      }
    }'''
    # define json payload
    json_payload = {
        'query': query,
        'variables': {'repositoryId': repository_id, 'categoryId': category_id, 'body': body, 'title': title}
    }
    # define headers
    headers = {'Authorization': f'token {api_token}'}
    # post request
    r = requests.post(url=url, json=json_payload, headers=headers)
    # return response text parsed as a dict
    return json.loads(r.text)
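# Example usage (a sketch with hypothetical node IDs; the repository and
# category IDs can be looked up via the GraphQL API, and a personal access
# token with discussion permissions is assumed):
#
#   response = create_discussion(
#       api_token="ghp_...",
#       repository_id="R_kgDO...",
#       category_id="DIC_kwDO...",
#       date="Mon, 01 Jan 2024",
#       author="Jane Doe",
#       title="Example question",
#       body="Message text goes here.",
#   )
#   discussion_id = response["data"]["createDiscussion"]["discussion"]["id"]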
#######################################################################################
############################ add_comment_to_discussion ################################
#######################################################################################
# Function to add a comment to an existing discussion in GitHub
def add_comment_to_discussion(api_token, discussion_id, body, date, author, silent):
    # Define GraphQL URL
    url = 'https://api.github.com/graphql'
    # Define additional information (pre)
    add_inf_pre = f'**Date:** {date}\n**Author:** {author}\n\n'
    # Define additional information (post)
    add_inf_post = '\n\n---\n\n*Please note: this discussion was automatically created via web scraping of the nmusers mail archive. If you have any questions, please contact the original author of this message. If you are the original author and want your message deleted, you can contact the maintainer at any time.*'
    # Add additional information to body
    body = add_inf_pre + body + add_inf_post
    # Define GraphQL mutation; variables keep quotes and newlines in the body
    # from breaking the query string
    query = '''mutation($discussionId: ID!, $body: String!) {
      addDiscussionComment(input: {discussionId: $discussionId, body: $body}) {
        comment { id }
      }
    }'''
    # Define JSON payload
    json_payload = {'query': query, 'variables': {'discussionId': discussion_id, 'body': body}}
    # Define headers
    headers = {'Authorization': f'token {api_token}'}
    # Make POST request
    r = requests.post(url=url, json=json_payload, headers=headers)
    # Print response if not silent
    if not silent:
        print(r.text + "\n")
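# Example usage (a sketch with a hypothetical discussion node ID, e.g. one
# returned by create_discussion above):
#
#   add_comment_to_discussion(
#       api_token="ghp_...",
#       discussion_id=discussion_id,
#       body="Reply text goes here.",
#       date="Tue, 02 Jan 2024",
#       author="John Doe",
#       silent=False,
#   )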
#######################################################################################
################################ extract_threads ######################################
#######################################################################################
# Function to extract the threads from a dictionary of messages
def extract_threads(messages_dict):
    thread_dict = {}
    thread_id = 1  # Starting thread ID
    # Keep track of messages that are already part of a thread
    seen_messages = set()
    # Loop through each message
    for msg_id, msg in messages_dict.items():
        # Skip this message if it's already part of a thread
        if msg_id in seen_messages:
            continue
        # Extract the thread message IDs using .get() with a default empty list
        thread_message_ids = msg.get('thread_message_ids', [])
        # Add the thread
        thread_dict[thread_id] = {
            'ids': thread_message_ids if thread_message_ids else None,
            'category': None,  # placeholder
            'labels': None  # placeholder
        }
        thread_id += 1
        # Mark all messages in this thread as seen
        seen_messages.update(thread_message_ids)
    return thread_dict
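# Example usage (a minimal sketch with made-up message ids; real entries would
# be full detail dicts from fetch_details, trimmed here for illustration):
#
#   msgs = {
#       "100": {"thread_message_ids": ["100", "101"]},
#       "101": {"thread_message_ids": ["100", "101"]},
#       "102": {"thread_message_ids": []},
#   }
#   threads = extract_threads(msgs)
#   # -> {1: {'ids': ['100', '101'], ...}, 2: {'ids': None, ...}}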
#######################################################################################
############################### fetch_missing_messages ################################
#######################################################################################
# Function to fetch messages referenced in thread_dict but missing from the msg dictionary
def fetch_missing_messages(thread_dict, msg, headers, timeout, retries, first_only=False):
    for thread_id, thread_info in thread_dict.items():
        msg_ids = thread_info['ids']
        # Skip threads without any message ids (e.g. standalone messages)
        if not msg_ids:
            continue
        if first_only:
            msg_ids = [msg_ids[0]]
        for msg_id in msg_ids:
            if msg_id not in msg:
                # Print statement
                print("fetch_missing_messages: fetching message " + msg_id)
                # Fetch the missing message details
                fetched_msg = fetch_details(
                    msg_number=msg_id,
                    headers=headers,
                    timeout=timeout,
                    retries=retries
                )
                # Add the fetched message to the msg dictionary
                msg[msg_id] = fetched_msg
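# Example usage (a sketch; assumes msgs and threads were built as in the
# examples above, with the same illustrative header values):
#
#   fetch_missing_messages(
#       thread_dict=threads,
#       msg=msgs,
#       headers={"User-Agent": "Mozilla/5.0"},
#       timeout=10,
#       retries=3,
#       first_only=False,
#   )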
#######################################################################################
############################### delete_discussion #####################################
#######################################################################################
# Function to delete a discussion by its ID
def delete_discussion(api_token, discussion_id):
    # Define GraphQL URL
    url = 'https://api.github.com/graphql'
    # Define GraphQL mutation for deleting a discussion
    query = f'''mutation{{
      deleteDiscussion(input: {{ id: "{discussion_id}" }}) {{
        clientMutationId
      }}
    }}'''
    # Define JSON payload
    json_payload = {'query': query}
    # Define headers
    headers = {'Authorization': f'token {api_token}'}
    # Make POST request
    r = requests.post(url=url, json=json_payload, headers=headers)
    # Check for errors; the GraphQL API returns 200 even for failed mutations,
    # so inspect the response body as well
    if r.status_code == 200 and 'errors' not in r.json():
        print(f'Successfully deleted discussion with ID {discussion_id}.')
    else:
        print(f'Failed to delete discussion with ID {discussion_id}. Error: {r.text}')
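# Example usage (hypothetical discussion node ID):
#
#   delete_discussion(api_token="ghp_...", discussion_id="D_kwDO...")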
#######################################################################################
############################### list_all_discussions ##################################
#######################################################################################
# Function to list discussions by repository ID (fetches the first 100 only;
# pagination would be needed beyond that)
def list_all_discussions(api_token, repository_id):
    # Define GraphQL URL
    url = 'https://api.github.com/graphql'
    # Define GraphQL query for fetching discussions
    query = f'''
    query {{
        node(id: "{repository_id}") {{
            ... on Repository {{
                discussions(first: 100) {{
                    nodes {{
                        id
                        title
                    }}
                }}
            }}
        }}
    }}
    '''
    # Define JSON payload
    json_payload = {'query': query}
    # Define headers
    headers = {'Authorization': f'token {api_token}'}
    # Make POST request
    r = requests.post(url=url, json=json_payload, headers=headers)
    # Parse the JSON response
    discussions_data = json.loads(r.text)
    # Extract discussion ids and titles
    discussions = [(d['id'], d['title']) for d in discussions_data['data']['node']['discussions']['nodes']]
    return discussions
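# Example usage (hypothetical repository node ID):
#
#   for discussion_id, title in list_all_discussions("ghp_...", "R_kgDO..."):
#       print(discussion_id, title)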
#######################################################################################
############################### get_chat_completion ###################################
#######################################################################################
# Function to classify a message into one of the given categories via the OpenAI chat API
def get_chat_completion(api_key, categories, message_text, model):
    # Set OpenAI API key
    openai.api_key = api_key
    # Construct the messages
    messages = [
        {"role": "system", "content": f"You are a machine which classifies emails by content. You may answer with exactly one of these {len(categories)} categories: {', '.join(categories)}. Your output is used directly as the category string, so stick exactly to the categories provided and do not add anything, not even a trailing period, or the program will fail. Classify webinars as Announcements."},
        {"role": "user", "content": message_text}
    ]
    # Make the API request (pre-1.0 openai package interface)
    chat_completion = openai.ChatCompletion.create(
        model=model,
        messages=messages
    )
    # Extract and return the content
    return chat_completion["choices"][0]["message"]["content"]
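# Example usage (a sketch; the category names and model are illustrative
# assumptions, and the call relies on the pre-1.0 openai package as above):
#
#   category = get_chat_completion(
#       api_key="sk-...",
#       categories=["Questions", "Announcements", "Jobs", "Other"],
#       message_text="We are happy to announce a webinar on ...",
#       model="gpt-3.5-turbo",
#   )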
###########################################################################################
############################### add_labels_to_discussion ##################################
###########################################################################################
# Function to add a set of labels to an existing discussion in GitHub
def add_labels_to_discussion(api_token, discussion_id, labels):
    # Define GraphQL URL
    url = 'https://api.github.com/graphql'
    # Convert the list of label node IDs to GraphQL array format
    graphql_labels = json.dumps(labels)
    # Define GraphQL mutation for adding labels
    query = f'''
    mutation{{
      addLabelsToLabelable(input: {{ labelableId: "{discussion_id}", labelIds: {graphql_labels} }}) {{
        clientMutationId
      }}
    }}
    '''
    # Define JSON payload
    json_payload = {'query': query}
    # Define headers
    headers = {'Authorization': f'token {api_token}'}
    # Make POST request
    r = requests.post(url=url, json=json_payload, headers=headers)
    # Check for errors; the GraphQL API returns 200 even for failed mutations,
    # so inspect the response body as well
    if r.status_code == 200 and 'errors' not in r.json():
        print(f'Successfully added labels to discussion with ID {discussion_id}.')
    else:
        print(f'Failed to add labels to discussion with ID {discussion_id}. Error: {r.text}')
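# Example usage (hypothetical label node IDs; note these are label IDs, not
# label names, and can be fetched via the repository's labels connection):
#
#   add_labels_to_discussion(
#       api_token="ghp_...",
#       discussion_id="D_kwDO...",
#       labels=["LA_kwDO...", "LA_kwDO..."],
#   )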