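"""Build a rights-checking spreadsheet from Google Scholar alert emails.

Reads unread Gmail messages under the "IR" label, splits each alert email
into individual citations, extracts author, title, journal, and URL, looks
up the journal's archiving policy via the SHERPA/RoMEO v2 API, and writes
the collected rows to output.csv.
"""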
from __future__ import print_function
import pickle
import os.path
import base64
import csv
import requests
import settings
from datetime import datetime
from titlecase import titlecase
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from bs4 import BeautifulSoup

# If modifying these scopes, delete the file token.pickle.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

def main():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists("token.pickle"):
        with open("token.pickle", "rb") as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open("token.pickle", "wb") as token:
            pickle.dump(creds, token)
    service = build("gmail", "v1", credentials=creds)
    # Call the Gmail API
    results = (
        service.users().messages().list(userId="me", q="in:IR is:unread").execute()
    )
    messages = results.get("messages", [])
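    # Note (assumption): messages.list() returns only the first page of ids
    # (up to 100 by default); a nextPageToken loop would be needed if the
    # label ever holds more unread alerts than that.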
    # handle the data
    output_data = []
    # loop through the email messages
    for message in messages:
        data = service.users().messages().get(userId="me", id=message["id"]).execute()
        try:
            data_b = base64.urlsafe_b64decode(data["payload"]["body"]["data"])
        except KeyError:
            # multipart messages keep the body in the first MIME part instead
            data_b = base64.urlsafe_b64decode(data["payload"]["parts"][0]["body"]["data"])
        data_s = data_b.decode("utf-8")
        # because there may be multiple items per email, split it up into chunks
        data_split = data_s.split(
            '<h3 style="font-weight:normal;margin:0;font-size:17px;line-height:20px;">'
        )
        for item in data_split[1:]:
            soup = BeautifulSoup(item, features="html.parser")
            # fetch the title
            title = soup.find("a", class_="gse_alrt_title").text
            title = titlecase(title)
            # fetch the url
            url_soup = soup.find("a", class_="gse_alrt_title", href=True)
            url = url_soup["href"]
            # fetch the author and publication
            auth_and_pub = str(soup.select("div[style='color:#006621']")[0])
            clean_auth_and_pub = BeautifulSoup(auth_and_pub, "lxml").text
            spl1 = clean_auth_and_pub.split("\xa0- ")
            if len(spl1) == 1:
                spl2 = spl1[0].split(" - ")
            else:
                spl2 = spl1
            author = spl2[0] if 0 < len(spl2) else ""
            journal = spl2[1] if 1 < len(spl2) else ""
            # a bare year in the journal slot means no journal was given;
            # otherwise strip a trailing ", YYYY" from the journal name
            if journal in ["2019", "2020", "2021", "2022", "2023", "2024", "2025"]:
                journal = ""
            else:
                journal = titlecase(str(journal))
                if journal[-6:] in [
                    ", 2019",
                    ", 2020",
                    ", 2021",
                    ", 2022",
                    ", 2023",
                    ", 2024",
                    ", 2025",
                ]:
                    journal = journal[:-6]
            rights = fetch_rights(journal)
            # append the data
            output_data.append((datetime.now(), author, title, journal, url, rights))
    return output_data
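
# A parsed row is a six-tuple; illustrative values only:
#   (datetime(2022, 12, 1, 9, 30), "J Smith", "An Example Title",
#    "Journal of Examples", "https://example.org/paper", "https://.../policy\n")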

def fetch_rights(journal):
    if journal == "":
        return ""
    requests_url = (
        "https://v2.sherpa.ac.uk/cgi/retrieve/cgi/retrieve?item-type=publication&api-key="
        + settings.romeo2
        + "&format=Json&limit=5&filter=[[%22title%22,%22equals%22,%22"
        + journal
        + "%22]]"
    )
    print(requests_url)
    resp = requests.get(requests_url)
    try:
        print(resp.status_code)
        data = resp.json()
        policies = data["items"][0]["publisher_policy"]
        policy_urls = ""
        for policy in policies:
            for urls in policy["urls"]:
                url = urls["url"]
                policy_urls = url + "\n" + policy_urls
        return policy_urls
    except (ValueError, KeyError, IndexError):
        # no match, malformed JSON, or a policy without URLs: no rights info
        return ""

def make_csv(data):
    """Make a csv file with all the data."""
    with open("output.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(
            [
                "Date",
                "Prof",
                "Article",
                "Journal",
                "URL",
                "Allowed?",
                "Emailed?",
                "Status",
            ]
        )
        # data rows carry six fields, so the last two columns ("Emailed?",
        # "Status") start out blank for manual tracking in the spreadsheet
        writer.writerows(data)


if __name__ == "__main__":
    data = main()
    make_csv(data)
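
# Assumed setup, inferred from the imports and file references above:
#   - credentials.json: OAuth client secrets from the Google Cloud Console
#   - settings.py defining romeo2, a SHERPA/RoMEO v2 API key
#   - pip install google-api-python-client google-auth-oauthlib requests
#     beautifulsoup4 lxml titlecase
# Run with: python make-spreadsheet.py
# (token.pickle and output.csv are created in the working directory)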