-
Notifications
You must be signed in to change notification settings - Fork 37
/
journal_export.py
118 lines (83 loc) · 3.7 KB
/
journal_export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gzip
import tempfile
import boto
import unicodecsv
from sqlalchemy import sql
from app import db
OA_STATS_FILE = 'journal_open_access.csv.gz'
REPO_FILE = 'repositories.csv.gz'
REQUESTS_FILE = 'extension_requests.csv.gz'
ISSNS_FILE = 'crossref_issns.csv.gz'
def _write_oa_stats_csv():
# get oa stats by color
stats_fields = [
'issn_l', 'title', 'year',
'num_dois', 'num_open', 'open_rate',
'num_green', 'green_rate',
'num_bronze', 'bronze_rate',
'num_hybrid', 'hybrid_rate',
'num_gold', 'gold_rate',
]
csv_fields = stats_fields + ['is_in_doaj', 'is_gold_journal']
journal_stats_rows = db.engine.execute(
sql.text('''
select {stats_fields}, doaj.issn_l is not null as is_in_doaj, oa.issn_l is not null as is_gold_journal
from oa_rates_by_journal_year
left join oa_issn_l_years oa using (issn_l, year)
left join doaj_issn_l_years doaj using (issn_l, year)
where issn_l is not null and year is not null
'''.format(stats_fields=', '.join(stats_fields)))
).fetchall()
journal_dicts = [dict(list(zip(csv_fields, row))) for row in journal_stats_rows]
csv_filename = tempfile.mkstemp()[1]
with gzip.open(csv_filename, 'wb') as csv:
writer = unicodecsv.DictWriter(csv, csv_fields, dialect='excel', encoding='utf-8')
writer.writeheader()
for row in journal_dicts:
writer.writerow(row)
return csv_filename
def _write_repo_csv():
rows = db.engine.execute(sql.text('''
select issn_l, endpoint_id, r.repository_name, r.institution_name, r.home_page, e.pmh_url, num_articles
from num_articles_by_journal_repo
join endpoint e on e.id = endpoint_id
join repository r on e.repo_unique_id = r.id
''')).fetchall()
csv_filename = tempfile.mkstemp()[1]
with gzip.open(csv_filename, 'wb') as csv:
writer = unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(('issn_l', 'endpoint_id', 'repository_name', 'institution_name', 'home_page', 'pmh_url', 'num_articles'))
for row in rows:
writer.writerow(row)
return csv_filename
def _write_crossref_issn_csv():
issns = db.engine.execute(sql.text('select issn from crossref_issn')).fetchall()
csv_filename = tempfile.mkstemp()[1]
with gzip.open(csv_filename, 'wb') as csv:
writer = unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(['issn'])
for issn in issns:
writer.writerow(issn)
return csv_filename
def _write_requests_csv():
rows = db.engine.execute(sql.text('select month, issn_l, requests from extension_journal_requests_by_month')).fetchall()
csv_filename = tempfile.mkstemp()[1]
with gzip.open(csv_filename, 'wb') as csv:
writer = unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(('month', 'issn_l', 'requests'))
for row in rows:
writer.writerow(row)
return csv_filename
def _upload_journal_file(filename, object_name):
s3 = boto.connect_s3(host='s3-us-west-2.amazonaws.com')
bucket = s3.get_bucket('unpaywall-journal-csv')
boto.s3.key.Key(bucket=bucket, name=object_name).set_contents_from_filename(filename, replace=True)
def get_journal_file_key(filename):
s3 = boto.connect_s3()
bucket = s3.get_bucket('unpaywall-journal-csv')
return bucket.lookup(filename)
if __name__ == "__main__":
_upload_journal_file(_write_oa_stats_csv(), OA_STATS_FILE)
_upload_journal_file(_write_repo_csv(), REPO_FILE)
_upload_journal_file(_write_requests_csv(), REQUESTS_FILE)
_upload_journal_file(_write_crossref_issn_csv(), ISSNS_FILE)