-
Notifications
You must be signed in to change notification settings - Fork 0
/
uploader.py
303 lines (276 loc) · 13.8 KB
/
uploader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt
import os
import re
import subprocess
import sys
import time
import urllib
import urllib2
from xml.sax.saxutils import quoteattr
import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines()]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
return config
def usage():
    """Print the command-line help text for this script."""
    print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
    """Append a 'wiki;dump;status' line to this list's upload log.

    The leading newline matches the existing log format (the module-level
    reader strips surrounding whitespace before splitting lines).
    """
    # 'with' closes the handle even if write() raises, unlike open/close.
    with open('uploader-%s.log' % (listfile), 'a') as f:
        f.write('\n%s;%s;%s' % (wiki, dump, msg))
def upload(wikis, config={}):
    """Upload each wiki's dump archives to its archive.org item.

    wikis: list of api.php URLs. config: dict from getParameters().
    Side effects: shells out to curl/md5sum/rm, appends to the upload log,
    and mutates the module-level uploadeddumps list.
    """
    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        # Derive the dump filename prefix (and item name) from the api URL.
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        wikiname = prefix.split('-')[0]
        dumps = []
        # Scan only the current directory (break after the '.' entry) for
        # this wiki's -wikidump.7z / -history.xml.7z archives.
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break
        c = 0  # counts dumps handled for this wiki; metadata headers only on the first
        for dump in dumps:
            wikidate = dump.split('-')[1]  # filename format: name-YYYYMMDD-...
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    # md5sum prints "<hash>  <filename>"; strip everything after the hash.
                    dumphash = re.sub(' +.+\n?', '', stdout)
                    headers = {'User-Agent': dumpgenerator.getUserAgent()}
                    itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
                    if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                        # Hash found in the item metadata: safe to delete locally.
                        log(wiki, dump, 'verified')
                        rmline='rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue
            time.sleep(0.1)
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump
            # Does the item exist already? The metadata endpoint returns '{}'
            # for a missing item.
            headers = {'User-Agent': dumpgenerator.getUserAgent()}
            itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
            if urllib2.urlopen(itemdata).read() == '{}':
                ismissingitem = True
            else:
                ismissingitem = False
            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    # Best-effort: fall back to defaults below if the API is unreachable.
                    pass
                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    # Convert ISO code to English name when known; otherwise keep lowercase code.
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass
                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'
                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# e.g. "<a href=...>ECGpedia</a> dumped with ... WikiTeam tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl # e.g. http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # hard to fetch automatically; may be API rightsinfo text or footer copyright line
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                # Placeholder metadata: item exists, so S3 will ignore these anyway.
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'
            #creates curl command
            curl = ['curl', '--location',
                    '--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                    '--header', "'x-archive-queue-derive:0'",
                    '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                    '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
            ]
            if c == 0:
                # First dump for this wiki: send the item-level metadata headers.
                curl += ['--header', "'x-archive-meta-mediatype:web'",
                         '--header', "'x-archive-meta-collection:%s'" % (config['collection']),
                         '--header', quoteattr('x-archive-meta-title:' + wikititle),
                         '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"),
                         '--header', quoteattr('x-archive-meta-language:' + lang),
                         '--header', "'x-archive-meta-last-updated-date:%s'" % (wikidate_text),
                         '--header', "'x-archive-meta-subject:%s'" % ('; '.join(wikikeys)), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                         '--header', quoteattr('x-archive-meta-licenseurl:' + wikilicenseurl),
                         '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
                         '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
                ]
            curl += ['--upload-file', "%s" % (dump),
                     "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
                     '> /dev/null',
                     #FIXME: Must be NUL instead on Windows, how to make compatible?
            ]
            #now also to update the metadata
            #TODO: not needed for the second file in an item
            curlmeta = ['curl --silent',
                        '--data-urlencode -target=metadata',
                        """--data-urlencode -patch='{"replace":"/last-updated-date", "value":"%s"}'""" % (wikidate_text),
                        '--data-urlencode access=' + accesskey,
                        '--data-urlencode secret=' + secretkey,
                        'http://archive.org/metadata/wiki-' + wikiname,
                        '> /dev/null'
            ]
            curlline = ' '.join(curl)
            curlmetaline = ' '.join(curlmeta)
            # os.system returns 0 on success; only then record the upload.
            if not os.system(curlline):
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
                if not ismissingitem:
                    # Existing item: refresh last-updated-date via the metadata API.
                    os.system(curlmetaline)
            c += 1
def main(params=None):
    """Entry point: parse options, read the wiki list, upload dumps.

    params: optional argv-style option list forwarded to getParameters();
    None (the default) lets it fall back to sys.argv[2:].
    """
    config = getParameters(params=params)
    # One api.php URL per line; surrounding whitespace/blank edges stripped.
    with open(listfile, 'r') as f:
        wikis = f.read().strip().splitlines()
    upload(wikis, config)
if __name__ == "__main__":
    main()