#!/usr/bin/env python3
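# Download your own (unprocessed original) images from Mapillary,
# sequence by sequence, via the Mapillary v3 API.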
import argparse
import json
import os
import re
import requests
import sys
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
##################################################################################################
# config
#
VERSION = "1.2"
NUM_THREADS = 16
# mapillary_tools client_id
CLIENT_ID = "MkJKbDA0bnZuZlcxeTJHTmFqN3g1dzo1YTM0NjRkM2EyZGU5MzBh"
# The API allows up to 1000 per page, but large pages tend to time out
SEQUENCES_PER_PAGE = "100"
# the API accepts at most 220 image keys per call
REQUESTS_PER_CALL = 210
# Number of retries for a sequence.
# Normally a sequence is limited to 1000 images, but this no longer holds for
# iPhone captures. Given the low reliability of AWS S3 (<0.99) we need an
# insanely high retry value to download ~3k images.
SEQUENCE_DL_MAX_RETRIES = 128
# verbose level 0..4
# 0: only important messages, like errors
# 1: more verbose
# 2: full verbose
# 3: debug
# 4: more debug
DEBUG = 0
# connection timeout to AWS S3, in seconds
# a timeout raises an exception for that download, and the image is retried later
DOWNLOAD_FILE_TIMEOUT = 5
# timeout for login and sequence requests, in seconds
META_TIMEOUT = 60
API_ENDPOINT = "https://a.mapillary.com"
LOGIN_URL = API_ENDPOINT + "/v2/ua/login?client_id=" + CLIENT_ID
SEQUENCES_URL = (
API_ENDPOINT
+ "/v3/sequences?client_id="
+ CLIENT_ID
+ "&per_page="
+ SEQUENCES_PER_PAGE
)
MODEL_URL = API_ENDPOINT + "/v3/model.json?client_id=" + CLIENT_ID
# regex matching the S3 "request expired" error body (raw string, so the regex escapes survive)
AWS_EXPIRED = r'<\?xml version="1\.0" encoding="UTF-8"\?>\n<Error><Code>AccessDenied</Code><Message>Request has expired</Message>'
# show what we would do
DRY_RUN = False
# Store images by date and sequence subfolders.
# Disabled by default to be compatible with older versions.
SUBFOLDER = False
# accumulated download size in bytes, per sequence and in total
_DOWNLOAD_SEQUENCE_SIZE = 0
_DOWNLOAD_TOTAL_SIZE = 0
# estimated average image size in bytes, used for dry-run estimates
AVERAGE_IMAGE_SIZE = 2500000
##################################################################################################
# exception classes
#
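# SSLException and DownloadException abort the current download pass so the
# affected images are retried on a later pass; URLExpireException additionally
# forces a refresh of the signed source URLs.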
class SSLException(Exception):
pass
class DownloadException(Exception):
pass
class URLExpireException(Exception):
pass
##################################################################################################
# functions
#
def get_mpy_auth(email, password):
    # Returns a Mapillary API token for the given credentials
    payload = {"email": email, "password": password}
    r = requests.post(LOGIN_URL, json=payload, timeout=META_TIMEOUT)
    try:
        data = r.json()
    except ValueError:
        data = {}
    r.close()
    if "token" in data:
        return data["token"]
    if "message" in data:
        print("Authentication failed: %s" % data["message"])
    else:
        print(
            "Authentication failed with HTTP error %r : %r" % (r.status_code, r.text,)
        )
    sys.exit(-1)
def get_user_sequences(mpy_token, username, start_date, end_date):
# Fetches all sequences for the username and
# returns them in an array of json objects
# https://www.mapillary.com/developer/api-documentation/#the-sequence-object
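    # Results are paginated; follow-up pages come from the HTTP "Link" header,
    # which requests exposes as r.links["next"].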
response = []
nb_images = 0
headers = {"Authorization": "Bearer " + mpy_token}
    try:
        r = requests.get(
            SEQUENCES_URL,
            headers=headers,
            params={"usernames": username, "start_time": start_date, "end_time": end_date},
            timeout=META_TIMEOUT,
        )
    except requests.exceptions.RequestException:
        raise DownloadException("Error downloading sequence URL %r" % SEQUENCES_URL)
    # validate result
    if r.status_code != requests.codes.ok:
        raise DownloadException("Error status response sequence URL: HTTP %d, giving up!" % r.status_code)
    try:
        features = r.json()["features"]
    except (ValueError, KeyError):
        raise DownloadException("Error parsing json features response, giving up!")
    for feature in features:
        response.append(feature)
        nb_images += len(feature["properties"]["coordinateProperties"]["image_keys"])
r.close()
nb_seq = len(response)
    # progress line, e.g.: "Fetched 100 sequences (xxx images) ..."
if DEBUG >= 1:
print("Fetched %s sequences (%s images) ..." % (nb_seq, nb_images), end="\r")
while "next" in r.links:
try:
r = requests.get(r.links["next"]["url"], headers=headers, timeout=META_TIMEOUT)
except:
print("Error downloading next URL %r" % r.links["next"]["url"])
continue
if r.status_code != requests.codes.ok:
print("Error status response links: HTTP %d - ignore." % r.status_code)
continue
try:
json = r.json()
features = json["features"]
except:
print("Error parsing json object, ignore")
continue
for feature in features:
response.append(feature)
nb_images += len(
feature["properties"]["coordinateProperties"]["image_keys"]
)
r.close()
nb_seq = len(response)
if DEBUG >= 1:
print("Fetched %s sequences (%s images) ..." % (nb_seq, nb_images), end="\r")
# cleanup \r
if DEBUG >= 1:
print("")
return (response, nb_seq)
def get_source_urls(download_list, mpy_token, username):
if DEBUG >= 2:
print(" Start get_source_urls()")
# Fetches "unprocessed original" images URL and returns them in a dict
source_urls = {}
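    # request URLs in chunks of REQUESTS_PER_CALL keys; the API accepts at
    # most 220 image keys per call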
chunks = [
download_list[x : x + REQUESTS_PER_CALL]
for x in range(0, len(download_list), REQUESTS_PER_CALL)
]
counter = 0
for chunk in chunks:
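        # model.json is a Falcor-style endpoint: a single request returns
        # original_url for every image key in the chunk as a "jsonGraph"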
params = {
"paths": json.dumps(
[["imageByKey", chunk, ["original_url"]]], separators=(",", ":")
),
"method": "get",
}
headers = {"Authorization": "Bearer " + mpy_token}
        counter += len(chunk)
        if DEBUG >= 3:
            print(" Fetch model URLs in chunks: (%d/%d)" % (counter, len(download_list)))
        try:
            r = requests.get(MODEL_URL, headers=headers, params=params, timeout=META_TIMEOUT)
        except requests.exceptions.RequestException:
            print("Error downloading model URL %r, ignoring chunk" % MODEL_URL)
            continue
        if r.status_code != requests.codes.ok:
            print("Error status response: HTTP %d, ignoring chunk" % r.status_code)
            r.close()
            continue
        try:
            data = r.json()
        except ValueError:
            print("Error parsing JSON model URL response, ignoring chunk")
            r.close()
            continue
        r.close()
        for image_key, image in data.get("jsonGraph", {}).get("imageByKey", {}).items():
            if "value" in image.get("original_url", {}):
                source_urls[image_key] = image["original_url"]["value"]
return source_urls
def download_file(args):
image_key, sorted_path, source_url = args
global _DOWNLOAD_SEQUENCE_SIZE
try:
r = requests.get(source_url, stream=True, timeout=DOWNLOAD_FILE_TIMEOUT)
    except requests.exceptions.SSLError:
        raise SSLException("SSL error downloading %r, retrying later" % image_key)
    except Exception:
        if DEBUG >= 1:
            raise DownloadException("Error downloading %r, retrying later. Info %r" % (image_key, sys.exc_info()[0],))
        else:
            return False
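    # A file that already exists with the exact advertised size counts as
    # downloaded; anything else (missing, truncated, size mismatch) is
    # (re)written.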
    if r.status_code == requests.codes.ok:
        # content-length should always be set by S3; default to 0 to avoid a KeyError
        size = int(r.headers.get("content-length", 0))
        if os.path.isfile(sorted_path) and os.path.getsize(sorted_path) == size:
            if DEBUG >= 3:
                print(" Already downloaded as %r" % sorted_path)
        else:
            if os.path.isfile(sorted_path):
                if DEBUG >= 2:
                    print(" Size mismatch for %r, replacing..." % sorted_path)
            try:
                with open(sorted_path, "wb") as f:
                    f.write(r.content)
            except Exception:
                if DEBUG >= 1:
                    raise DownloadException("Error downloading image %r, retrying later. Info %r" % (image_key, sys.exc_info()[0],))
                else:
                    return False
            # accumulate downloaded bytes for this sequence
            _DOWNLOAD_SEQUENCE_SIZE += size
        return image_key
elif r.status_code == 403 and re.match(AWS_EXPIRED, r.text):
raise URLExpireException("Download token expired, requesting fresh one ...")
else:
print(
" Error %r downloading image %r : %r" % (r.status_code, image_key, r.text,)
)
return False
def download_sequence(output_folder, mpy_token, sequence, username, c, nb_sequences):
global _DOWNLOAD_SEQUENCE_SIZE
global _DOWNLOAD_TOTAL_SIZE
subfolder_enabled = SUBFOLDER
if DEBUG >= 3:
print(" Prepare sequence download")
if DEBUG >= 4:
pprint(sequence)
sequence_name = (
sequence["properties"]["captured_at"]
+ "_"
+ sequence["properties"]["created_at"]
)
if os.name == "nt":
sequence_name = sequence_name.replace(":", "_")
sequence_day = sequence_name.split("T")[0]
sorted_folder = output_folder + "/" + sequence_day
    if subfolder_enabled:
subfolder = sequence["properties"]["captured_at"]
if os.name == "nt":
subfolder = subfolder.replace(":", "_")
sorted_folder = sorted_folder + "/" + subfolder
download_list = []
os.makedirs(sorted_folder, exist_ok=True)
# First pass on image_keys : sorts which one needs downloading
image_keys = sequence["properties"]["coordinateProperties"]["image_keys"]
for image_index, image_key in enumerate(image_keys, 1):
sorted_path = (
sorted_folder + "/" + sequence_name + "_" + "%04d" % image_index + ".jpg"
)
if not os.path.exists(sorted_path):
download_list.append(image_key)
elif os.stat(sorted_path).st_size == 0:
download_list.append(image_key)
if not download_list:
if DEBUG >= 2:
print(" Sequence %r already fully downloaded" % sequence_name)
return 0, 0
already_downloaded = len(image_keys) - len(download_list)
if already_downloaded:
if DEBUG >= 1:
print(" Already downloaded: %d/%d" % (already_downloaded, len(image_keys)))
if DRY_RUN:
return 1, len(download_list)
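    # Signed S3 URLs expire and single downloads fail fairly often, so the
    # loop below (re)fetches source URLs when flagged and retries the
    # remaining images until the list is empty or SEQUENCE_DL_MAX_RETRIES
    # is reached.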
    # Second pass: fetch fresh source URLs and download, retrying as needed
    sequence_dl_retries = 0
    update_urls = True
    while download_list and sequence_dl_retries < SEQUENCE_DL_MAX_RETRIES:
if update_urls:
source_urls = get_source_urls(download_list, mpy_token, username)
update_urls = False
sequence_dl_retries += 1
# show only on a retry
if sequence_dl_retries > 1 and DEBUG >= 1:
print("sequence download retries: %s/%s" % (sequence_dl_retries, SEQUENCE_DL_MAX_RETRIES))
if len(download_list) > len(source_urls):
print(
" Missing %d/%d images, will refresh and retry later"
% (len(download_list) - len(source_urls), len(download_list))
)
            # if we got nothing at all, wait a little before retrying
            if not source_urls:
                if DEBUG >= 1:
                    print(" Waiting 2 seconds, no source URLs returned")
                time.sleep(2)
                sequence_dl_retries -= 1
# refresh list after this pass
update_urls = True
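        # download this pass's missing images in parallel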
pool = ThreadPool(NUM_THREADS)
pool_args = []
for image_index, image_key in enumerate(image_keys, 1):
if image_key in download_list:
sorted_path = (
sorted_folder
+ "/"
+ sequence_name
+ "_"
+ "%04d" % image_index
+ ".jpg"
)
if image_key in source_urls:
source_url = source_urls[image_key]
pool_args.append((image_key, sorted_path, source_url))
if DEBUG >= 3:
print(" Filling download pool done")
try:
for i, image_key in enumerate(pool.imap(download_file, pool_args), 1):
if image_key:
download_list.remove(image_key)
print(
" Downloading images #%03d out of %03d round: %d" % (i, len(pool_args), sequence_dl_retries),
end="\r",
flush=True,
)
except SSLException as e:
print(e)
except DownloadException as e:
print(e)
except URLExpireException as e:
print(e)
sequence_dl_retries -= 1
# refresh urls
update_urls = True
finally:
pool.terminate()
pool.join()
print(" Done sequence %r (%d/%d) %3.1f MB, camera: %s" % (sequence_name, c, nb_sequences, _DOWNLOAD_SEQUENCE_SIZE/1024/1024, sequence["properties"]["camera_make"]), flush=True)
_DOWNLOAD_TOTAL_SIZE += _DOWNLOAD_SEQUENCE_SIZE
_DOWNLOAD_SEQUENCE_SIZE = 0
return 1, len(source_urls)
def add(tgt, src):
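    # element-wise in-place addition: tgt[i] += src[i]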
for i in range(len(tgt)):
tgt[i] += src[i]
def main(email, password, username, output_folder, start_date, end_date):
mpy_token = get_mpy_auth(email, password)
user_sequences, nb_sequences = get_user_sequences(
mpy_token, username, start_date, end_date
)
if not nb_sequences:
print(
"No sequences found to download. Check this is the valid username at https://www.mapillary.com/app/user/%s"
% username
)
sys.exit(-2)
accumulated_stats = [0, 0] # seq, img,
for c, sequence in enumerate(reversed(user_sequences), 1):
stats = download_sequence(output_folder, mpy_token, sequence, username, c, nb_sequences )
add(accumulated_stats, stats)
if DEBUG >= 2:
print(
"Sequence %s_%s (%d/%d) contains %d images camera: %s"
% (
sequence["properties"]["captured_at"],
sequence["properties"]["created_at"],
c,
nb_sequences,
stats[1],
sequence["properties"]["camera_make"],
)
)
if DRY_RUN:
print(
"%s images in %s sequences would have been downloaded without the dry run"
% (accumulated_stats[1], accumulated_stats[0],)
)
        download_size = accumulated_stats[1] * AVERAGE_IMAGE_SIZE
if accumulated_stats[1] == 0:
print("You are up-to-date, all images are already downloaded. Great!")
else:
print("Estimated download size: %2.1f GB" % (download_size / 1024/1024/1024))
print("Estimated download time 250Mbit/s: %2.1f min, 100Mbit/s: %2.1f min, 50Mbit/s: %2.1f min, 16Mbit/s %2.1f min" % (
(download_size / 250/1000/1000*8/60),
(download_size / 100/1000/1000*8/60),
(download_size / 50/1000/1000*8/60),
(download_size / 16/1000/1000*8/60),
))
else:
global _DOWNLOAD_TOTAL_SIZE
if accumulated_stats[1] > 0:
print("Total images: %s total download size: %2.1f GB average image size: %2.1f MB" %
(accumulated_stats[1],
_DOWNLOAD_TOTAL_SIZE/1024/1024/1024,
_DOWNLOAD_TOTAL_SIZE/accumulated_stats[1]/1024/1024))
else:
print("You are up-to-date, all images are already downloaded. Great!")
return 0
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download your images from Mapillary, version: " + VERSION)
parser.add_argument("email", help="Your email address for mapillary authentication")
parser.add_argument("password", help="Your mapillary password")
parser.add_argument("username", help="Your mapillary username")
parser.add_argument("output_folder", help="Download destination")
parser.add_argument(
"--start-date",
help="Filter sequences that are captured since this date",
metavar="YYYY-MM-DD",
)
parser.add_argument(
"--end-date",
help="Filter sequences that are captured before this date (Note: end date is not included!)",
metavar="YYYY-MM-DD",
)
parser.add_argument( "--debug", metavar="0..4", help="set global debug level, default: " + str(DEBUG))
parser.add_argument( "--timeout", metavar="1..300", help="set connection/read timeout in seconds, default: " + str(DOWNLOAD_FILE_TIMEOUT))
parser.add_argument( "--timeout-meta", metavar="1..300", help="set connection/read timeout for meta requests in seconds, default: " + str(META_TIMEOUT))
parser.add_argument( "--threads", metavar="1..128", help="number of threads, default: " + str(NUM_THREADS))
parser.add_argument( "--retries", metavar="1..512", help="sequence max. retries, default: " + str(SEQUENCE_DL_MAX_RETRIES))
    parser.add_argument(
        "-D", "--dry-run", action="store_true", help="Check sequence status, display estimates and exit"
    )
parser.add_argument( "--subfolder", action="store_true", help="Store images by date and sequence subfolders, default: " + str(SUBFOLDER))
args = parser.parse_args()
if args.dry_run:
DRY_RUN = True
if args.subfolder:
SUBFOLDER = True
    if args.debug:
        try:
            debug = int(args.debug)
        except ValueError:
            print("illegal value for debug: %s" % args.debug)
            sys.exit(-1)
        if 0 <= debug <= 4:
            DEBUG = debug
        else:
            print("debug parameter is out of range 0..4: %s, ignored" % debug)
    if args.timeout:
        try:
            timeout = float(args.timeout)
        except ValueError:
            print("illegal value for timeout: %s" % args.timeout)
            sys.exit(-1)
        if 0 < timeout <= 300:
            DOWNLOAD_FILE_TIMEOUT = timeout
        else:
            print("timeout parameter is out of range 1..300: %s, ignored" % timeout)
    if args.timeout_meta:
        try:
            timeout_meta = float(args.timeout_meta)
        except ValueError:
            print("illegal value for timeout-meta: %s" % args.timeout_meta)
            sys.exit(-1)
        if 0 < timeout_meta <= 300:
            META_TIMEOUT = timeout_meta
        else:
            print("timeout-meta parameter is out of range 1..300: %s, ignored" % timeout_meta)
    if args.threads:
        try:
            threads = int(args.threads)
        except ValueError:
            print("illegal value for threads: %s" % args.threads)
            sys.exit(-1)
        if 0 < threads <= 128:
            NUM_THREADS = threads
        else:
            print("threads parameter is out of range 1..128: %s, ignored" % threads)
    if args.retries:
        try:
            retries = int(args.retries)
        except ValueError:
            print("illegal value for retries: %s" % args.retries)
            sys.exit(-1)
        if 0 < retries <= 512:
            SEQUENCE_DL_MAX_RETRIES = retries
        else:
            print("retries parameter is out of range 1..512: %s, ignored" % retries)
if DEBUG > 0:
print("number of threads: %d, connection timeout: %2.1f sec., retries: %d, debug: %d, with subfolder: %s" % (NUM_THREADS, DOWNLOAD_FILE_TIMEOUT, SEQUENCE_DL_MAX_RETRIES, DEBUG, SUBFOLDER))
    sys.exit(
main(
args.email,
args.password,
args.username,
args.output_folder,
args.start_date,
args.end_date,
)
)