Merge pull request #74 from UDST/enhancement/download-feed-wheader
Enhancement/download feed wheader
sablanchard authored Oct 13, 2020
2 parents 0cd98da + 6dc1a13 commit 5628cca
Showing 6 changed files with 288 additions and 56 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -51,7 +51,7 @@ Citation and academic literature

To cite this tool and for a complete description of the UrbanAccess methodology see the paper below:

`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <http://trrjournalonline.trb.org/doi/pdf/10.3141/2653-05>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <https://journals.sagepub.com/doi/pdf/10.3141/2653-05>`__

For other related literature see `here <https://udst.github.io/urbanaccess/introduction.html#citation-and-academic-literature>`__.

4 changes: 2 additions & 2 deletions docs/source/introduction.rst
@@ -51,11 +51,11 @@ Citation and academic literature

To cite this tool and for a complete description of the UrbanAccess methodology see the paper below:

`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <http://trrjournalonline.trb.org/doi/pdf/10.3141/2653-05>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <https://journals.sagepub.com/doi/pdf/10.3141/2653-05>`__

For a detailed use case of the tool see the following paper:

`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. <http://trrjournalonline.trb.org/doi/abs/10.3141/2654-06>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. <https://journals.sagepub.com/doi/pdf/10.3141/2654-06>`__

Reporting bugs
~~~~~~~~~~~~~~~~~~~~~~~~
107 changes: 60 additions & 47 deletions urbanaccess/gtfsfeeds.py
@@ -5,7 +5,7 @@
import os
import logging as lg
import time
from six.moves.urllib.request import urlopen
from six.moves.urllib import request

from urbanaccess.utils import log
from urbanaccess import config
@@ -78,9 +78,11 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
for value in yaml_config['gtfs_feeds'][key]:
if not isinstance(value, str):
raise ValueError('{} must be a string'.format(value))

if (pd.Series(
yaml_config['gtfs_feeds'].values()).value_counts() != 1).all():
unique_url_count = len(
pd.DataFrame.from_dict(yaml_config['gtfs_feeds'], orient='index')[
0].unique())
url_count = len(yaml_config['gtfs_feeds'])
if unique_url_count != url_count:
raise ValueError(
'duplicate values were found when the passed add_dict '
'dictionary was added to the existing dictionary. Feed URL '
@@ -439,7 +441,7 @@ def download(data_folder=os.path.join(config.settings.data_folder),
raise ValueError('{} must be a string'.format(value))

for key, value in feed_dict.items():
if value in feed_dict.gtfs_feeds.values():
if value in feeds.gtfs_feeds.values():
raise ValueError(
'duplicate values were found when the passed add_dict '
'dictionary was added to the existing dictionary. Feed '
@@ -458,70 +460,81 @@ def download(data_folder=os.path.join(config.settings.data_folder),
if not os.path.exists(download_folder):
os.makedirs(download_folder)
log('{} does not exist. Directory was created'.format(download_folder))
log('{} GTFS feeds will be downloaded here: {}'.format(
log('{:,} GTFS feed(s) will be downloaded here: {}'.format(
len(feeds.gtfs_feeds), download_folder))

start_time1 = time.time()
msg_no_connection_w_status = ('Unable to connect. URL at {} returned '
'status code {} and no data')
msg_no_connection = 'Unable to connect to: {}. Error: {}'
msg_download_succeed = ('{} GTFS feed downloaded successfully. '
'Took {:,.2f} seconds for {:,.1f}KB')
# TODO: add file counter and print number to user
for feed_name_key, feed_url_value in feeds.gtfs_feeds.items():
start_time2 = time.time()
zipfile_path = ''.join([download_folder, '/', feed_name_key, '.zip'])

if 'http' in feed_url_value:
status_code = urlopen(feed_url_value).getcode()
if status_code == 200:
file = urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)
# add default user-agent header in request to avoid 403 Errors
opener = request.build_opener()
opener.addheaders = [('User-agent', '')]
request.install_opener(opener)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
log(
'{} GTFS feed downloaded successfully. Took {:,'
'.2f} seconds for {:,.1f}KB'.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
elif status_code in [429, 504]:
log(
'URL at {} returned status code {} and no data. '
'Re-trying request in {:.2f} seconds.'.format(
feed_url_value, status_code, error_pause_duration),
level=lg.WARNING)
time.sleep(error_pause_duration)
try:
file = urlopen(feed_url_value)
if 'http' in feed_url_value:
try:
status_code = request.urlopen(feed_url_value).getcode()
if status_code == 200:
file = request.urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
except Exception:
log('Unable to connect. URL at {} returned status code '
'{} and no data'.format(feed_url_value, status_code),
log(msg_download_succeed.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
elif status_code in [429, 504]:
msg = ('URL at {} returned status code {} and no data. '
'Re-trying request in {:.2f} seconds.')
log(msg.format(feed_url_value, status_code,
error_pause_duration),
level=lg.WARNING)
time.sleep(error_pause_duration)
try:
file = request.urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
except Exception:
log(msg_no_connection_w_status.format(
feed_url_value, status_code),
level=lg.ERROR)
else:
log(msg_no_connection_w_status.format(
feed_url_value, status_code),
level=lg.ERROR)
else:
log(
'Unable to connect. URL at {} returned status code {} '
'and no data'.format(
feed_url_value, status_code), level=lg.ERROR)
except Exception:
log(msg_no_connection.format(
feed_url_value, traceback.format_exc()),
level=lg.ERROR)
else:
try:
file = urlopen(feed_url_value)
file = request.urlopen(feed_url_value)
_zipfile_type_check(file=file,
feed_url_value=feed_url_value)
with open(
''.join([download_folder, '/', feed_name_key, '.zip']),
"wb") as local_file:
file_path = ''.join(
[download_folder, '/', feed_name_key, '.zip'])
with open(file_path, "wb") as local_file:
local_file.write(file.read())
log(
'{} GTFS feed downloaded successfully. Took {:,'
'.2f} seconds for {:,.1f}KB'.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
log(msg_download_succeed.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
except Exception:
log('Unable to connect: {}'.format(traceback.format_exc()),
log(msg_no_connection.format(
feed_url_value, traceback.format_exc()),
level=lg.ERROR)

log('GTFS feed download completed. Took {:,.2f} seconds'.format(
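The functional core of this change is the default ``User-agent`` header that gtfsfeeds.py now installs before each request, so hosts that reject the bare Python user agent with HTTP 403 will still serve the feed zip. Below is a minimal standalone sketch of that pattern; the helper name and the example URL are illustrative only, not part of the codebase.

```python
# Sketch (assumed names): the opener/User-agent pattern added in the diff
# above, reduced to a standalone helper.
from six.moves.urllib import request


def fetch_gtfs_zip(feed_url, zipfile_path):
    # add a default user-agent header so hosts that return 403 for the
    # bare Python user agent will accept the request
    opener = request.build_opener()
    opener.addheaders = [('User-agent', '')]
    request.install_opener(opener)

    response = request.urlopen(feed_url)
    if response.getcode() == 200:
        with open(zipfile_path, 'wb') as local_file:
            local_file.write(response.read())
    return response.getcode()


# hypothetical usage; example.com is a placeholder, not a real feed host
# fetch_gtfs_zip('http://example.com/gtfs/latest.zip', 'feed.zip')
```

Installing the opener globally means every subsequent ``urlopen`` call in the process sends the header, which matches how ``download()`` applies it inside its feed loop.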
219 changes: 219 additions & 0 deletions urbanaccess/tests/test_gtfsfeeds.py
@@ -0,0 +1,219 @@
import pytest
import os
import pandas as pd
import yaml

from urbanaccess import gtfsfeeds
from urbanaccess.gtfsfeeds import feeds


@pytest.fixture
def feed_dict1():
return {
'ac transit':
'http://www.actransit.org/wp-content/uploads/GTFSJune182017B.zip'}


@pytest.fixture
def feed_dict2():
return {
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit'
'/latest.zip'}


@pytest.fixture
def feed_dict3():
return {
'ac transit': 'http://www.actransit.org/wp-content/uploads'
'/GTFSJune182017B.zip',
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit'
'/latest.zip'}


@pytest.fixture
def feed_yaml(tmpdir):
yaml_dict = {
'gtfs_feeds': {
'ac transit': 'http://www.actransit.org/wp-content/uploads'
'/GTFSJune182017B.zip',
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid'
'-transit/latest.zip'}}

yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml')
with open(yaml_path, 'w') as f:
yaml.dump(yaml_dict, f, default_flow_style=False)
return tmpdir.strpath


def test_feed_object():
assert isinstance(gtfsfeeds.feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
assert isinstance(feeds.to_dict(), dict)


def test_add_feed(feed_dict1, feed_dict2):
feeds.add_feed(add_dict=feed_dict1)
assert len(feeds.gtfs_feeds.keys()) == 1
feeds.add_feed(add_dict=feed_dict2)
assert len(feeds.gtfs_feeds.keys()) == 2
feed_dict_replace = {'Bay Area Rapid Transit': 'test'}
feeds.add_feed(add_dict=feed_dict_replace, replace=True)

for key, value in feeds.gtfs_feeds.items():
if key == 'Bay Area Rapid Transit':
assert value == 'test'
assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_remove_feed(feed_dict3):
feeds.add_feed(add_dict=feed_dict3)
feeds.remove_feed(del_key='ac transit')
assert len(feeds.gtfs_feeds.keys()) == 1
assert 'ac transit' not in feeds.gtfs_feeds.keys()
feeds.remove_feed(remove_all=True)
assert len(feeds.gtfs_feeds.keys()) == 0
assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_to_yaml_feed(tmpdir, feed_dict3):
feeds.add_feed(add_dict=feed_dict3)
feeds.to_yaml(tmpdir.strpath, overwrite=True)

yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml')
with open(yaml_path, 'r') as f:
yaml_config = yaml.load(f)
assert yaml_config['gtfs_feeds'] == feed_dict3
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_from_yaml_feed(feed_yaml):
yaml_path = feed_yaml
feeds_from_yaml = feeds.from_yaml(yaml_path, 'gtfsfeeds.yaml')

assert isinstance(feeds_from_yaml, gtfsfeeds.urbanaccess_gtfsfeeds)
assert len(feeds_from_yaml.gtfs_feeds.keys()) == 2

valid_feed = ('http://www.gtfs-data-exchange.com/'
'agency/bay-area-rapid-transit/latest.zip')
assert feeds_from_yaml.gtfs_feeds['Bay Area Rapid Transit'] == valid_feed

valid_feed = ('http://www.actransit.org/wp-content/'
'uploads/GTFSJune182017B.zip')
assert feeds_from_yaml.gtfs_feeds['ac transit'] == valid_feed
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_search_contains_gtfs_data_exchange():
search_result = gtfsfeeds.search(api='gtfsdataexch',
search_text=['ac transit', 'santa rosa'],
search_field=None, match='contains',
add_feed=False, overwrite_feed=False)

assert isinstance(search_result, pd.DataFrame)
assert search_result.empty is False
assert len(search_result) == 2

col_list = ['dataexchange_url', 'dataexchange_id', 'name']
for col in col_list:
assert col in search_result.columns
assert search_result[col].isnull().all() == False # noqa

value_list = ['ac-transit', 'santa-rosa-citybus']
for value in value_list:
assert value in list(search_result['dataexchange_id'])


def test_search_contains_add_feed_gtfs_data_exchange():
gtfsfeeds.search(api='gtfsdataexch',
search_text='ac transit',
search_field=None, match='contains',
add_feed=True, overwrite_feed=False)

assert len(feeds.gtfs_feeds.keys()) == 1
assert 'AC Transit' in feeds.gtfs_feeds.keys()

# test overwrite feed
gtfsfeeds.search(api='gtfsdataexch',
search_text='Bay Area Rapid Transit',
search_field=None, match='exact',
add_feed=True, overwrite_feed=True)

assert len(feeds.gtfs_feeds.keys()) == 1
assert 'Bay Area Rapid Transit' in feeds.gtfs_feeds.keys()
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_search_exact_search_field_gtfs_data_exchange():
# test search field
search_result = gtfsfeeds.search(api='gtfsdataexch',
search_text='San Francisco Bay Area',
search_field=['area'], match='exact',
add_feed=False, overwrite_feed=False)
assert len(search_result) == 8


def test_download_gtfs_feed_via_feed_object(feed_dict3, tmpdir):
feeds.add_feed(add_dict=feed_dict3)
tmp_path = tmpdir.strpath
gtfsfeeds.download(data_folder=tmp_path)

filelist = ['ac transit.zip', 'Bay Area Rapid Transit.zip']
txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt',
'stops.txt', 'trips.txt']
zip_path = os.path.join(tmp_path, 'gtfsfeed_zips')
txt_path = os.path.join(tmp_path, 'gtfsfeed_text')
for zipfile in filelist:
assert os.path.exists(os.path.join(zip_path, zipfile)) is True
for folder in filelist:
check_path = os.path.join(txt_path, folder.replace('.zip', ''))
assert os.path.exists(check_path) is True
for txt in txtlist:
check_path = os.path.join(
txt_path, folder.replace('.zip', ''), txt)
assert os.path.exists(check_path) is True
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_download_gtfs_feed_via_feed_name_and_dict(tmpdir):
tmp_path = tmpdir.strpath
gtfsfeeds.download(
data_folder=tmp_path,
feed_name='test_agency',
feed_url=('http://www.gtfs-data-exchange.com/'
'agency/bay-area-rapid-transit/latest.zip'),
feed_dict=None,
error_pause_duration=5, delete_zips=False)

gtfsfeeds.download(
data_folder=tmp_path,
feed_dict={
'test_agency_dict': 'http://www.gtfs-data-exchange.com/agency/'
'ac-transit/latest.zip'},
error_pause_duration=5, delete_zips=False)

filelist = ['test_agency.zip', 'test_agency_dict.zip']
txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt',
'stops.txt', 'trips.txt']
zip_path = os.path.join(tmp_path, 'gtfsfeed_zips')
txt_path = os.path.join(tmp_path, 'gtfsfeed_text')
for zipfile in filelist:
assert os.path.exists(os.path.join(zip_path, zipfile)) is True
for folder in filelist:
check_path = os.path.join(txt_path, folder.replace('.zip', ''))
assert os.path.exists(check_path) is True
for txt in txtlist:
check_path = os.path.join(
txt_path, folder.replace('.zip', ''), txt)
assert os.path.exists(check_path) is True
# clear feeds from global memory
feeds.remove_feed(remove_all=True)
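The tests above exercise the download flow end to end. For quick reference, the calling pattern they cover looks roughly like this; the agency name and URL are placeholders, not a real feed.

```python
# Rough usage sketch of the API exercised by the tests above.
from urbanaccess import gtfsfeeds
from urbanaccess.gtfsfeeds import feeds

# register a feed as {name: zip URL}; the URL here is a placeholder
feeds.add_feed(add_dict={
    'example agency': 'http://example.com/gtfs/latest.zip'})

# download all registered feeds: zips are saved under
# <data_folder>/gtfsfeed_zips and extracted to <data_folder>/gtfsfeed_text
gtfsfeeds.download(data_folder='data',
                   error_pause_duration=5,
                   delete_zips=False)

# clear the global feeds object when done, as the tests do
feeds.remove_feed(remove_all=True)
```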