Merge pull request #74 from UDST/enhancement/download-feed-wheader
Enhancement/download feed wheader
sablanchard authored Oct 13, 2020
2 parents 0cd98da + 6dc1a13 commit 5628cca
Showing 6 changed files with 288 additions and 56 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -51,7 +51,7 @@ Citation and academic literature

To cite this tool and for a complete description of the UrbanAccess methodology see the paper below:

`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <http://trrjournalonline.trb.org/doi/pdf/10.3141/2653-05>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <https://journals.sagepub.com/doi/pdf/10.3141/2653-05>`__

For other related literature see `here <https://udst.github.io/urbanaccess/introduction.html#citation-and-academic-literature>`__.

4 changes: 2 additions & 2 deletions docs/source/introduction.rst
@@ -51,11 +51,11 @@ Citation and academic literature

To cite this tool and for a complete description of the UrbanAccess methodology see the paper below:

`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <http://trrjournalonline.trb.org/doi/pdf/10.3141/2653-05>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. <https://journals.sagepub.com/doi/pdf/10.3141/2653-05>`__

For a detailed use case of the tool see the following paper:

`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. <http://trrjournalonline.trb.org/doi/abs/10.3141/2654-06>`__
`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. <https://journals.sagepub.com/doi/pdf/10.3141/2654-06>`__

Reporting bugs
~~~~~~~~~~~~~~~~~~~~~~~~
107 changes: 60 additions & 47 deletions urbanaccess/gtfsfeeds.py
@@ -5,7 +5,7 @@
import os
import logging as lg
import time
from six.moves.urllib.request import urlopen
from six.moves.urllib import request

from urbanaccess.utils import log
from urbanaccess import config
@@ -78,9 +78,11 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
for value in yaml_config['gtfs_feeds'][key]:
if not isinstance(value, str):
raise ValueError('{} must be a string'.format(value))

if (pd.Series(
yaml_config['gtfs_feeds'].values()).value_counts() != 1).all():
unique_url_count = len(
pd.DataFrame.from_dict(yaml_config['gtfs_feeds'], orient='index')[
0].unique())
url_count = len(yaml_config['gtfs_feeds'])
if unique_url_count != url_count:
raise ValueError(
'duplicate values were found when the passed add_dict '
'dictionary was added to the existing dictionary. Feed URL '
@@ -439,7 +441,7 @@ def download(data_folder=os.path.join(config.settings.data_folder),
raise ValueError('{} must be a string'.format(value))

for key, value in feed_dict.items():
if value in feed_dict.gtfs_feeds.values():
if value in feeds.gtfs_feeds.values():
raise ValueError(
'duplicate values were found when the passed add_dict '
'dictionary was added to the existing dictionary. Feed '
@@ -458,70 +460,81 @@ def download(data_folder=os.path.join(config.settings.data_folder),
if not os.path.exists(download_folder):
os.makedirs(download_folder)
log('{} does not exist. Directory was created'.format(download_folder))
log('{} GTFS feeds will be downloaded here: {}'.format(
log('{:,} GTFS feed(s) will be downloaded here: {}'.format(
len(feeds.gtfs_feeds), download_folder))

start_time1 = time.time()
msg_no_connection_w_status = ('Unable to connect. URL at {} returned '
'status code {} and no data')
msg_no_connection = 'Unable to connect to: {}. Error: {}'
msg_download_succeed = ('{} GTFS feed downloaded successfully. '
'Took {:,.2f} seconds for {:,.1f}KB')
# TODO: add file counter and print number to user
for feed_name_key, feed_url_value in feeds.gtfs_feeds.items():
start_time2 = time.time()
zipfile_path = ''.join([download_folder, '/', feed_name_key, '.zip'])

if 'http' in feed_url_value:
status_code = urlopen(feed_url_value).getcode()
if status_code == 200:
file = urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)
# add default user-agent header in request to avoid 403 Errors
opener = request.build_opener()
opener.addheaders = [('User-agent', '')]
request.install_opener(opener)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
log(
'{} GTFS feed downloaded successfully. Took {:,'
'.2f} seconds for {:,.1f}KB'.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
elif status_code in [429, 504]:
log(
'URL at {} returned status code {} and no data. '
'Re-trying request in {:.2f} seconds.'.format(
feed_url_value, status_code, error_pause_duration),
level=lg.WARNING)
time.sleep(error_pause_duration)
try:
file = urlopen(feed_url_value)
if 'http' in feed_url_value:
try:
status_code = request.urlopen(feed_url_value).getcode()
if status_code == 200:
file = request.urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
except Exception:
log('Unable to connect. URL at {} returned status code '
'{} and no data'.format(feed_url_value, status_code),
log(msg_download_succeed.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
elif status_code in [429, 504]:
msg = ('URL at {} returned status code {} and no data. '
'Re-trying request in {:.2f} seconds.')
log(msg.format(feed_url_value, status_code,
error_pause_duration),
level=lg.WARNING)
time.sleep(error_pause_duration)
try:
file = request.urlopen(feed_url_value)

_zipfile_type_check(file=file,
feed_url_value=feed_url_value)

with open(zipfile_path, "wb") as local_file:
local_file.write(file.read())
except Exception:
log(msg_no_connection_w_status.format(
feed_url_value, status_code),
level=lg.ERROR)
else:
log(msg_no_connection_w_status.format(
feed_url_value, status_code),
level=lg.ERROR)
else:
log(
'Unable to connect. URL at {} returned status code {} '
'and no data'.format(
feed_url_value, status_code), level=lg.ERROR)
except Exception:
log(msg_no_connection.format(
feed_url_value, traceback.format_exc()),
level=lg.ERROR)
else:
try:
file = urlopen(feed_url_value)
file = request.urlopen(feed_url_value)
_zipfile_type_check(file=file,
feed_url_value=feed_url_value)
with open(
''.join([download_folder, '/', feed_name_key, '.zip']),
"wb") as local_file:
file_path = ''.join(
[download_folder, '/', feed_name_key, '.zip'])
with open(file_path, "wb") as local_file:
local_file.write(file.read())
log(
'{} GTFS feed downloaded successfully. Took {:,'
'.2f} seconds for {:,.1f}KB'.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
log(msg_download_succeed.format(
feed_name_key, time.time() - start_time2,
os.path.getsize(zipfile_path)))
except Exception:
log('Unable to connect: {}'.format(traceback.format_exc()),
log(msg_no_connection.format(
feed_url_value, traceback.format_exc()),
level=lg.ERROR)

log('GTFS feed download completed. Took {:,.2f} seconds'.format(
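The functional core of this change is the default ``User-agent`` header that gtfsfeeds.py now installs before each request, so hosts that reject the bare Python user agent with HTTP 403 will still serve the feed zip. Below is a minimal standalone sketch of that pattern; the helper name and the example URL are illustrative only, not part of the codebase.

```python
# Sketch (assumed names): the opener/User-agent pattern added in the diff
# above, reduced to a standalone helper.
from six.moves.urllib import request


def fetch_gtfs_zip(feed_url, zipfile_path):
    # add a default user-agent header so hosts that return 403 for the
    # bare Python user agent will accept the request
    opener = request.build_opener()
    opener.addheaders = [('User-agent', '')]
    request.install_opener(opener)

    response = request.urlopen(feed_url)
    if response.getcode() == 200:
        with open(zipfile_path, 'wb') as local_file:
            local_file.write(response.read())
    return response.getcode()


# hypothetical usage; example.com is a placeholder, not a real feed host
# fetch_gtfs_zip('http://example.com/gtfs/latest.zip', 'feed.zip')
```

Installing the opener globally means every subsequent ``urlopen`` call in the process sends the header, which matches how ``download()`` applies it inside its feed loop.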
219 changes: 219 additions & 0 deletions urbanaccess/tests/test_gtfsfeeds.py
@@ -0,0 +1,219 @@
import pytest
import os
import pandas as pd
import yaml

from urbanaccess import gtfsfeeds
from urbanaccess.gtfsfeeds import feeds


@pytest.fixture
def feed_dict1():
return {
'ac transit':
'http://www.actransit.org/wp-content/uploads/GTFSJune182017B.zip'}


@pytest.fixture
def feed_dict2():
return {
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit'
'/latest.zip'}


@pytest.fixture
def feed_dict3():
return {
'ac transit': 'http://www.actransit.org/wp-content/uploads'
'/GTFSJune182017B.zip',
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit'
'/latest.zip'}


@pytest.fixture
def feed_yaml(tmpdir):
yaml_dict = {
'gtfs_feeds': {
'ac transit': 'http://www.actransit.org/wp-content/uploads'
'/GTFSJune182017B.zip',
'Bay Area Rapid Transit':
'http://www.gtfs-data-exchange.com/agency/bay-area-rapid'
'-transit/latest.zip'}}

yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml')
with open(yaml_path, 'w') as f:
yaml.dump(yaml_dict, f, default_flow_style=False)
return tmpdir.strpath


def test_feed_object():
assert isinstance(gtfsfeeds.feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
assert isinstance(feeds.to_dict(), dict)


def test_add_feed(feed_dict1, feed_dict2):
feeds.add_feed(add_dict=feed_dict1)
assert len(feeds.gtfs_feeds.keys()) == 1
feeds.add_feed(add_dict=feed_dict2)
assert len(feeds.gtfs_feeds.keys()) == 2
feed_dict_replace = {'Bay Area Rapid Transit': 'test'}
feeds.add_feed(add_dict=feed_dict_replace, replace=True)

for key, value in feeds.gtfs_feeds.items():
if key == 'Bay Area Rapid Transit':
assert value == 'test'
assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_remove_feed(feed_dict3):
feeds.add_feed(add_dict=feed_dict3)
feeds.remove_feed(del_key='ac transit')
assert len(feeds.gtfs_feeds.keys()) == 1
assert 'ac transit' not in feeds.gtfs_feeds.keys()
feeds.remove_feed(remove_all=True)
assert len(feeds.gtfs_feeds.keys()) == 0
assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds)
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_to_yaml_feed(tmpdir, feed_dict3):
feeds.add_feed(add_dict=feed_dict3)
feeds.to_yaml(tmpdir.strpath, overwrite=True)

yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml')
with open(yaml_path, 'r') as f:
yaml_config = yaml.load(f)
assert yaml_config['gtfs_feeds'] == feed_dict3
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_from_yaml_feed(feed_yaml):
yaml_path = feed_yaml
feeds_from_yaml = feeds.from_yaml(yaml_path, 'gtfsfeeds.yaml')

assert isinstance(feeds_from_yaml, gtfsfeeds.urbanaccess_gtfsfeeds)
assert len(feeds_from_yaml.gtfs_feeds.keys()) == 2

valid_feed = ('http://www.gtfs-data-exchange.com/'
'agency/bay-area-rapid-transit/latest.zip')
assert feeds_from_yaml.gtfs_feeds['Bay Area Rapid Transit'] == valid_feed

valid_feed = ('http://www.actransit.org/wp-content/'
'uploads/GTFSJune182017B.zip')
assert feeds_from_yaml.gtfs_feeds['ac transit'] == valid_feed
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_search_contains_gtfs_data_exchange():
search_result = gtfsfeeds.search(api='gtfsdataexch',
search_text=['ac transit', 'santa rosa'],
search_field=None, match='contains',
add_feed=False, overwrite_feed=False)

assert isinstance(search_result, pd.DataFrame)
assert search_result.empty is False
assert len(search_result) == 2

col_list = ['dataexchange_url', 'dataexchange_id', 'name']
for col in col_list:
assert col in search_result.columns
assert search_result[col].isnull().all() == False # noqa

value_list = ['ac-transit', 'santa-rosa-citybus']
for value in value_list:
assert value in list(search_result['dataexchange_id'])


def test_search_contains_add_feed_gtfs_data_exchange():
gtfsfeeds.search(api='gtfsdataexch',
search_text='ac transit',
search_field=None, match='contains',
add_feed=True, overwrite_feed=False)

assert len(feeds.gtfs_feeds.keys()) == 1
assert 'AC Transit' in feeds.gtfs_feeds.keys()

# test overwrite feed
gtfsfeeds.search(api='gtfsdataexch',
search_text='Bay Area Rapid Transit',
search_field=None, match='exact',
add_feed=True, overwrite_feed=True)

assert len(feeds.gtfs_feeds.keys()) == 1
assert 'Bay Area Rapid Transit' in feeds.gtfs_feeds.keys()
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_search_exact_search_field_gtfs_data_exchange():
# test search field
search_result = gtfsfeeds.search(api='gtfsdataexch',
search_text='San Francisco Bay Area',
search_field=['area'], match='exact',
add_feed=False, overwrite_feed=False)
assert len(search_result) == 8


def test_download_gtfs_feed_via_feed_object(feed_dict3, tmpdir):
feeds.add_feed(add_dict=feed_dict3)
tmp_path = tmpdir.strpath
gtfsfeeds.download(data_folder=tmp_path)

filelist = ['ac transit.zip', 'Bay Area Rapid Transit.zip']
txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt',
'stops.txt', 'trips.txt']
zip_path = os.path.join(tmp_path, 'gtfsfeed_zips')
txt_path = os.path.join(tmp_path, 'gtfsfeed_text')
for zipfile in filelist:
assert os.path.exists(os.path.join(zip_path, zipfile)) is True
for folder in filelist:
check_path = os.path.join(txt_path, folder.replace('.zip', ''))
assert os.path.exists(check_path) is True
for txt in txtlist:
check_path = os.path.join(
txt_path, folder.replace('.zip', ''), txt)
assert os.path.exists(check_path) is True
# clear feeds from global memory
feeds.remove_feed(remove_all=True)


def test_download_gtfs_feed_via_feed_name_and_dict(tmpdir):
tmp_path = tmpdir.strpath
gtfsfeeds.download(
data_folder=tmp_path,
feed_name='test_agency',
feed_url=('http://www.gtfs-data-exchange.com/'
'agency/bay-area-rapid-transit/latest.zip'),
feed_dict=None,
error_pause_duration=5, delete_zips=False)

gtfsfeeds.download(
data_folder=tmp_path,
feed_dict={
'test_agency_dict': 'http://www.gtfs-data-exchange.com/agency/'
'ac-transit/latest.zip'},
error_pause_duration=5, delete_zips=False)

filelist = ['test_agency.zip', 'test_agency_dict.zip']
txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt',
'stops.txt', 'trips.txt']
zip_path = os.path.join(tmp_path, 'gtfsfeed_zips')
txt_path = os.path.join(tmp_path, 'gtfsfeed_text')
for zipfile in filelist:
assert os.path.exists(os.path.join(zip_path, zipfile)) is True
for folder in filelist:
check_path = os.path.join(txt_path, folder.replace('.zip', ''))
assert os.path.exists(check_path) is True
for txt in txtlist:
check_path = os.path.join(
txt_path, folder.replace('.zip', ''), txt)
assert os.path.exists(check_path) is True
# clear feeds from global memory
feeds.remove_feed(remove_all=True)
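The tests above exercise the download flow end to end. For quick reference, the calling pattern they cover looks roughly like this; the agency name and URL are placeholders, not a real feed.

```python
# Rough usage sketch of the API exercised by the tests above.
from urbanaccess import gtfsfeeds
from urbanaccess.gtfsfeeds import feeds

# register a feed as {name: zip URL}; the URL here is a placeholder
feeds.add_feed(add_dict={
    'example agency': 'http://example.com/gtfs/latest.zip'})

# download all registered feeds: zips are saved under
# <data_folder>/gtfsfeed_zips and extracted to <data_folder>/gtfsfeed_text
gtfsfeeds.download(data_folder='data',
                   error_pause_duration=5,
                   delete_zips=False)

# clear the global feeds object when done, as the tests do
feeds.remove_feed(remove_all=True)
```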