From 65947320f561bb3ace2840f86987d9958b1ea93e Mon Sep 17 00:00:00 2001 From: David Date: Fri, 4 Sep 2020 12:32:59 +0200 Subject: [PATCH 01/35] Allowing for passing axes to plot_net so the user can choose a projection --- urbanaccess/plot.py | 13 ++++++++++--- .../tests/integration/integration_sandiego.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/urbanaccess/plot.py b/urbanaccess/plot.py index 69dbae3..ab24594 100644 --- a/urbanaccess/plot.py +++ b/urbanaccess/plot.py @@ -13,7 +13,8 @@ def plot_net(nodes, edges, x_col=None, y_col=None, from_col=None, fig_height=6, margin=0.02, edge_color='#999999', edge_linewidth=1, edge_alpha=1, node_color='black', node_size=15, node_alpha=1, - node_edgecolor='none', node_zorder=3, nodes_only=False): + node_edgecolor='none', node_zorder=3, nodes_only=False, + ax=None): """ plot urbanaccess network nodes and edges @@ -59,6 +60,9 @@ def plot_net(nodes, edges, x_col=None, y_col=None, from_col=None, nodes under the edges, 3 will plot nodes on top nodes_only : bool if true only the nodes will plot + ax : matplotlib.axes._subplots.AxesSubplot, optional + matplotlib axes, as given by, for example, plt.subplot. + Use to specify the projection. Returns ------- @@ -117,8 +121,11 @@ def plot_net(nodes, edges, x_col=None, y_col=None, from_col=None, 'in a negative value or 0')) bbox_aspect_ratio = (y_max - y_min) / (x_max - x_min) - fig, ax = plt.subplots(figsize=(fig_height / bbox_aspect_ratio, - fig_height)) + if ax is None: + fig, ax = plt.subplots(figsize=(fig_height / bbox_aspect_ratio, + fig_height)) + else: + fig = ax.figure if nodes_only is False: # TODO: optimize for speed by calculating only for edges that are diff --git a/urbanaccess/tests/integration/integration_sandiego.py b/urbanaccess/tests/integration/integration_sandiego.py index e6459a1..5630dd8 100644 --- a/urbanaccess/tests/integration/integration_sandiego.py +++ b/urbanaccess/tests/integration/integration_sandiego.py @@ -3,7 +3,11 @@ import pandas as pd import matplotlib + matplotlib.use('agg') +import matplotlib.pyplot as plt + +import cartopy.crs as ccrs import urbanaccess @@ -56,6 +60,12 @@ 'schedule_type': 'WD'}, timerange=['07:00:00', '10:00:00']) +# This is the standard map projection for California +teale_albers = ccrs.AlbersEqualArea(false_northing=-4000000.0, false_easting=0, + central_longitude=-120.0, central_latitude=0, + standard_parallels=(34.0, 40.5)) +teale_albers_ax = plt.axes(projection=teale_albers) + urbanaccess.plot.plot_net(nodes=transit_net.transit_nodes, edges=transit_net.transit_edges, bbox=bbox, @@ -68,7 +78,8 @@ node_alpha=1, node_edgecolor='none', node_zorder=3, - nodes_only=False) + nodes_only=False, + ax=teale_albers_ax) print('{} integration test completed successfully. 
Took {:,' '.2f} seconds'.format(name, time.time() - start_time)) From 2791cbd3e676848097d354cb72180331c3547211 Mon Sep 17 00:00:00 2001 From: eli knaap Date: Tue, 15 Sep 2020 21:22:38 -0700 Subject: [PATCH 02/35] add flex in calendar/dates handling --- urbanaccess/gtfs/load.py | 30 ++++++++++++++++++++---------- urbanaccess/gtfs/utils_format.py | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 0facd16..de39908 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -236,8 +236,16 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, os.listdir(os.path.join(gtfsfeed_path, folder)) if textfilename.endswith(".txt")] required_gtfsfiles = ['stops.txt', 'routes.txt', 'trips.txt', - 'stop_times.txt', 'calendar.txt'] - optional_gtfsfiles = ['agency.txt', 'calendar_dates.txt'] + 'stop_times.txt'] + optional_gtfsfiles = ['agency.txt'] + + calendar_files = [i for i in ['calendar.txt', 'calendar_dates.txt'] if i in textfilelist] + if len(calendar_files)==0: + raise ValueError( + 'at least one of `calendar.txt` or `calendar_dates.txt` is required to complete a GTFS dataset but neither was found in ' + 'folder {}'.format( + os.path.join(gtfsfeed_path, folder))) + for required_file in required_gtfsfiles: if required_file not in textfilelist: raise ValueError( @@ -263,10 +271,19 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, stop_times_df = utils_format._read_gtfs_stop_times( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) - if textfile == 'calendar.txt': + + for textfile in calendar_files: + if textfile == 'calendar.txt': # if calendar, use that and set the other as blank calendar_df = utils_format._read_gtfs_calendar( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) + calendar_dates_df = pd.DataFrame() + + else: # otherwise, use calendar_dates and set the other as blank + calendar_dates_df = utils_format._read_gtfs_calendar_dates( + textfile_path=os.path.join(gtfsfeed_path, folder), + textfile=textfile) + calendar_df = pd.DataFrame(columns=['service_id']) for textfile in optional_gtfsfiles: if textfile == 'agency.txt': @@ -276,13 +293,6 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, textfile=textfile) else: agency_df = pd.DataFrame() - if textfile == 'calendar_dates.txt': - if textfile in textfilelist: - calendar_dates_df = utils_format._read_gtfs_calendar_dates( - textfile_path=os.path.join(gtfsfeed_path, folder), - textfile=textfile) - else: - calendar_dates_df = pd.DataFrame() stops_df, routes_df, trips_df, stop_times_df, calendar_df, \ calendar_dates_df = (utils_format diff --git a/urbanaccess/gtfs/utils_format.py b/urbanaccess/gtfs/utils_format.py index 6ed694d..93c388b 100644 --- a/urbanaccess/gtfs/utils_format.py +++ b/urbanaccess/gtfs/utils_format.py @@ -217,7 +217,7 @@ def _read_gtfs_calendar_dates(textfile_path, textfile): df = pd.read_csv(os.path.join(textfile_path, textfile), dtype={'service_id': object}, low_memory=False) if len(df) == 0: - raise ValueError('{} has no records'.format(os.path.join( + log('{} has no records'.format(os.path.join( textfile_path, textfile))) # remove any extra whitespace in column names df.rename(columns=lambda x: x.strip(), inplace=True) From d8f32c8f7bb614aadea985d5f8764c7a608f3418 Mon Sep 17 00:00:00 2001 From: eli knaap Date: Tue, 15 Sep 2020 22:39:03 -0700 Subject: [PATCH 03/35] create cols when necessary and dont check for calendar file in network 
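The fallback here: whichever of calendar.txt / calendar_dates.txt is absent gets stood in by an empty DataFrame that already carries the columns later steps expect, and the transit-network check no longer insists on calendar specifically. A minimal sketch of that idea (hypothetical helper name, column names per the GTFS spec; not the exact code in this patch):

    import pandas as pd

    def ensure_calendar_frames(calendar_df=None, calendar_dates_df=None):
        # Substitute an empty frame, with the expected GTFS columns, for
        # whichever calendar file was not present in the feed folder.
        if calendar_df is None:
            calendar_df = pd.DataFrame(
                columns=['service_id', 'monday', 'tuesday', 'wednesday',
                         'thursday', 'friday', 'saturday', 'sunday',
                         'start_date', 'end_date'])
        if calendar_dates_df is None:
            calendar_dates_df = pd.DataFrame(
                columns=['service_id', 'date', 'exception_type'])
        return calendar_df, calendar_dates_df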
--- urbanaccess/gtfs/load.py | 7 +++++-- urbanaccess/gtfs/network.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index de39908..c5f3cb4 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -277,13 +277,16 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, calendar_df = utils_format._read_gtfs_calendar( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) - calendar_dates_df = pd.DataFrame() + if len(calendar_files)==1: + calendar_dates_df = pd.DataFrame(columns=['service_id', 'dates', 'exception_type']) else: # otherwise, use calendar_dates and set the other as blank calendar_dates_df = utils_format._read_gtfs_calendar_dates( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) - calendar_df = pd.DataFrame(columns=['service_id']) + if len(calendar_files)==1: + calendar_df = pd.DataFrame(columns=['service_id', 'monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday', 'start_date', 'end_date']) for textfile in optional_gtfsfiles: if textfile == 'agency.txt': diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index fa6592b..e84fd9d 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -97,9 +97,9 @@ def create_transit_net(gtfsfeeds_dfs, day, level=lg.WARNING) if gtfsfeeds_dfs is None: raise ValueError('gtfsfeeds_dfs is None') - if gtfsfeeds_dfs.trips.empty or gtfsfeeds_dfs.calendar.empty or \ + if gtfsfeeds_dfs.trips.empty or \ gtfsfeeds_dfs.stop_times.empty or gtfsfeeds_dfs.stops.empty: - raise ValueError('one of the gtfsfeeds_dfs object trips, calendar, ' + raise ValueError('one of the gtfsfeeds_dfs object trips, ' 'stops, or stop_times were found to be empty.') if not isinstance(overwrite_existing_stop_times_int, bool): raise ValueError('overwrite_existing_stop_times_int must be bool') From 72f2c02100cc57fc8e7db4bb7a10dd9d14116708 Mon Sep 17 00:00:00 2001 From: eli knaap Date: Thu, 17 Sep 2020 09:15:45 -0700 Subject: [PATCH 04/35] formatting --- urbanaccess/gtfs/load.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index c5f3cb4..3f50812 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -240,9 +240,10 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, optional_gtfsfiles = ['agency.txt'] calendar_files = [i for i in ['calendar.txt', 'calendar_dates.txt'] if i in textfilelist] - if len(calendar_files)==0: + if len(calendar_files) == 0: raise ValueError( - 'at least one of `calendar.txt` or `calendar_dates.txt` is required to complete a GTFS dataset but neither was found in ' + 'at least one of `calendar.txt` or `calendar_dates.txt` is required to complete a ' + 'GTFS dataset but neither was found in ' 'folder {}'.format( os.path.join(gtfsfeed_path, folder))) @@ -277,16 +278,19 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, calendar_df = utils_format._read_gtfs_calendar( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) - if len(calendar_files)==1: - calendar_dates_df = pd.DataFrame(columns=['service_id', 'dates', 'exception_type']) + if len(calendar_files) == 1: + calendar_dates_df = pd.DataFrame(columns=['service_id', 'dates', + 'exception_type']) else: # otherwise, use calendar_dates and set the other as blank calendar_dates_df = utils_format._read_gtfs_calendar_dates( - 
textfile_path=os.path.join(gtfsfeed_path, folder), - textfile=textfile) - if len(calendar_files)==1: - calendar_df = pd.DataFrame(columns=['service_id', 'monday', 'tuesday', 'wednesday', 'thursday', - 'friday', 'saturday', 'sunday', 'start_date', 'end_date']) + textfile_path=os.path.join(gtfsfeed_path, folder), + textfile=textfile) + if len(calendar_files) == 1: + calendar_df = pd.DataFrame(columns=['service_id', 'monday', + 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday', + 'start_date', 'end_date']) for textfile in optional_gtfsfiles: if textfile == 'agency.txt': From 35a307045af24890efd641578aeb0c4446e41f3c Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:36:52 -0700 Subject: [PATCH 05/35] update readme and docs on support of calendar or calendar_dates --- README.rst | 5 ++--- docs/source/introduction.rst | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 03af455..17c39a9 100644 --- a/README.rst +++ b/README.rst @@ -113,9 +113,8 @@ Minimum GTFS data requirements The minimum `GTFS data types `__ required to use -UrbanAccess are: ``stop_times``, ``stops``, ``routes``, ``calendar``, -and ``trips`` however if there is no ``calendar``, ``calendar_dates`` -can be used as a replacement. +UrbanAccess are: ``stop_times``, ``stops``, ``routes`` and ``trips`` and +one of either ``calendar`` or ``calendar_dates``. Related UDST libraries ---------------------- diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 0f8e0ed..318a3fb 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -39,7 +39,7 @@ A `demo `__ is available a Minimum GTFS data requirements ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The minimum `GTFS data types `__ required to use UrbanAccess are: ``stop_times``, ``stops``, ``routes``, ``calendar``, and ``trips``. If you are using a feed that does not have or utilize a calendar you may use the ``calendar_dates`` file instead of ``calendar`` with the ``calendar_dates_lookup`` parameter :ref:`here `. +The minimum `GTFS data types `__ required to use UrbanAccess are: ``stop_times``, ``stops``, ``routes``, and ``trips`` and either ``calendar`` or ``calendar_dates``. If you are using a feed that does not have or utilize a calendar you may use the ``calendar_dates`` file instead of ``calendar`` with the ``calendar_dates_lookup`` parameter :ref:`here `. 
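For example, a feed that relies on ``calendar_dates`` can be processed by pairing ``day`` (still required) with the ``calendar_dates_lookup`` parameter, roughly as follows (a sketch only; the feed path is hypothetical and ``schedule_type`` is a feed-specific column)::

    import urbanaccess

    loaded_feeds = urbanaccess.gtfs.load.gtfsfeed_to_df(
        gtfsfeed_path='./data/gtfsfeed_text', validation=False, verbose=True)
    transit_net = urbanaccess.gtfs.network.create_transit_net(
        gtfsfeeds_dfs=loaded_feeds,
        day='monday',
        timerange=['07:00:00', '10:00:00'],
        calendar_dates_lookup={'schedule_type': 'WD'})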
License ~~~~~~~~ From e7ae976549e679613f87e0d6e2af83b17312a236 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:38:41 -0700 Subject: [PATCH 06/35] remove generation of blank calendar df in san diego int test --- .../tests/integration/integration_sandiego.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/urbanaccess/tests/integration/integration_sandiego.py b/urbanaccess/tests/integration/integration_sandiego.py index 5630dd8..8f67636 100644 --- a/urbanaccess/tests/integration/integration_sandiego.py +++ b/urbanaccess/tests/integration/integration_sandiego.py @@ -1,6 +1,5 @@ import os import time -import pandas as pd import matplotlib @@ -26,19 +25,6 @@ urbanaccess.gtfsfeeds.download(data_folder=root_path) -# create dummy calendar.txt file because -dummy_txt_file = os.path.join(root_path, - 'gtfsfeed_text', - 'MTS', - 'calendar.txt') - -data = {'service_id': -99, 'monday': 0, 'tuesday': 0, 'wednesday': 0, - 'thursday': 0, 'friday': 0, 'saturday': 0, 'sunday': 0} - -index = range(1) - -pd.DataFrame(data, index).to_csv(dummy_txt_file, index=False) - validation = True verbose = True # small bbox for testing purposes From 22c998eef47cad02f7ff44ec0b8c5dbb6707dec0 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:41:06 -0700 Subject: [PATCH 07/35] make blank df detection print for read of calendar and calendar_dates the same and as warning instead of valueerror --- urbanaccess/gtfs/utils_format.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/urbanaccess/gtfs/utils_format.py b/urbanaccess/gtfs/utils_format.py index 93c388b..65d71ef 100644 --- a/urbanaccess/gtfs/utils_format.py +++ b/urbanaccess/gtfs/utils_format.py @@ -179,13 +179,11 @@ def _read_gtfs_calendar(textfile_path, textfile): df = pd.read_csv(os.path.join(textfile_path, textfile), dtype={'service_id': object}, low_memory=False) - if len(df) == 0: - error_msg = ('{} has no records. This could indicate that this feed ' - 'is using calendar_dates.txt for service_ids. If so, ' - 'make a dummy row in calendar.txt to proceed.') - raise ValueError(error_msg.format(os.path.join(textfile_path, - textfile))) + warning_msg = ('{} has no records. This could indicate that this feed ' + 'is using calendar_dates.txt for service_ids.') + log(warning_msg.format(os.path.join( + textfile_path, textfile)), level=lg.WARNING) columnlist = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'] @@ -217,8 +215,11 @@ def _read_gtfs_calendar_dates(textfile_path, textfile): df = pd.read_csv(os.path.join(textfile_path, textfile), dtype={'service_id': object}, low_memory=False) if len(df) == 0: - log('{} has no records'.format(os.path.join( - textfile_path, textfile))) + warning_msg = ('{} has no records. 
This could indicate that this feed ' + 'is using calendar.txt for service_ids.') + log(warning_msg.format(os.path.join( + textfile_path, textfile)), level=lg.WARNING) + # remove any extra whitespace in column names df.rename(columns=lambda x: x.strip(), inplace=True) return df From 1e42d0d1683de4b112f085c5c4c9cb98c2bf83d1 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:43:43 -0700 Subject: [PATCH 08/35] minor formatting and notes for calendar and calendar_dates df loading section --- urbanaccess/gtfs/load.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 3f50812..a5f3180 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -238,14 +238,17 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, required_gtfsfiles = ['stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt'] optional_gtfsfiles = ['agency.txt'] + # either calendar or calendar_dates is required + calendar_gtfsfiles = ['calendar.txt', 'calendar_dates.txt'] - calendar_files = [i for i in ['calendar.txt', 'calendar_dates.txt'] if i in textfilelist] + calendar_files = [i for i in calendar_gtfsfiles if i in textfilelist] if len(calendar_files) == 0: - raise ValueError( - 'at least one of `calendar.txt` or `calendar_dates.txt` is required to complete a ' - 'GTFS dataset but neither was found in ' - 'folder {}'.format( - os.path.join(gtfsfeed_path, folder))) + error_msg = ( + 'at least one of `calendar.txt` or `calendar_dates.txt` is ' + 'required to complete a GTFS dataset but neither was found in ' + 'folder {}') + raise ValueError(error_msg.format(os.path.join( + gtfsfeed_path, folder))) for required_file in required_gtfsfiles: if required_file not in textfilelist: @@ -274,23 +277,30 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, textfile=textfile) for textfile in calendar_files: - if textfile == 'calendar.txt': # if calendar, use that and set the other as blank + # use both calendar and calendar_dates if they exist, otherwise + # if only one of them exists use the one that exists and set the + # other one that does not exist to a blank df + if textfile == 'calendar.txt': calendar_df = utils_format._read_gtfs_calendar( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) + # if only calendar, set calendar_dates as blank + # with default required columns if len(calendar_files) == 1: - calendar_dates_df = pd.DataFrame(columns=['service_id', 'dates', - 'exception_type']) - - else: # otherwise, use calendar_dates and set the other as blank + calendar_dates_df = pd.DataFrame( + columns=['service_id', 'dates', 'exception_type']) + else: calendar_dates_df = utils_format._read_gtfs_calendar_dates( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) + # if only calendar_files, set calendar as blank + # with default required columns if len(calendar_files) == 1: - calendar_df = pd.DataFrame(columns=['service_id', 'monday', - 'tuesday', 'wednesday', 'thursday', - 'friday', 'saturday', 'sunday', - 'start_date', 'end_date']) + calendar_df = pd.DataFrame( + columns=['service_id', 'monday', + 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday', + 'start_date', 'end_date']) for textfile in optional_gtfsfiles: if textfile == 'agency.txt': From e224c0231e720d6d50a231253e361857b1ab4f25 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:46:13 -0700 Subject: [PATCH 09/35] minor 
formatting and add valueerror is both calendar and calendar_dates dfs are empty --- urbanaccess/gtfs/network.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index e84fd9d..a15658e 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -97,10 +97,15 @@ def create_transit_net(gtfsfeeds_dfs, day, level=lg.WARNING) if gtfsfeeds_dfs is None: raise ValueError('gtfsfeeds_dfs is None') - if gtfsfeeds_dfs.trips.empty or \ - gtfsfeeds_dfs.stop_times.empty or gtfsfeeds_dfs.stops.empty: - raise ValueError('one of the gtfsfeeds_dfs object trips, ' - 'stops, or stop_times were found to be empty.') + error_msg = ('one of the following gtfsfeeds_dfs objects {} were ' + 'found to be empty.') + if gtfsfeeds_dfs.trips.empty or gtfsfeeds_dfs.stop_times.empty or \ + gtfsfeeds_dfs.stops.empty: + error_msg_case_1 = 'trips, stops, or stop_times' + raise ValueError(error_msg.format(error_msg_case_1)) + if gtfsfeeds_dfs.calendar.empty and gtfsfeeds_dfs.calendar_dates.empty: + error_msg_case_2 = 'calendar or calendar_dates' + raise ValueError(error_msg.format(error_msg_case_2)) if not isinstance(overwrite_existing_stop_times_int, bool): raise ValueError('overwrite_existing_stop_times_int must be bool') if not isinstance(use_existing_stop_times_int, bool): From 26ad4f85d8033f4daa231da2f8c860d9da5243e1 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:47:31 -0700 Subject: [PATCH 10/35] fix print for when calendar_dates_lookup does not find match --- urbanaccess/gtfs/network.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index a15658e..f264a2b 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -374,18 +374,19 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, subset_result = input_calendar_dates_df[ input_calendar_dates_df[col_name_key].str.match( text, case=False, na=False)] - feed_id_list = subset_result['unique_feed_id'].unique() - for index, id in enumerate(feed_id_list): - feed_id_list[index] = ' '.join(id.split('_')[:-1]) - - log('Found {:,} records that matched query: column: {} and ' - 'string: {} for GTFS feed(s): {}'.format(len( - subset_result), - col_name_key, - text, - feed_id_list)) - - subset_result_df = subset_result_df.append(subset_result) + if len(subset_result) != 0: + feed_id_list = subset_result['unique_feed_id'].unique() + for index, id in enumerate(feed_id_list): + feed_id_list[index] = ' '.join(id.split('_')[:-1]) + + log('Found {:,} records that matched query: column: {} and ' + 'string: {} for GTFS feed(s): {}'.format(len( + subset_result), + col_name_key, + text, + feed_id_list)) + + subset_result_df = subset_result_df.append(subset_result) subset_result_df.drop_duplicates(inplace=True) subset_result_df = subset_result_df[['unique_service_id']] From 40cdf312e8604646e0f0c0366bcf7e0327cf15d6 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:48:59 -0700 Subject: [PATCH 11/35] formatting --- urbanaccess/gtfs/network.py | 4 ++-- urbanaccess/tests/integration/integration_sandiego.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index f264a2b..f4ad46c 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -379,8 +379,8 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, for 
index, id in enumerate(feed_id_list): feed_id_list[index] = ' '.join(id.split('_')[:-1]) - log('Found {:,} records that matched query: column: {} and ' - 'string: {} for GTFS feed(s): {}'.format(len( + log('Found {:,} records that matched query: column: {} ' + 'and string: {} for GTFS feed(s): {}'.format(len( subset_result), col_name_key, text, diff --git a/urbanaccess/tests/integration/integration_sandiego.py b/urbanaccess/tests/integration/integration_sandiego.py index 8f67636..e3fd242 100644 --- a/urbanaccess/tests/integration/integration_sandiego.py +++ b/urbanaccess/tests/integration/integration_sandiego.py @@ -47,9 +47,10 @@ timerange=['07:00:00', '10:00:00']) # This is the standard map projection for California -teale_albers = ccrs.AlbersEqualArea(false_northing=-4000000.0, false_easting=0, - central_longitude=-120.0, central_latitude=0, - standard_parallels=(34.0, 40.5)) +teale_albers = ccrs.AlbersEqualArea( + false_northing=-4000000.0, false_easting=0, + central_longitude=-120.0, central_latitude=0, + standard_parallels=(34.0, 40.5)) teale_albers_ax = plt.axes(projection=teale_albers) urbanaccess.plot.plot_net(nodes=transit_net.transit_nodes, From 739332a2d36ff9fe2db35f26b7269ffe472790ad Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:53:36 -0700 Subject: [PATCH 12/35] added TODO and removed calendar day param and notation from interpolate_stop_times() to make agnostic to calendar or calendar_dates file usage --- urbanaccess/gtfs/network.py | 24 +++++++++++------------- urbanaccess/tests/test_gtfs_network.py | 6 +++--- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index f4ad46c..c22142e 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -122,6 +122,9 @@ def create_transit_net(gtfsfeeds_dfs, day, if 'direction_id' not in gtfsfeeds_dfs.trips.columns: columns.remove('direction_id') + # TODO: support use case where only calendar_dates is in use: make 'day' + # optional as None but require either day or calendar_dates_lookup + # to exist but both are not required calendar_selected_trips_df = _trip_schedule_selector( input_trips_df=gtfsfeeds_dfs.trips[columns], input_calendar_df=gtfsfeeds_dfs.calendar, @@ -134,8 +137,7 @@ def create_transit_net(gtfsfeeds_dfs, day, is False: gtfsfeeds_dfs.stop_times_int = _interpolate_stop_times( stop_times_df=gtfsfeeds_dfs.stop_times, - calendar_selected_trips_df=calendar_selected_trips_df, - day=day) + calendar_selected_trips_df=calendar_selected_trips_df) gtfsfeeds_dfs.stop_times_int = _time_difference( stop_times_df=gtfsfeeds_dfs.stop_times_int) @@ -434,7 +436,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, return calendar_selected_trips_df -def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df, day): +def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): """ Interpolate missing stop times using a linear interpolator between known stop times @@ -445,10 +447,6 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df, day): stop times DataFrame calendar_selected_trips_df : pandas.DataFrame DataFrame of trips that run on specific day - day : {'friday','monday','saturday','sunday','thursday','tuesday', - 'wednesday'} - day of the week to extract transit schedule from that corresponds - to the day in the GTFS calendar Returns ------- @@ -486,7 +484,7 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df, day): 'unique_trip_id'].unique().tolist() 
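# Aside, illustrative only and not part of this patch: the linear
# interpolation of missing stop times boils down to sorting each trip by
# stop_sequence and interpolating departure_time_sec within the trip.
import numpy as np
import pandas as pd

_demo = pd.DataFrame({
    'unique_trip_id': ['a1_agency'] * 4,
    'stop_sequence': [1, 2, 3, 4],
    'departure_time_sec': [22500.0, np.nan, np.nan, 24000.0]})
_demo = _demo.sort_values(['unique_trip_id', 'stop_sequence'])
_demo['departure_time_sec_interpolate'] = (
    _demo.groupby('unique_trip_id')['departure_time_sec']
         .transform(lambda s: s.interpolate(method='linear')))
# yields 22500.0, 23000.0, 23500.0, 24000.0 for the four stops
# End of aside.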
# select trip ids that match the trips in the # calendar_selected_trips_df -- resulting df will be stop times - # only for trips that run on the service day of interest + # only for trips that run on the service day or dates of interest stop_times_df = stop_times_df[ stop_times_df['unique_trip_id'].isin(uniquetriplist)] @@ -504,10 +502,10 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df, day): level=lg.WARNING) log('Starting departure stop time interpolation...') log( - 'Departure time records missing from trips following {} ' - 'schedule: {:,} ({:.2f} percent of {:,} total ' + 'Departure time records missing from trips following the ' + 'specified schedule: {:,} ({:.2f} percent of {:,} total ' 'records)'.format( - day, missing_stop_times_count, + missing_stop_times_count, (missing_stop_times_count / len(stop_times_df)) * 100, len(stop_times_df['departure_time_sec']))) @@ -516,8 +514,8 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df, day): else: log('There are no departure time records missing from trips ' - 'following {} schedule. There are no records to ' - 'interpolate.'.format(day)) + 'following the specified schedule. There are no records to ' + 'interpolate.') # Find trips with more than one missing time # Note: all trip ids have at least 1 null departure time because the diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 4bb7f71..46accfb 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -82,7 +82,7 @@ def stop_times_interpolated(): def test_interpolator(stop_times, calendar): - df = network._interpolate_stop_times(stop_times, calendar, day='monday') + df = gtfs_network._interpolate_stop_times(stop_times, calendar) # unique_trip_id should be generated assert df.loc[1, 'unique_trip_id'] == 'a_citytrains' @@ -121,7 +121,7 @@ def test_skip_interpolator(stop_times, calendar): stop_times['departure_time_sec'] = series - df = network._interpolate_stop_times(stop_times, calendar, day='monday') + df = gtfs_network._interpolate_stop_times(stop_times, calendar) # everything should be the same, # with one row dropped for calendar day filter @@ -132,7 +132,7 @@ def test_skip_interpolator(stop_times, calendar): def test_edge_reformatter(stop_times_interpolated): - df = network._format_transit_net_edge(stop_times_interpolated) + df = gtfs_network._format_transit_net_edge(stop_times_interpolated) # length of edge df should be 16 assert len(df) == 16 From ada144e089f48103675cfecdf7debe4c0e3c0fc8 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 19:55:20 -0700 Subject: [PATCH 13/35] updated unit tests adding conftest and moving fixtures to there for reuse --- urbanaccess/tests/conftest.py | 799 ++++++++++++++++++++ urbanaccess/tests/test_gtfs_utils_format.py | 678 ----------------- 2 files changed, 799 insertions(+), 678 deletions(-) create mode 100644 urbanaccess/tests/conftest.py diff --git a/urbanaccess/tests/conftest.py b/urbanaccess/tests/conftest.py new file mode 100644 index 0000000..fdf9b0a --- /dev/null +++ b/urbanaccess/tests/conftest.py @@ -0,0 +1,799 @@ +import pytest +import os +import pandas as pd +import numpy as np + + +@pytest.fixture +def agency_feed_1(): + data = { + 'agency_id': 'agency a', + 'agency_name': 'agency a city a', + 'agency_url': 'http://www.agency_a.org', + 'agency_timezone': 'America/Los_Angeles', + 'agency_phone': '(000) 000-0000' + } + index = range(1) + + df = pd.DataFrame(data, index) + return df + + 
+@pytest.fixture +def agency_feed_2(): + data = { + 'agency_id': ['agency b bus', 'agency b rail'], + 'agency_name': ['agency b district 1', 'agency b district 2'], + 'agency_url': ['http://www.agency_b.org', 'http://www.agency_b.org'], + 'agency_timezone': ['America/Los_Angeles', 'America/Los_Angeles'], + 'agency_phone': ['(000) 000-0000', '(000) 000-0000'] + } + index = range(2) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def agency_feed_3(): + data = { + 'agency_id': '', + 'agency_name': 'agency c', + 'agency_url': 'http://www.agency_c.org', + 'agency_timezone': 'America/Los_Angeles', + 'agency_phone': '(000) 000-0000' + } + index = range(1) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def agency_feed_4(): + data = { + 'agency_id': ['agency 1', 'agency 2', 'agency 3'], + 'agency_name': ['agency 1 bus', 'agency 2 rail', 'agency 3 metro'], + 'agency_url': ['http://www.agency_1.org', 'http://www.agency_2.org', + 'http://www.agency_2.org'], + 'agency_timezone': ['America/Los_Angeles', 'America/Los_Angeles', + 'America/Los_Angeles'], + 'agency_phone': ['(000) 000-0000', '(000) 000-0000', '(000) 000-0000'] + } + index = range(3) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def routes_feed_1(): + data = { + 'agency_id': ['agency a'] * 4, + 'route_id': ['10-101', '11-101', '12-101', '13-101'], + 'route_short_name': ['10', '11', 'red', 'blue'], + 'route_long_name': ['ave a local', 'ave a express', 'red line', + 'blue line'], + 'route_type': [3, 3, 1, 1] + } + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def routes_feed_2(): + data = { + 'agency_id': ['agency b bus', 'agency b bus', 'agency b rail', + 'agency b rail'], + 'route_id': ['40-4', '40-4x', 'r-2', 'r-2ext'], + 'route_short_name': ['40', '40', 'red', 'red-ext'], + 'route_long_name': ['ave a local', 'ave a express', 'red line', + 'red line extension'], + 'route_type': [3, 3, 1, 1] + } + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def routes_feed_4(): + data = { + 'agency_id': ['agency 1', 'agency 1', 'agency 2', 'agency 2', + 'agency 3', 'agency 3'], + 'route_id': ['a1x', 'a1', 'a2x', 'a2', 'a3x', 'a3'], + 'route_short_name': ['1x', '1', '2x', '2', '3x', '3'], + 'route_long_name': ['1 express', '1 local', '2 express', + '2 local', '3 express', '3 local'], + 'route_type': [3, 3, 3, 3, 3, 3] + } + + index = range(6) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stops_feed_1(): + data = { + 'stop_id': ['1', '2', '3', '4', '5', '6', + '7', '8', '9'], + 'stop_name': ['ave a', 'ave b', 'ave c', 'ave d', 'ave e', 'ave f', + '1st st', '2nd st', '3rd st'], + 'stop_lat': [37.797484, 37.774963, 37.803664, 37.80787, 37.828415, + 37.844601, 37.664174, 37.591208, 37.905628], + 'stop_lon': [-122.265609, -122.224274, -122.271604, -122.269029, + -122.267227, -122.251793, -122.444116, -122.017867, + -122.067423], + 'location_type': [1, 1, 1, 1, 1, 1, + 2, 2, 2], + 'wheelchair_boarding': [1, 0, 0, 0, 0, 0, + 1, 1, 1] + } + + index = range(9) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stops_feed_2(): + data = { + 'stop_id': ['60', '61', '62', '63', '64', '65', + '66', '67', '68', + '600', '601', '602', '603', '604', '605', '606'], + 'stop_name': ['ave m', 'ave n', 'ave o', 'ave p', 'ave q', 'ave r', + '10th st', '11th st', '12th st', + '121th st', '122th st', '123th st', '124th st', + '125th st', '126th st', '127th st'], + 'stop_lat': 
[38.797484, 38.774963, 38.803664, 38.80787, 38.828415, + 38.844601, 38.664174, 38.591208, 38.905628, + 38.603664, 38.60787, 38.628415, + 38.644601, 38.660000, 38.691208, 38.605628], + 'stop_lon': [-121.265609, -121.224274, -121.271604, -121.269029, + -121.267227, -121.251793, -121.444116, -121.017867, + -121.067423, -122.271604, -122.269029, -122.267227, + -122.251793, -122.444116, -122.017867, -122.067423], + 'location_type': [1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + 'wheelchair_boarding': [1, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + + index = range(16) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stops_feed_4(): + data = { + 'stop_id': ['70', '71', '72', '73', '74', '75', + '76', '77', '78'], + 'stop_name': ['station 1', 'station 2', 'station 3', 'station 4', + 'station 5', 'station 6', + 'station 7', 'station 8', 'station 9'], + 'stop_lat': [20.797484, 20.774963, 20.803664, 20.80787, 20.828415, + 20.844601, 20.664174, 20.591208, 20.905628], + 'stop_lon': [-100.265609, -100.224274, -100.271604, -100.269029, + -100.267227, -100.251793, -100.444116, -100.017867, + -100.067423] + } + + index = range(9) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def trips_feed_1(): + data = { + 'route_id': ['10-101', '10-101', '10-101', '10-101', + '11-101', '11-101', + '12-101', '12-101', + '13-101', '13-101'], + 'trip_id': ['a1', 'a2', 'a3', 'a4', + 'b1', 'b2', + 'c1', 'c2', + 'd1', 'd2'], + 'service_id': ['weekday-1', 'weekday-1', 'weekday-1', 'weekday-1', + 'weekday-2', 'weekday-2', + 'weekday-3', 'weekday-3', + 'weekend-1', 'weekend-1'], + 'direction_id': [1, 0, 1, 0, + 1, 0, + 1, 0, + 1, 0], + 'wheelchair_accessible': [1, 1, 1, 1, + 0, 0, + 0, 0, + 0, 0], + 'bikes_allowed': [1, 1, 1, 1, + 0, 0, + 0, 0, + 0, 0] + } + + index = range(10) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def trips_feed_2(): + data = { + + 'route_id': ['40-4', '40-4', '40-4', '40-4', + '40-4x', '40-4x', + 'r-2', 'r-2', + 'r-2ext', 'r-2ext'], + 'trip_id': ['11', '12', '13', '14', + '21', '22', + '31', '32', + '41', '42'], + 'service_id': ['weekday-1', 'weekday-1', 'weekday-1', 'weekday-1', + 'weekday-2', 'weekday-2', + 'weekday-3', 'weekday-3', + 'weekend-1', 'weekend-1'] + } + + index = range(10) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def trips_feed_4(): + data = { + 'route_id': ['a1x', 'a1x', 'a1x', 'a1x', + 'a1', 'a1', + 'a2x', 'a2x', + 'a2', 'a2', + 'a3x', 'a3x', + 'a3', 'a3'], + 'trip_id': ['a131', 'a132', 'a133', 'a134', + 'a135', 'a136', + 'a237', 'a238', + 'a239', 'a240', + 'a341', 'a342', + 'a343', 'a344'], + 'service_id': ['wk-1', 'wk-1', 'wk-1', 'wk-1', + 'wk-1', 'wk-1', + 'wk-1', 'wk-1', + 'wk-1', 'wk-1', + 'wk-1', 'wk-1', + 'wk-1', 'wk-1'] + } + + index = range(14) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_feed_1(): + data = { + 'service_id': ['weekday-1', + 'weekday-2', + 'weekday-3', + 'weekend-1'], + 'monday': [1, 1, 1, 0], + 'tuesday': [1, 1, 1, 0], + 'wednesday': [1, 1, 1, 0], + 'thursday': [1, 1, 1, 0], + 'friday': [1, 1, 1, 0], + 'saturday': [0, 0, 0, 1], + 'sunday': [0, 0, 0, 1], + 'start_date': [20161224] * 4, + 'end_date': [20170318] * 4} + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_feed_2(): + data = { + 'service_id': ['weekday-1', + 'weekday-2', + 'weekday-3', + 'weekend-1'], + 'monday': [1, 1, 1, 0], + 'tuesday': [1, 1, 1, 0], + 'wednesday': [1, 1, 1, 0], + 'thursday': [1, 1, 
1, 0], + 'friday': [1, 1, 1, 0], + 'saturday': [0, 0, 0, 1], + 'sunday': [0, 0, 0, 1], + 'start_date': [20161224] * 4, + 'end_date': [20170318] * 4} + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_feed_4(): + data = { + 'service_id': ['wk-1'], + 'monday': [1], + 'tuesday': [1], + 'wednesday': [1], + 'thursday': [1], + 'friday': [1], + 'saturday': [0], + 'sunday': [0], + 'start_date': [20161224], + 'end_date': [20170318]} + + index = range(1) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_empty(): + columns = {'service_id', + 'monday', + 'tuesday', + 'wednesday', + 'thursday', + 'friday', + 'saturday', + 'sunday', + 'start_date', + 'end_date'} + + df = pd.DataFrame(columns=columns) + return df + + +@pytest.fixture +def calendar_dates_feed_1(): + data = { + 'service_id': ['weekday-1', + 'weekday-2', + 'weekday-3', + 'weekend-1'], + 'date': [20161224, 20170318, 20160424, 20161230], + 'exception_type': [1, 2, 1, 1]} + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_dates_feed_2(): + data = { + 'service_id': ['weekday-1', + 'weekday-2', + 'weekday-3', + 'weekend-1'], + 'date': [20161224, 20170318, 20160424, 20161230], + 'exception_type': [1, 2, 1, 1], + 'schedule_type': ['WD', 'WD', 'WD', 'SA'] + } + + index = range(4) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def calendar_dates_feed_4(): + data = { + 'service_id': ['wk-1'], + 'date': [20161224], + 'exception_type': [1]} + + index = range(1) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stop_times_feed_1(): + data = { + 'trip_id': ['a1', 'a1', 'a1', 'a1', 'a1', 'a1', + 'a2', 'a2', 'a2', 'a2', 'a2', 'a2', + 'a3', 'a3', 'a3', 'a3', 'a3', 'a3', + 'a4', 'a4', 'a4', 'a4', 'a4', 'a4', + 'b1', 'b1', 'b1', 'b1', 'b1', 'b1', + 'b2', 'b2', 'b2', 'b2', 'b2', 'b2', + 'c1', 'c1', 'c1', 'c1', 'c1', 'c1', + 'c2', 'c2', 'c2', 'c2', 'c2', 'c2', + 'd1', 'd1', 'd1', + 'd2', 'd2', 'd2'], + 'stop_id': ['1', '2', '3', '4', '5', '6', + '6', '5', '4', '3', '2', '1', + '1', '2', '3', '4', '5', '6', + '6', '5', '4', '3', '2', '1', + '1', '2', '3', '4', '5', '6', + '6', '5', '4', '3', '2', '1', + '1', '2', '3', '4', '5', '6', + '6', '5', '4', '3', '2', '1', + '7', '8', '9', + '9', '8', '7'], + 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', + '26:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, + '26:35:00', '26:40:00', + '06:15:00', 
'06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'stop_sequence': [1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, + 1, 2, 3], + 'pickup_type': [0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, + 0, 0, 0], + 'drop_off_type': [0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, + 0, 0, 0] + } + index = range(54) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stop_times_feed_2(): + data = { + 'trip_id': ['11', '11', '11', '11', '11', '11', + '12', '12', '12', '12', '12', '12', + '13', '13', '13', '13', '13', '13', + '14', '14', '14', '14', '14', '14', + '21', '21', '21', '21', '21', '21', + '22', '22', '22', '22', '22', '22', + '31', '31', '31', '31', '31', '31', + '32', '32', '32', '32', '32', '32', + '41', '41', '41', + '42', '42', '42'], + 'stop_id': ['60', '61', '62', '63', '64', '65', + '65', '64', '63', '62', '61', '60', + '60', '61', '62', '63', '64', '65', + '65', '64', '63', '62', '61', '60', + '60', '61', '62', '63', '64', '65', + '65', '64', '63', '62', '61', '60', + '600', '601', '602', '603', '604', '605', + '606', '605', '604', '603', '602', '601', + '66', '67', '68', + '68', '67', '66'], + 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', + '26:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, + '26:35:00', '26:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'stop_sequence': [1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, + 1, 2, 3] + } + index = range(54) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def stop_times_feed_4(): + data = { + 'trip_id': ['a131', 'a131', 'a131', 'a131', 'a131', 'a131', + 'a132', 'a132', 'a132', 'a132', 'a132', 'a132', + 'a133', 'a133', 'a133', 'a133', 'a133', 'a133', + 'a134', 'a134', 'a134', 'a134', 'a134', 'a134', + 'a135', 'a135', 'a135', 'a135', 'a135', 'a135', + 'a136', 'a136', 'a136', 'a136', 
'a136', 'a136', + 'a237', 'a237', 'a237', 'a237', 'a237', 'a237', + 'a238', 'a238', 'a238', 'a238', 'a238', 'a238', + 'a239', 'a239', 'a239', + 'a240', 'a240', 'a240', + 'a341', 'a341', 'a341', + 'a342', 'a342', 'a342', + 'a343', 'a343', 'a343', + 'a344', 'a344', 'a344'], + 'stop_id': ['70', '71', '72', '73', '74', '75', + '75', '74', '73', '72', '71', '70', + '70', '71', '72', '73', '74', '75', + '75', '74', '73', '72', '71', '70', + '70', '71', '72', '73', '74', '75', + '75', '74', '73', '72', '71', '70', + '70', '71', '72', '73', '74', '75', + '75', '74', '73', '72', '71', '70', + '76', '77', '78', + '78', '77', '76', + '76', '77', '78', + '78', '77', '76', + '76', '77', '78', + '78', '77', '76'], + + 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', + '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', + '26:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '08:15:00', '08:20:00', '08:25:00', '08:30:00', + '08:35:00', '08:40:00', + '13:15:00', '13:20:00', '13:25:00', '13:30:00', + '13:35:00', '13:40:00', + '06:15:00', '06:20:00', np.nan, np.nan, + '06:35:00', '06:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '26:15:00', '26:20:00', np.nan, np.nan, + '26:35:00', '26:40:00', + '06:15:00', '06:20:00', '06:25:00', '06:30:00', + np.nan, '06:40:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00', + '06:15:00', '06:20:00', '06:25:00'], + 'stop_sequence': [1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, + 1, 2, 3, + 1, 2, 3, + 1, 2, 3, + 1, 2, 3, + 1, 2, 3, + 1, 2, 3] + } + index = range(66) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture() +def agency_a_feed_on_disk_wo_calendar_dates( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, trips_feed_1, calendar_feed_1): + feed_file_dict = {'agency': agency_feed_1, + 'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'trips': trips_feed_1, + 'calendar': calendar_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_wo_calendar_dates') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path + + +@pytest.fixture() +def agency_a_feed_on_disk_wo_calendar( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, trips_feed_1, calendar_dates_feed_1): + 
feed_file_dict = {'agency': agency_feed_1, + 'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'trips': trips_feed_1, + 'calendar_dates': calendar_dates_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_wo_calendar') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path + + +@pytest.fixture() +def agency_a_feed_on_disk_w_calendar_and_calendar_dates( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, trips_feed_1, calendar_feed_1, calendar_dates_feed_1): + feed_file_dict = {'agency': agency_feed_1, + 'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'trips': trips_feed_1, + 'calendar': calendar_feed_1, + 'calendar_dates': calendar_dates_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_w_both_calendars') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path + + +@pytest.fixture() +def agency_a_feed_on_disk_wo_calendar_and_calendar_dates( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, trips_feed_1): + feed_file_dict = {'agency': agency_feed_1, + 'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'trips': trips_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_wo_both_calendar') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path + + +@pytest.fixture() +def agency_a_feed_on_disk_wo_req_file( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, calendar_feed_1): + feed_file_dict = {'agency': agency_feed_1, + 'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'calendar': calendar_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_wo_req_file') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path + + +@pytest.fixture() +def agency_a_feed_on_disk_wo_agency( + tmpdir, + agency_feed_1, stop_times_feed_1, stops_feed_1, + routes_feed_1, trips_feed_1, calendar_feed_1): + feed_file_dict = {'stop_times': stop_times_feed_1, + 'stops': stops_feed_1, + 'routes': routes_feed_1, + 'trips': trips_feed_1, + 'calendar': calendar_feed_1} + feed_path = os.path.join(tmpdir, 'agency_a_wo_agency') + os.makedirs(feed_path) + print('writing test data to dir: {}'.format(feed_path)) + for feed_file, feed_df in feed_file_dict.items(): + feed_file_name = '{}.txt'.format(feed_file) + feed_df.to_csv(os.path.join(feed_path, feed_file_name), index=False) + return feed_path diff --git a/urbanaccess/tests/test_gtfs_utils_format.py b/urbanaccess/tests/test_gtfs_utils_format.py index 659b872..4857c9f 100644 --- a/urbanaccess/tests/test_gtfs_utils_format.py +++ b/urbanaccess/tests/test_gtfs_utils_format.py @@ -1,689 +1,11 @@ import pytest 
import pandas as pd -import numpy as np import os from re import sub from urbanaccess.gtfs import utils_format -@pytest.fixture -def agency_feed_1(): - data = { - 'agency_id': 'agency a', - 'agency_name': 'agency a city a', - 'agency_url': 'http://www.agency_a.org', - 'agency_timezone': 'America/Los_Angeles', - 'agency_phone': '(000) 000-0000' - } - index = range(1) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def agency_feed_2(): - data = { - 'agency_id': ['agency b bus', 'agency b rail'], - 'agency_name': ['agency b district 1', 'agency b district 2'], - 'agency_url': ['http://www.agency_b.org', 'http://www.agency_b.org'], - 'agency_timezone': ['America/Los_Angeles', 'America/Los_Angeles'], - 'agency_phone': ['(000) 000-0000', '(000) 000-0000'] - } - index = range(2) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def agency_feed_3(): - data = { - 'agency_id': '', - 'agency_name': 'agency c', - 'agency_url': 'http://www.agency_c.org', - 'agency_timezone': 'America/Los_Angeles', - 'agency_phone': '(000) 000-0000' - } - index = range(1) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def agency_feed_4(): - data = { - 'agency_id': ['agency 1', 'agency 2', 'agency 3'], - 'agency_name': ['agency 1 bus', 'agency 2 rail', 'agency 3 metro'], - 'agency_url': ['http://www.agency_1.org', 'http://www.agency_2.org', - 'http://www.agency_2.org'], - 'agency_timezone': ['America/Los_Angeles', 'America/Los_Angeles', - 'America/Los_Angeles'], - 'agency_phone': ['(000) 000-0000', '(000) 000-0000', '(000) 000-0000'] - } - index = range(3) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def routes_feed_1(): - data = { - 'agency_id': ['agency a'] * 4, - 'route_id': ['10-101', '11-101', '12-101', '13-101'], - 'route_short_name': ['10', '11', 'red', 'blue'], - 'route_long_name': ['ave a local', 'ave a express', 'red line', - 'blue line'], - 'route_type': [3, 3, 1, 1] - } - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def routes_feed_2(): - data = { - 'agency_id': ['agency b bus', 'agency b bus', 'agency b rail', - 'agency b rail'], - 'route_id': ['40-4', '40-4x', 'r-2', 'r-2ext'], - 'route_short_name': ['40', '40', 'red', 'red-ext'], - 'route_long_name': ['ave a local', 'ave a express', 'red line', - 'red line extension'], - 'route_type': [3, 3, 1, 1] - } - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def routes_feed_4(): - data = { - 'agency_id': ['agency 1', 'agency 1', 'agency 2', 'agency 2', - 'agency 3', 'agency 3'], - 'route_id': ['a1x', 'a1', 'a2x', 'a2', 'a3x', 'a3'], - 'route_short_name': ['1x', '1', '2x', '2', '3x', '3'], - 'route_long_name': ['1 express', '1 local', '2 express', - '2 local', '3 express', '3 local'], - 'route_type': [3, 3, 3, 3, 3, 3] - } - - index = range(6) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stops_feed_1(): - data = { - 'stop_id': ['1', '2', '3', '4', '5', '6', - '7', '8', '9'], - 'stop_name': ['ave a', 'ave b', 'ave c', 'ave d', 'ave e', 'ave f', - '1st st', '2nd st', '3rd st'], - 'stop_lat': [37.797484, 37.774963, 37.803664, 37.80787, 37.828415, - 37.844601, 37.664174, 37.591208, 37.905628], - 'stop_lon': [-122.265609, -122.224274, -122.271604, -122.269029, - -122.267227, -122.251793, -122.444116, -122.017867, - -122.067423], - 'location_type': [1, 1, 1, 1, 1, 1, - 2, 2, 2], - 'wheelchair_boarding': [1, 0, 0, 0, 0, 0, - 1, 1, 1] - } - - index = range(9) - - df = 
pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stops_feed_2(): - data = { - 'stop_id': ['60', '61', '62', '63', '64', '65', - '66', '67', '68', - '600', '601', '602', '603', '604', '605', '606'], - 'stop_name': ['ave m', 'ave n', 'ave o', 'ave p', 'ave q', 'ave r', - '10th st', '11th st', '12th st', - '121th st', '122th st', '123th st', '124th st', - '125th st', '126th st', '127th st'], - 'stop_lat': [38.797484, 38.774963, 38.803664, 38.80787, 38.828415, - 38.844601, 38.664174, 38.591208, 38.905628, - 38.603664, 38.60787, 38.628415, - 38.644601, 38.660000, 38.691208, 38.605628], - 'stop_lon': [-121.265609, -121.224274, -121.271604, -121.269029, - -121.267227, -121.251793, -121.444116, -121.017867, - -121.067423, -122.271604, -122.269029, -122.267227, - -122.251793, -122.444116, -122.017867, -122.067423], - 'location_type': [1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], - 'wheelchair_boarding': [1, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - } - - index = range(16) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stops_feed_4(): - data = { - 'stop_id': ['70', '71', '72', '73', '74', '75', - '76', '77', '78'], - 'stop_name': ['station 1', 'station 2', 'station 3', 'station 4', - 'station 5', 'station 6', - 'station 7', 'station 8', 'station 9'], - 'stop_lat': [20.797484, 20.774963, 20.803664, 20.80787, 20.828415, - 20.844601, 20.664174, 20.591208, 20.905628], - 'stop_lon': [-100.265609, -100.224274, -100.271604, -100.269029, - -100.267227, -100.251793, -100.444116, -100.017867, - -100.067423] - } - - index = range(9) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def trips_feed_1(): - data = { - 'route_id': ['10-101', '10-101', '10-101', '10-101', - '11-101', '11-101', - '12-101', '12-101', - '13-101', '13-101'], - 'trip_id': ['a1', 'a2', 'a3', 'a4', - 'b1', 'b2', - 'c1', 'c2', - 'd1', 'd2'], - 'service_id': ['weekday-1', 'weekday-1', 'weekday-1', 'weekday-1', - 'weekday-2', 'weekday-2', - 'weekday-3', 'weekday-3', - 'weekend-1', 'weekend-1'], - 'direction_id': [1, 0, 1, 0, - 1, 0, - 1, 0, - 1, 0], - 'wheelchair_accessible': [1, 1, 1, 1, - 0, 0, - 0, 0, - 0, 0], - 'bikes_allowed': [1, 1, 1, 1, - 0, 0, - 0, 0, - 0, 0] - } - - index = range(10) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def trips_feed_2(): - data = { - - 'route_id': ['40-4', '40-4', '40-4', '40-4', - '40-4x', '40-4x', - 'r-2', 'r-2', - 'r-2ext', 'r-2ext'], - 'trip_id': ['11', '12', '13', '14', - '21', '22', - '31', '32', - '41', '42'], - 'service_id': ['weekday-1', 'weekday-1', 'weekday-1', 'weekday-1', - 'weekday-2', 'weekday-2', - 'weekday-3', 'weekday-3', - 'weekend-1', 'weekend-1'] - } - - index = range(10) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def trips_feed_4(): - data = { - 'route_id': ['a1x', 'a1x', 'a1x', 'a1x', - 'a1', 'a1', - 'a2x', 'a2x', - 'a2', 'a2', - 'a3x', 'a3x', - 'a3', 'a3'], - 'trip_id': ['a131', 'a132', 'a133', 'a134', - 'a135', 'a136', - 'a237', 'a238', - 'a239', 'a240', - 'a341', 'a342', - 'a343', 'a344'], - 'service_id': ['wk-1', 'wk-1', 'wk-1', 'wk-1', - 'wk-1', 'wk-1', - 'wk-1', 'wk-1', - 'wk-1', 'wk-1', - 'wk-1', 'wk-1', - 'wk-1', 'wk-1'] - } - - index = range(14) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_feed_1(): - data = { - 'service_id': ['weekday-1', - 'weekday-2', - 'weekday-3', - 'weekend-1'], - 'monday': [1, 1, 1, 0], - 'tuesday': [1, 1, 1, 0], - 'wednesday': [1, 1, 1, 0], - 'thursday': [1, 1, 1, 0], - 'friday': [1, 1, 1, 0], 
- 'saturday': [0, 0, 0, 1], - 'sunday': [0, 0, 0, 1], - 'start_date': [20161224] * 4, - 'end_date': [20170318] * 4} - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_feed_2(): - data = { - 'service_id': ['weekday-1', - 'weekday-2', - 'weekday-3', - 'weekend-1'], - 'monday': [1, 1, 1, 0], - 'tuesday': [1, 1, 1, 0], - 'wednesday': [1, 1, 1, 0], - 'thursday': [1, 1, 1, 0], - 'friday': [1, 1, 1, 0], - 'saturday': [0, 0, 0, 1], - 'sunday': [0, 0, 0, 1], - 'start_date': [20161224] * 4, - 'end_date': [20170318] * 4} - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_feed_4(): - data = { - 'service_id': ['wk-1'], - 'monday': [1], - 'tuesday': [1], - 'wednesday': [1], - 'thursday': [1], - 'friday': [1], - 'saturday': [0], - 'sunday': [0], - 'start_date': [20161224], - 'end_date': [20170318]} - - index = range(1) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_empty(): - columns = {'service_id', - 'monday', - 'tuesday', - 'wednesday', - 'thursday', - 'friday', - 'saturday', - 'sunday', - 'start_date', - 'end_date'} - - df = pd.DataFrame(columns=columns) - return df - - -@pytest.fixture -def calendar_dates_feed_1(): - data = { - 'service_id': ['weekday-1', - 'weekday-2', - 'weekday-3', - 'weekend-1'], - 'date': [20161224, 20170318, 20160424, 20161230], - 'exception_type': [1, 2, 1, 1]} - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_dates_feed_2(): - data = { - 'service_id': ['weekday-1', - 'weekday-2', - 'weekday-3', - 'weekend-1'], - 'date': [20161224, 20170318, 20160424, 20161230], - 'exception_type': [1, 2, 1, 1], - 'schedule_type': ['WD', 'WD', 'WD', 'SA'] - } - - index = range(4) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def calendar_dates_feed_4(): - data = { - 'service_id': ['wk-1'], - 'date': [20161224], - 'exception_type': [1]} - - index = range(1) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stop_times_feed_1(): - data = { - 'trip_id': ['a1', 'a1', 'a1', 'a1', 'a1', 'a1', - 'a2', 'a2', 'a2', 'a2', 'a2', 'a2', - 'a3', 'a3', 'a3', 'a3', 'a3', 'a3', - 'a4', 'a4', 'a4', 'a4', 'a4', 'a4', - 'b1', 'b1', 'b1', 'b1', 'b1', 'b1', - 'b2', 'b2', 'b2', 'b2', 'b2', 'b2', - 'c1', 'c1', 'c1', 'c1', 'c1', 'c1', - 'c2', 'c2', 'c2', 'c2', 'c2', 'c2', - 'd1', 'd1', 'd1', - 'd2', 'd2', 'd2'], - 'stop_id': ['1', '2', '3', '4', '5', '6', - '6', '5', '4', '3', '2', '1', - '1', '2', '3', '4', '5', '6', - '6', '5', '4', '3', '2', '1', - '1', '2', '3', '4', '5', '6', - '6', '5', '4', '3', '2', '1', - '1', '2', '3', '4', '5', '6', - '6', '5', '4', '3', '2', '1', - '7', '8', '9', - '9', '8', '7'], - 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', - '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', 
'06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, - '26:35:00', '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'stop_sequence': [1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, - 1, 2, 3], - 'pickup_type': [0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, - 0, 0, 0], - 'drop_off_type': [0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, - 0, 0, 0] - } - index = range(54) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stop_times_feed_2(): - data = { - 'trip_id': ['11', '11', '11', '11', '11', '11', - '12', '12', '12', '12', '12', '12', - '13', '13', '13', '13', '13', '13', - '14', '14', '14', '14', '14', '14', - '21', '21', '21', '21', '21', '21', - '22', '22', '22', '22', '22', '22', - '31', '31', '31', '31', '31', '31', - '32', '32', '32', '32', '32', '32', - '41', '41', '41', - '42', '42', '42'], - 'stop_id': ['60', '61', '62', '63', '64', '65', - '65', '64', '63', '62', '61', '60', - '60', '61', '62', '63', '64', '65', - '65', '64', '63', '62', '61', '60', - '60', '61', '62', '63', '64', '65', - '65', '64', '63', '62', '61', '60', - '600', '601', '602', '603', '604', '605', - '606', '605', '604', '603', '602', '601', - '66', '67', '68', - '68', '67', '66'], - 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', - '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, - '26:35:00', '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'stop_sequence': [1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, - 1, 2, 3] - } - index = 
range(54) - - df = pd.DataFrame(data, index) - return df - - -@pytest.fixture -def stop_times_feed_4(): - data = { - 'trip_id': ['a131', 'a131', 'a131', 'a131', 'a131', 'a131', - 'a132', 'a132', 'a132', 'a132', 'a132', 'a132', - 'a133', 'a133', 'a133', 'a133', 'a133', 'a133', - 'a134', 'a134', 'a134', 'a134', 'a134', 'a134', - 'a135', 'a135', 'a135', 'a135', 'a135', 'a135', - 'a136', 'a136', 'a136', 'a136', 'a136', 'a136', - 'a237', 'a237', 'a237', 'a237', 'a237', 'a237', - 'a238', 'a238', 'a238', 'a238', 'a238', 'a238', - 'a239', 'a239', 'a239', - 'a240', 'a240', 'a240', - 'a341', 'a341', 'a341', - 'a342', 'a342', 'a342', - 'a343', 'a343', 'a343', - 'a344', 'a344', 'a344'], - 'stop_id': ['70', '71', '72', '73', '74', '75', - '75', '74', '73', '72', '71', '70', - '70', '71', '72', '73', '74', '75', - '75', '74', '73', '72', '71', '70', - '70', '71', '72', '73', '74', '75', - '75', '74', '73', '72', '71', '70', - '70', '71', '72', '73', '74', '75', - '75', '74', '73', '72', '71', '70', - '76', '77', '78', - '78', '77', '76', - '76', '77', '78', - '78', '77', '76', - '76', '77', '78', - '78', '77', '76'], - - 'arrival_time': ['06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, '06:35:00', - '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, '26:35:00', - '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'departure_time': ['06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '08:15:00', '08:20:00', '08:25:00', '08:30:00', - '08:35:00', '08:40:00', - '13:15:00', '13:20:00', '13:25:00', '13:30:00', - '13:35:00', '13:40:00', - '06:15:00', '06:20:00', np.nan, np.nan, - '06:35:00', '06:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '26:15:00', '26:20:00', np.nan, np.nan, - '26:35:00', '26:40:00', - '06:15:00', '06:20:00', '06:25:00', '06:30:00', - np.nan, '06:40:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00', - '06:15:00', '06:20:00', '06:25:00'], - 'stop_sequence': [1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, 4, 5, 6, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3] - } - index = range(66) - - df = pd.DataFrame(data, index) - return df - - @pytest.fixture def folder_feed_1(): return r'/data/gtfs_feeds/agency_a' From a0d01c82d802905159be01360d4e4318c53fa523 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 20:00:21 -0700 Subject: [PATCH 14/35] added unit tests for gtfsfeed_to_df() and create_transit_net() for calendar and calendar_dates cases --- urbanaccess/tests/test_gtfs_load.py | 142 +++++++++++++++++++++++++ urbanaccess/tests/test_gtfs_network.py | 91 +++++++++++++++- 2 files changed, 
232 insertions(+), 1 deletion(-) create mode 100644 urbanaccess/tests/test_gtfs_load.py diff --git a/urbanaccess/tests/test_gtfs_load.py b/urbanaccess/tests/test_gtfs_load.py new file mode 100644 index 0000000..e2b8ad9 --- /dev/null +++ b/urbanaccess/tests/test_gtfs_load.py @@ -0,0 +1,142 @@ +import pytest +import pandas as pd + +import urbanaccess.gtfs.load as gtfs_load +from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df + + +@pytest.fixture +def expected_urbanaccess_gtfs_df_keys(): + expected_keys = ['stops', 'routes', 'trips', 'stop_times', + 'calendar', 'calendar_dates', 'stop_times_int', + 'headways'] + return expected_keys.sort() + + +def test_loadgtfsfeed_to_df_wo_calendar( + agency_a_feed_on_disk_wo_calendar, + expected_urbanaccess_gtfs_df_keys): + feed_dir = agency_a_feed_on_disk_wo_calendar + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + assert isinstance(loaded_feeds, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(loaded_feeds) + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', + 'calendar_dates'] + assert expected_urbanaccess_gtfs_df_keys == list( + urbanaccess_gtfs_df_info.keys()).sort() + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + + +def test_loadgtfsfeed_to_df_wo_calendar_dates( + agency_a_feed_on_disk_wo_calendar_dates, + expected_urbanaccess_gtfs_df_keys): + feed_dir = agency_a_feed_on_disk_wo_calendar_dates + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + assert isinstance(loaded_feeds, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(loaded_feeds) + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', + 'calendar'] + assert expected_urbanaccess_gtfs_df_keys == list( + urbanaccess_gtfs_df_info.keys()).sort() + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + + +def test_loadgtfsfeed_to_df_w_calendar_and_calendar_dates( + agency_a_feed_on_disk_w_calendar_and_calendar_dates, + expected_urbanaccess_gtfs_df_keys): + feed_dir = agency_a_feed_on_disk_w_calendar_and_calendar_dates + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + assert isinstance(loaded_feeds, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(loaded_feeds) + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', + 'calendar', 'calendar_dates'] + assert expected_urbanaccess_gtfs_df_keys == list( + urbanaccess_gtfs_df_info.keys()).sort() + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + + +def test_loadgtfsfeed_to_df_wo_calendar_and_calendar_dates( + agency_a_feed_on_disk_wo_calendar_and_calendar_dates): + feed_dir = agency_a_feed_on_disk_wo_calendar_and_calendar_dates + with pytest.raises(ValueError) as excinfo: + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + 
remove_stops_outsidebbox=False, + append_definitions=False) + expected_error = ( + "at least one of `calendar.txt` or `calendar_dates.txt` is required " + "to complete a GTFS dataset but neither was found in folder") + assert expected_error in str(excinfo.value) + + +def test_loadgtfsfeed_to_df_wo_req_file( + agency_a_feed_on_disk_wo_req_file): + feed_dir = agency_a_feed_on_disk_wo_req_file + with pytest.raises(ValueError) as excinfo: + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + expected_error = ( + "trips.txt is a required GTFS text file and was not found in folder") + assert expected_error in str(excinfo.value) + + +def test_loadgtfsfeed_to_df_wo_agency( + agency_a_feed_on_disk_wo_agency, + expected_urbanaccess_gtfs_df_keys): + feed_dir = agency_a_feed_on_disk_wo_agency + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + assert isinstance(loaded_feeds, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(loaded_feeds) + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', + 'calendar'] + assert expected_urbanaccess_gtfs_df_keys == list( + urbanaccess_gtfs_df_info.keys()).sort() + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 46accfb..01de1c8 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -1,7 +1,31 @@ import pytest import pandas as pd import numpy as np -from urbanaccess.gtfs import network + +import urbanaccess.gtfs.network as gtfs_network +import urbanaccess.gtfs.load as gtfs_load +from urbanaccess.network import urbanaccess_network + + +@pytest.fixture +def expected_urbanaccess_network_keys(): + expected_keys = ['transit_nodes', 'transit_edges', 'net_connector_edges', + 'osm_nodes', 'osm_edges', 'net_nodes', 'net_edges'] + return expected_keys.sort() + + +@pytest.fixture +def gtfs_feed_wo_calendar_dates( + tmpdir, agency_a_feed_on_disk_wo_calendar_dates): + feed_dir = agency_a_feed_on_disk_wo_calendar_dates + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=feed_dir, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + return loaded_feeds @pytest.fixture @@ -81,6 +105,71 @@ def stop_times_interpolated(): return df +def test_create_transit_net_wo_calendar_dates( + tmpdir, gtfs_feed_wo_calendar_dates, + expected_urbanaccess_network_keys): + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=tmpdir, + save_filename=None) + assert isinstance(transit_net, urbanaccess_network) + urbanaccess_network_info = vars(transit_net) + expected_dfs = ['transit_nodes', 'transit_edges'] + assert expected_urbanaccess_network_keys == list( + urbanaccess_network_info.keys()).sort() + for key, value in urbanaccess_network_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert 
value.empty is False + + +def test_create_transit_net_wo_req_file( + tmpdir, gtfs_feed_wo_calendar_dates): + # set trips df to blank df for test + gtfs_feed_wo_calendar_dates.trips = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=tmpdir, + save_filename=None) + expected_error = ( + "one of the following gtfsfeeds_dfs objects trips, stops, " + "or stop_times were found to be empty.") + assert expected_error in str(excinfo.value) + + +def test_create_transit_net_wo_calendar_and_calendar_dates( + tmpdir, gtfs_feed_wo_calendar_dates): + # set calendar_dates and calendar dfs to blank df for test + gtfs_feed_wo_calendar_dates.calendar_dates = pd.DataFrame() + gtfs_feed_wo_calendar_dates.calendar = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=tmpdir, + save_filename=None) + expected_error = ( + "one of the following gtfsfeeds_dfs objects calendar or " + "calendar_dates were found to be empty.") + assert expected_error in str(excinfo.value) + + def test_interpolator(stop_times, calendar): df = gtfs_network._interpolate_stop_times(stop_times, calendar) From f4c8d012d7aa88304f20999dbf03486e8b538d4a Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 24 Sep 2020 20:19:08 -0700 Subject: [PATCH 15/35] update note --- urbanaccess/gtfs/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index a5f3180..96cc286 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -293,7 +293,7 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, calendar_dates_df = utils_format._read_gtfs_calendar_dates( textfile_path=os.path.join(gtfsfeed_path, folder), textfile=textfile) - # if only calendar_files, set calendar as blank + # if only calendar_dates, set calendar as blank # with default required columns if len(calendar_files) == 1: calendar_df = pd.DataFrame( From 88467be023cc393a4bf0116d1347fa309a8b5a51 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 25 Sep 2020 10:14:47 -0700 Subject: [PATCH 16/35] fix unit tests for py27 and 35 tmpdir -> tmpdir.strpath --- urbanaccess/tests/conftest.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/urbanaccess/tests/conftest.py b/urbanaccess/tests/conftest.py index fdf9b0a..5ca8297 100644 --- a/urbanaccess/tests/conftest.py +++ b/urbanaccess/tests/conftest.py @@ -692,7 +692,7 @@ def agency_a_feed_on_disk_wo_calendar_dates( 'routes': routes_feed_1, 'trips': trips_feed_1, 'calendar': calendar_feed_1} - feed_path = os.path.join(tmpdir, 'agency_a_wo_calendar_dates') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_wo_calendar_dates') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): @@ -712,7 +712,7 @@ def agency_a_feed_on_disk_wo_calendar( 'routes': routes_feed_1, 'trips': trips_feed_1, 'calendar_dates': calendar_dates_feed_1} - feed_path = os.path.join(tmpdir, 
'agency_a_wo_calendar') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_wo_calendar') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): @@ -733,7 +733,7 @@ def agency_a_feed_on_disk_w_calendar_and_calendar_dates( 'trips': trips_feed_1, 'calendar': calendar_feed_1, 'calendar_dates': calendar_dates_feed_1} - feed_path = os.path.join(tmpdir, 'agency_a_w_both_calendars') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_w_both_calendars') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): @@ -752,7 +752,7 @@ def agency_a_feed_on_disk_wo_calendar_and_calendar_dates( 'stops': stops_feed_1, 'routes': routes_feed_1, 'trips': trips_feed_1} - feed_path = os.path.join(tmpdir, 'agency_a_wo_both_calendar') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_wo_both_calendar') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): @@ -771,7 +771,7 @@ def agency_a_feed_on_disk_wo_req_file( 'stops': stops_feed_1, 'routes': routes_feed_1, 'calendar': calendar_feed_1} - feed_path = os.path.join(tmpdir, 'agency_a_wo_req_file') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_wo_req_file') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): @@ -790,7 +790,7 @@ def agency_a_feed_on_disk_wo_agency( 'routes': routes_feed_1, 'trips': trips_feed_1, 'calendar': calendar_feed_1} - feed_path = os.path.join(tmpdir, 'agency_a_wo_agency') + feed_path = os.path.join(tmpdir.strpath, 'agency_a_wo_agency') os.makedirs(feed_path) print('writing test data to dir: {}'.format(feed_path)) for feed_file, feed_df in feed_file_dict.items(): From 761e29762a232e3efee87e4033e5fd338b90e8ef Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 21:57:31 -0700 Subject: [PATCH 17/35] gtfsfeeds.download() use user agent header on request, add try block, update msgs --- urbanaccess/gtfsfeeds.py | 95 ++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py index 0eb5373..56c00a8 100644 --- a/urbanaccess/gtfsfeeds.py +++ b/urbanaccess/gtfsfeeds.py @@ -5,7 +5,7 @@ import os import logging as lg import time -from six.moves.urllib.request import urlopen +from six.moves.urllib import request from urbanaccess.utils import log from urbanaccess import config @@ -462,66 +462,77 @@ def download(data_folder=os.path.join(config.settings.data_folder), len(feeds.gtfs_feeds), download_folder)) start_time1 = time.time() + msg_no_connection_w_status = ('Unable to connect. URL at {} returned ' + 'status code {} and no data') + msg_no_connection = 'Unable to connect to: {}. Error: {}' + msg_download_succeed = ('{} GTFS feed downloaded successfully. 
' + 'Took {:,.2f} seconds for {:,.1f}KB') # TODO: add file counter and print number to user for feed_name_key, feed_url_value in feeds.gtfs_feeds.items(): start_time2 = time.time() zipfile_path = ''.join([download_folder, '/', feed_name_key, '.zip']) - if 'http' in feed_url_value: - status_code = urlopen(feed_url_value).getcode() - if status_code == 200: - file = urlopen(feed_url_value) + # add default user-agent header in request to avoid 403 Errors + opener = request.build_opener() + opener.addheaders = [('User-agent', '')] + request.install_opener(opener) - _zipfile_type_check(file=file, - feed_url_value=feed_url_value) - - with open(zipfile_path, "wb") as local_file: - local_file.write(file.read()) - log( - '{} GTFS feed downloaded successfully. Took {:,' - '.2f} seconds for {:,.1f}KB'.format( - feed_name_key, time.time() - start_time2, - os.path.getsize(zipfile_path))) - elif status_code in [429, 504]: - log( - 'URL at {} returned status code {} and no data. ' - 'Re-trying request in {:.2f} seconds.'.format( - feed_url_value, status_code, error_pause_duration), - level=lg.WARNING) - time.sleep(error_pause_duration) - try: - file = urlopen(feed_url_value) + if 'http' in feed_url_value: + try: + status_code = request.urlopen(feed_url_value).getcode() + if status_code == 200: + file = request.urlopen(feed_url_value) _zipfile_type_check(file=file, feed_url_value=feed_url_value) with open(zipfile_path, "wb") as local_file: local_file.write(file.read()) - except Exception: - log('Unable to connect. URL at {} returned status code ' - '{} and no data'.format(feed_url_value, status_code), + log(msg_download_succeed.format( + feed_name_key, time.time() - start_time2, + os.path.getsize(zipfile_path))) + elif status_code in [429, 504]: + msg = ('URL at {} returned status code {} and no data. ' + 'Re-trying request in {:.2f} seconds.') + log(msg.format(feed_url_value, status_code, + error_pause_duration), + level=lg.WARNING) + time.sleep(error_pause_duration) + try: + file = request.urlopen(feed_url_value) + + _zipfile_type_check(file=file, + feed_url_value=feed_url_value) + + with open(zipfile_path, "wb") as local_file: + local_file.write(file.read()) + except Exception: + log(msg_no_connection_w_status.format( + feed_url_value, status_code), + level=lg.ERROR) + else: + log(msg_no_connection_w_status.format( + feed_url_value, status_code), level=lg.ERROR) - else: - log( - 'Unable to connect. URL at {} returned status code {} ' - 'and no data'.format( - feed_url_value, status_code), level=lg.ERROR) + except Exception: + log(msg_no_connection.format( + feed_url_value, traceback.format_exc()), + level=lg.ERROR) else: try: - file = urlopen(feed_url_value) + file = request.urlopen(feed_url_value) _zipfile_type_check(file=file, feed_url_value=feed_url_value) - with open( - ''.join([download_folder, '/', feed_name_key, '.zip']), - "wb") as local_file: + file_path = ''.join( + [download_folder, '/', feed_name_key, '.zip']) + with open(file_path, "wb") as local_file: local_file.write(file.read()) - log( - '{} GTFS feed downloaded successfully. Took {:,' - '.2f} seconds for {:,.1f}KB'.format( - feed_name_key, time.time() - start_time2, - os.path.getsize(zipfile_path))) + log(msg_download_succeed.format( + feed_name_key, time.time() - start_time2, + os.path.getsize(zipfile_path))) except Exception: - log('Unable to connect: {}'.format(traceback.format_exc()), + log(msg_no_connection.format( + feed_url_value, traceback.format_exc()), level=lg.ERROR) log('GTFS feed download completed. 
Took {:,.2f} seconds'.format( From f393b180ed1c5085fc6cad493d9a6b72e1a29591 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 21:58:14 -0700 Subject: [PATCH 18/35] add unit tests for gtfsfeeds.py --- urbanaccess/tests/test_gtfsfeeds.py | 219 ++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 urbanaccess/tests/test_gtfsfeeds.py diff --git a/urbanaccess/tests/test_gtfsfeeds.py b/urbanaccess/tests/test_gtfsfeeds.py new file mode 100644 index 0000000..aea6a9a --- /dev/null +++ b/urbanaccess/tests/test_gtfsfeeds.py @@ -0,0 +1,219 @@ +import pytest +import os +import pandas as pd +import yaml + +from urbanaccess import gtfsfeeds +from urbanaccess.gtfsfeeds import feeds + + +@pytest.fixture +def feed_dict1(): + return { + 'ac transit': + 'http://www.actransit.org/wp-content/uploads/GTFSJune182017B.zip'} + + +@pytest.fixture +def feed_dict2(): + return { + 'Bay Area Rapid Transit': + 'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit' + '/latest.zip'} + + +@pytest.fixture +def feed_dict3(): + return { + 'ac transit': 'http://www.actransit.org/wp-content/uploads' + '/GTFSJune182017B.zip', + 'Bay Area Rapid Transit': + 'http://www.gtfs-data-exchange.com/agency/bay-area-rapid-transit' + '/latest.zip'} + + +@pytest.fixture +def feed_yaml(tmpdir): + yaml_dict = { + 'gtfs_feeds': { + 'ac transit': 'http://www.actransit.org/wp-content/uploads' + '/GTFSJune182017B.zip', + 'Bay Area Rapid Transit': + 'http://www.gtfs-data-exchange.com/agency/bay-area-rapid' + '-transit/latest.zip'}} + + yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml') + with open(yaml_path, 'w') as f: + yaml.dump(yaml_dict, f, default_flow_style=False) + return tmpdir.strpath + + +def test_feed_object(): + assert isinstance(gtfsfeeds.feeds, gtfsfeeds.urbanaccess_gtfsfeeds) + assert isinstance(feeds.to_dict(), dict) + + +def test_add_feed(feed_dict1, feed_dict2): + feeds.add_feed(add_dict=feed_dict1) + assert len(feeds.gtfs_feeds.keys()) == 1 + feeds.add_feed(add_dict=feed_dict2) + assert len(feeds.gtfs_feeds.keys()) == 2 + feed_dict_replace = {'Bay Area Rapid Transit': 'test'} + feeds.add_feed(add_dict=feed_dict_replace, replace=True) + + for key, value in feeds.gtfs_feeds.items(): + if key == 'Bay Area Rapid Transit': + assert value == 'test' + assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds) + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_remove_feed(feed_dict3): + feeds.add_feed(add_dict=feed_dict3) + feeds.remove_feed(del_key='ac transit') + assert len(feeds.gtfs_feeds.keys()) == 1 + assert 'ac transit' not in feeds.gtfs_feeds.keys() + feeds.remove_feed(remove_all=True) + assert len(feeds.gtfs_feeds.keys()) == 0 + assert isinstance(feeds, gtfsfeeds.urbanaccess_gtfsfeeds) + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_to_yaml_feed(tmpdir, feed_dict3): + feeds.add_feed(add_dict=feed_dict3) + feeds.to_yaml(tmpdir.strpath, overwrite=True) + + yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml') + with open(yaml_path, 'r') as f: + yaml_config = yaml.load(f) + assert yaml_config['gtfs_feeds'] == feed_dict3 + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_from_yaml_feed(feed_yaml): + yaml_path = feed_yaml + feeds_from_yaml = feeds.from_yaml(yaml_path, 'gtfsfeeds.yaml') + + assert isinstance(feeds_from_yaml, gtfsfeeds.urbanaccess_gtfsfeeds) + assert len(feeds_from_yaml.gtfs_feeds.keys()) == 2 + + valid_feed = ('http://www.gtfs-data-exchange.com/' 
+ 'agency/bay-area-rapid-transit/latest.zip') + assert feeds_from_yaml.gtfs_feeds['Bay Area Rapid Transit'] == valid_feed + + valid_feed = ('http://www.actransit.org/wp-content/' + 'uploads/GTFSJune182017B.zip') + assert feeds_from_yaml.gtfs_feeds['ac transit'] == valid_feed + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_search_contains_gtfs_data_exchange(): + search_result = gtfsfeeds.search(api='gtfsdataexch', + search_text=['ac transit', 'santa rosa'], + search_field=None, match='contains', + add_feed=False, overwrite_feed=False) + + assert isinstance(search_result, pd.DataFrame) + assert search_result.empty is False + assert len(search_result) == 2 + + col_list = ['dataexchange_url', 'dataexchange_id', 'name'] + for col in col_list: + assert col in search_result.columns + assert search_result[col].isnull().all() == False # noqa + + value_list = ['ac-transit', 'santa-rosa-citybus'] + for value in value_list: + assert value in list(search_result['dataexchange_id']) + + +def test_search_contains_add_feed_gtfs_data_exchange(): + gtfsfeeds.search(api='gtfsdataexch', + search_text='ac transit', + search_field=None, match='contains', + add_feed=True, overwrite_feed=False) + + assert len(feeds.gtfs_feeds.keys()) == 1 + assert 'AC Transit' in feeds.gtfs_feeds.keys() + + # test overwrite feed + gtfsfeeds.search(api='gtfsdataexch', + search_text='Bay Area Rapid Transit', + search_field=None, match='exact', + add_feed=True, overwrite_feed=True) + + assert len(feeds.gtfs_feeds.keys()) == 1 + assert 'Bay Area Rapid Transit' in feeds.gtfs_feeds.keys() + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_search_exact_search_field_gtfs_data_exchange(): + # test search field + search_result = gtfsfeeds.search(api='gtfsdataexch', + search_text='San Francisco Bay Area', + search_field=['area'], match='exact', + add_feed=False, overwrite_feed=False) + assert len(search_result) == 8 + + +def test_download_gtfs_feed_via_feed_object(feed_dict3, tmpdir): + feeds.add_feed(add_dict=feed_dict3) + tmp_path = tmpdir.strpath + gtfsfeeds.download(data_folder=tmp_path) + + filelist = ['ac transit.zip', 'Bay Area Rapid Transit.zip'] + txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt', + 'stops.txt', 'trips.txt'] + zip_path = os.path.join(tmp_path, 'gtfsfeed_zips') + txt_path = os.path.join(tmp_path, 'gtfsfeed_text') + for zipfile in filelist: + assert os.path.exists(os.path.join(zip_path, zipfile)) is True + for folder in filelist: + check_path = os.path.join(txt_path, folder.replace('.zip', '')) + assert os.path.exists(check_path) is True + for txt in txtlist: + check_path = os.path.join( + txt_path, folder.replace('.zip', ''), txt) + assert os.path.exists(check_path) is True + # clear feeds from global memory + feeds.remove_feed(remove_all=True) + + +def test_download_gtfs_feed_via_feed_name_and_dict(tmpdir): + tmp_path = tmpdir.strpath + gtfsfeeds.download( + data_folder=tmp_path, + feed_name='test_agency', + feed_url=('http://www.gtfs-data-exchange.com/' + 'agency/bay-area-rapid-transit/latest.zip'), + feed_dict=None, + error_pause_duration=5, delete_zips=False) + + gtfsfeeds.download( + data_folder=tmp_path, + feed_dict={ + 'test_agency_dict': 'http://www.gtfs-data-exchange.com/agency/' + 'ac-transit/latest.zip'}, + error_pause_duration=5, delete_zips=False) + + filelist = ['test_agency.zip', 'test_agency_dict.zip'] + txtlist = ['calendar.txt', 'routes.txt', 'stop_times.txt', + 'stops.txt', 'trips.txt'] + zip_path = 
os.path.join(tmp_path, 'gtfsfeed_zips') + txt_path = os.path.join(tmp_path, 'gtfsfeed_text') + for zipfile in filelist: + assert os.path.exists(os.path.join(zip_path, zipfile)) is True + for folder in filelist: + check_path = os.path.join(txt_path, folder.replace('.zip', '')) + assert os.path.exists(check_path) is True + for txt in txtlist: + check_path = os.path.join( + txt_path, folder.replace('.zip', ''), txt) + assert os.path.exists(check_path) is True + # clear feeds from global memory + feeds.remove_feed(remove_all=True) From 0beed37cbe4f157352b11e6af5e0f318eb7a9b13 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 22:00:24 -0700 Subject: [PATCH 19/35] update duplicate feed url checks to catch dups correctly --- urbanaccess/gtfsfeeds.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py index 56c00a8..306389d 100644 --- a/urbanaccess/gtfsfeeds.py +++ b/urbanaccess/gtfsfeeds.py @@ -78,9 +78,11 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder, for value in yaml_config['gtfs_feeds'][key]: if not isinstance(value, str): raise ValueError('{} must be a string'.format(value)) - - if (pd.Series( - yaml_config['gtfs_feeds'].values()).value_counts() != 1).all(): + unique_url_count = len( + pd.DataFrame.from_dict(yaml_config['gtfs_feeds'], orient='index')[ + 0].unique()) + url_count = len(yaml_config['gtfs_feeds']) + if unique_url_count != url_count: raise ValueError( 'duplicate values were found when the passed add_dict ' 'dictionary was added to the existing dictionary. Feed URL ' @@ -439,7 +441,7 @@ def download(data_folder=os.path.join(config.settings.data_folder), raise ValueError('{} must be a string'.format(value)) for key, value in feed_dict.items(): - if value in feed_dict.gtfs_feeds.values(): + if value in feeds.gtfs_feeds.values(): raise ValueError( 'duplicate values were found when the passed add_dict ' 'dictionary was added to the existing dictionary. 
Feed ' From ad0d8ea92cc0e5e087222eb16029ed0e53208f7a Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 22:00:52 -0700 Subject: [PATCH 20/35] pycodestyle formatting --- urbanaccess/tests/test_osm_network.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/urbanaccess/tests/test_osm_network.py b/urbanaccess/tests/test_osm_network.py index 10ffad7..cbcf516 100644 --- a/urbanaccess/tests/test_osm_network.py +++ b/urbanaccess/tests/test_osm_network.py @@ -9,11 +9,11 @@ def bbox1(): def test_column_names(bbox1): - nodes, edges = ua_network_from_bbox(bbox=bbox1, network_type='walk', - timeout=180, memory=None, - max_query_area_size=50 * 1000 * 50 * - 1000, - remove_lcn=False) # noqa + nodes, edges = ua_network_from_bbox( + bbox=bbox1, network_type='walk', + timeout=180, memory=None, + max_query_area_size=50 * 1000 * 50 * 1000, + remove_lcn=False) col_list = ['x', 'y', 'id'] for col in col_list: assert col in nodes.columns From 56449e843fc55f54a013966ebc9181a4e1116ed5 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 22:01:31 -0700 Subject: [PATCH 21/35] fix log_console unicode --- urbanaccess/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index e8fbf9e..d5f0420 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -63,7 +63,7 @@ def log(message, level=None, name=None, filename=None): # convert message to ascii for proper console display in windows # terminals - message = unicodedata.normalize('NFKD', unicode(message)).encode( + message = unicodedata.normalize('NFKD', str(message)).encode( 'ascii', errors='replace').decode() print(message) sys.stdout = standard_out From 329949591a622e322716171ad681a0e886a026a3 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 22:23:35 -0700 Subject: [PATCH 22/35] update print --- urbanaccess/gtfsfeeds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py index 306389d..26acb02 100644 --- a/urbanaccess/gtfsfeeds.py +++ b/urbanaccess/gtfsfeeds.py @@ -460,7 +460,7 @@ def download(data_folder=os.path.join(config.settings.data_folder), if not os.path.exists(download_folder): os.makedirs(download_folder) log('{} does not exist. 
Directory was created'.format(download_folder)) - log('{} GTFS feeds will be downloaded here: {}'.format( + log('{:,} GTFS feed(s) will be downloaded here: {}'.format( len(feeds.gtfs_feeds), download_folder)) start_time1 = time.time() From 92e99757d3889735e5004ed78b966f08b7067ad3 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 28 Sep 2020 23:15:23 -0700 Subject: [PATCH 23/35] formatting --- urbanaccess/tests/test_gtfsfeeds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/urbanaccess/tests/test_gtfsfeeds.py b/urbanaccess/tests/test_gtfsfeeds.py index aea6a9a..dacf5be 100644 --- a/urbanaccess/tests/test_gtfsfeeds.py +++ b/urbanaccess/tests/test_gtfsfeeds.py @@ -124,7 +124,7 @@ def test_search_contains_gtfs_data_exchange(): col_list = ['dataexchange_url', 'dataexchange_id', 'name'] for col in col_list: assert col in search_result.columns - assert search_result[col].isnull().all() == False # noqa + assert search_result[col].isnull().all() == False # noqa value_list = ['ac-transit', 'santa-rosa-citybus'] for value in value_list: From 6dc1a1327ffa886d229828478800629c4a15e8e3 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 2 Oct 2020 10:21:36 -0700 Subject: [PATCH 24/35] update paper links --- README.rst | 2 +- docs/source/introduction.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 17c39a9..2391036 100644 --- a/README.rst +++ b/README.rst @@ -51,7 +51,7 @@ Citation and academic literature To cite this tool and for a complete description of the UrbanAccess methodology see the paper below: -`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. `__ +`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. `__ For other related literature see `here `__. diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 318a3fb..2274b91 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -51,11 +51,11 @@ Citation and academic literature To cite this tool and for a complete description of the UrbanAccess methodology see the paper below: -`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. `__ +`Samuel D. Blanchard and Paul Waddell. 2017. "UrbanAccess: Generalized Methodology for Measuring Regional Accessibility with an Integrated Pedestrian and Transit Network." Transportation Research Record: Journal of the Transportation Research Board. No. 2653. pp. 35–44. `__ For a detailed use case of the tool see the following paper: -`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. `__ +`Samuel D. Blanchard and Paul Waddell. 2017. "Assessment of Regional Transit Accessibility in the San Francisco Bay Area of California with UrbanAccess." 
Transportation Research Record: Journal of the Transportation Research Board. No. 2654. pp. 45–54. `__ Reporting bugs ~~~~~~~~~~~~~~~~~~~~~~~~ From 3b7d2bac3295bf6ad45fbebdd6937411176cfa0e Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 08:26:54 -0800 Subject: [PATCH 25/35] add txt_encoding to config with 'utf-8' default --- urbanaccess/config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/urbanaccess/config.py b/urbanaccess/config.py index 421458c..7e405bf 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -16,7 +16,8 @@ def _format_check(settings): """ valid_keys = ['data_folder', 'logs_folder', 'log_file', - 'log_console', 'log_name', 'log_filename', 'gtfs_api'] + 'log_console', 'log_name', 'log_filename', + 'txt_encoding', 'gtfs_api'] for key in settings.keys(): if key not in valid_keys: @@ -49,6 +50,9 @@ class urbanaccess_config(object): name of the logger log_filename : str name of the log file + txt_encoding : str + default encoding to use to read and write GTFS txt files. Must be + a valid encoding recognized by Python codecs. gtfs_api : dict dictionary of the name of the GTFS API service as the key and the GTFS API server root URL as the value to pass to the GTFS loader @@ -61,6 +65,7 @@ def __init__(self, log_console=False, log_name='urbanaccess', log_filename='urbanaccess', + txt_encoding='utf-8', gtfs_api={'gtfsdataexch': ( 'http://www.gtfs-data-exchange.com/' 'api/agencies?format=csv')}): @@ -71,6 +76,7 @@ def __init__(self, self.log_console = log_console self.log_name = log_name self.log_filename = log_filename + self.txt_encoding = txt_encoding self.gtfs_api = gtfs_api @classmethod @@ -110,6 +116,7 @@ def from_yaml(cls, configdir='configs', log_name=yaml_config.get('log_name', 'urbanaccess'), log_filename=yaml_config.get('log_filename', 'urbanaccess'), + txt_encoding=yaml_config.get('txt_encoding', 'utf-8'), gtfs_api=yaml_config.get('gtfs_api', { 'gtfsdataexch': ('http://www.gtfs-data-exchange.com/' @@ -128,6 +135,7 @@ def to_dict(self): 'log_console': self.log_console, 'log_name': self.log_name, 'log_filename': self.log_filename, + 'txt_encoding': self.txt_encoding, 'gtfs_api': self.gtfs_api, } From 2fb690a8d24806d79c499ea7c7fdb1ea1368bb95 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 08:28:25 -0800 Subject: [PATCH 26/35] fix typo and formatting --- urbanaccess/config.py | 6 +++--- urbanaccess/gtfs/load.py | 8 ++++---- urbanaccess/gtfs/network.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/urbanaccess/config.py b/urbanaccess/config.py index 7e405bf..a21a0df 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -21,7 +21,7 @@ def _format_check(settings): for key in settings.keys(): if key not in valid_keys: - raise ValueError('{} not found in list of valid configuation ' + raise ValueError('{} not found in list of valid configuration ' 'keys'.format(key)) if not isinstance(key, str): raise ValueError('{} must be a string'.format(key)) @@ -43,9 +43,9 @@ class urbanaccess_config(object): logs_folder : str location to write log files log_file : bool - if true, save log output to a log file in logs_folder + if True, save log output to a log file in logs_folder log_console : bool - if true, print log output to the console + if True, print log output to the console log_name : str name of the logger log_filename : str diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 96cc286..4161186 100644 --- a/urbanaccess/gtfs/load.py +++ 
b/urbanaccess/gtfs/load.py @@ -20,7 +20,7 @@ def _standardize_txt(csv_rootpath=os.path.join(config.settings.data_folder, Parameters ---------- csv_rootpath : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored Returns @@ -100,9 +100,9 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, Parameters ---------- gtfsfiles_to_use : list - list of gtfs feed txt files to utilize + list of GTFS feed txt files to utilize csv_rootpath : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored Returns @@ -156,7 +156,7 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, Parameters ---------- gtfsfeed_path : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored validation : bool if true, the validation check on stops checking for stops outside diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index c22142e..39b6725 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -58,7 +58,7 @@ def create_transit_net(gtfsfeeds_dfs, day, DataFrame for the same time period stored in the gtfsfeeds_dfs object it will be used instead of re-calculated save_processed_gtfs : bool, optional - if true, all processed gtfs DataFrames will + if true, all processed GTFS DataFrames will be stored to disk in a hdf5 file save_dir : str, optional directory to save the hdf5 file @@ -216,7 +216,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, day in the GTFS calendar calendar_dates_lookup : dict, optional dictionary of the lookup column (key) as a string and corresponding - string (value) a s string or list of strings to use to subset trips + string (value) as string or list of strings to use to subset trips using the calendar_dates DataFrame. Search will be exact. If none, then the calendar_dates DataFrame will not be used to select trips that are not in the calendar DataFrame. Note search will select all From dbb2752d960f5cd7a7ea1c7bd6580e4c90d4ae84 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 08:34:03 -0800 Subject: [PATCH 27/35] add prints _txt_encoder_check() --- urbanaccess/gtfs/load.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 4161186..6a827e6 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -4,6 +4,7 @@ import time import pandas as pd import six +import logging as lg from urbanaccess import config from urbanaccess.utils import log @@ -59,6 +60,7 @@ def _txt_encoder_check(gtfsfiles_to_use, """ # UnicodeDecodeError start_time = time.time() + log('Checking GTFS text file for encoding issues...') folderlist = [foldername for foldername in os.listdir(csv_rootpath) if os.path.isdir(os.path.join(csv_rootpath, foldername))] @@ -74,14 +76,16 @@ def _txt_encoder_check(gtfsfiles_to_use, for textfile in textfilelist: if textfile in gtfsfiles_to_use: # Read from file - file_open = open(os.path.join(csv_rootpath, folder, textfile)) + file_path = os.path.join(csv_rootpath, folder, textfile) + file_open = open(file_path) raw = file_open.read() file_open.close() if raw.startswith(codecs.BOM_UTF8): + msg = 'Correcting encoding issue in: {}...' 
+ log(msg.format(file_path)) raw = raw.replace(codecs.BOM_UTF8, '', 1) # Write to file - file_open = open( - os.path.join(csv_rootpath, folder, textfile), 'w') + file_open = open(file_path, 'w') file_open.write(raw) file_open.close() From 9cf4139753d685688361f21ace473ea8f308bc0a Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 08:39:12 -0800 Subject: [PATCH 28/35] update _txt_header_whitespace_check() to use encoding from config, update for py2 vs py3, and added prints --- urbanaccess/gtfs/load.py | 55 +++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 6a827e6..43b88af 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -115,6 +115,11 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, """ start_time = time.time() + txt_encoding = config.settings.txt_encoding + msg = ('Checking GTFS text file header whitespace... ' + 'Reading files using encoding: {} set in configuration.') + log(msg.format(txt_encoding)) + folderlist = [foldername for foldername in os.listdir(csv_rootpath) if os.path.isdir(os.path.join(csv_rootpath, foldername))] @@ -128,25 +133,41 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, for textfile in textfilelist: if textfile in gtfsfiles_to_use: + file_path = os.path.join(csv_rootpath, folder, textfile) # Read from file - with open(os.path.join(csv_rootpath, folder, textfile)) as f: - lines = f.readlines() - lines[0] = re.sub(r'\s+', '', lines[0]) + '\n' - # Write to file try: - with open(os.path.join(csv_rootpath, folder, textfile), - 'w') as f: - f.writelines(lines) - except Exception: - log('Unable to read {}. Check that file is not currently' - 'being read or is not already in memory as this is ' - 'likely the cause of the error.' - ''.format(os.path.join(csv_rootpath, - folder, textfile))) - log( - 'GTFS text file header whitespace check completed. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + if six.PY2: + with open(file_path) as f: + lines = f.readlines() + else: + # read with default 'utf-8' encoding + with open( + file_path, + encoding=txt_encoding) as f: + lines = f.readlines() + line_wo_whitespace = re.sub(r'\s+', '', lines[0]) + '\n' + # only write the file if there are changes to be made + if lines[0] != line_wo_whitespace: + msg = 'Removing whitespace from header(s) in: {}...' + log(msg.format(file_path)) + lines[0] = line_wo_whitespace + # Write to file + if six.PY2: + with open( + file_path, 'w') as f: + f.writelines(lines) + else: + # write with default 'utf-8' encoding + with open( + file_path, 'w', + encoding=txt_encoding) as f: + f.writelines(lines) + except Exception as e: + msg = 'Unable to process: {}. Exception: {}' + raise Exception(log(msg.format(file_path, e), + level=lg.ERROR)) + log('GTFS text file header whitespace check completed. 
' + 'Took {:,.2f} seconds'.format(time.time() - start_time)) def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, From 827225d271a846bb928e7f479fec18e15dd819c0 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 08:42:25 -0800 Subject: [PATCH 29/35] added unit tests for functions in _standardize_txt() --- urbanaccess/tests/test_gtfs_load.py | 91 +++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/urbanaccess/tests/test_gtfs_load.py b/urbanaccess/tests/test_gtfs_load.py index e2b8ad9..f813f27 100644 --- a/urbanaccess/tests/test_gtfs_load.py +++ b/urbanaccess/tests/test_gtfs_load.py @@ -1,5 +1,10 @@ +# coding=utf-8 import pytest import pandas as pd +import os +import six +import codecs +import sys import urbanaccess.gtfs.load as gtfs_load from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df @@ -13,6 +18,92 @@ def expected_urbanaccess_gtfs_df_keys(): return expected_keys.sort() +@pytest.fixture +def test_txt_files(tmpdir): + # test file that does not need to be fixed + do_not_fix_txt = os.path.join(tmpdir.strpath, 'agency.txt') + data = ['name,text\n', ' Circulação , áéíóúüñ¿¡ \n'] + if six.PY2: + with open(do_not_fix_txt, 'w') as f: + f.writelines(data) + else: + with open(do_not_fix_txt, 'w', encoding='utf-8') as f: + f.writelines(data) + + # test file that does need to be fixed + fix_txt = os.path.join(tmpdir.strpath, 'calendar.txt') + data = [' name , text \n', ' Circulação , áéíóúüñ¿¡ \n'] + if six.PY2: + with open(fix_txt, 'w') as f: + f.writelines(data) + else: + with open(fix_txt, 'w', encoding='utf-8') as f: + f.writelines(data) + + fix_txt_wBOM = os.path.join(tmpdir.strpath, 'calendar_dates.txt') + if six.PY2: + data = [codecs.BOM_UTF8, + ' name , text \n', + ' Circulação , áéíóúüñ¿¡ \n'] + with open(fix_txt_wBOM, 'w') as f: + f.writelines(data) + else: + data = [str(codecs.BOM_UTF8), + ' name , text \n', + ' Circulação , áéíóúüñ¿¡ \n'] + with open(fix_txt_wBOM, 'w', encoding='utf-8') as f: + f.writelines(data) + + return tmpdir.strpath, do_not_fix_txt, fix_txt, fix_txt_wBOM + + +@pytest.fixture +def test_txt_files_to_use(): + gtfsfiles_to_use = ['stops.txt', 'routes.txt', 'trips.txt', + 'stop_times.txt', 'calendar.txt', + 'agency.txt', 'calendar_dates.txt'] + return gtfsfiles_to_use + + +def test_txt_standardization(test_txt_files): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files + + gtfs_load._standardize_txt(csv_rootpath=root_dir) + + df = pd.read_csv(fix_txt) + assert list(df.columns) == list(df.columns.str.strip()) + + df = pd.read_csv(fix_txt_wBOM) + assert list(df.columns) == list(df.columns.str.strip()) + + +def test_txt_header_whitespace_check(test_txt_files, test_txt_files_to_use): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files + + gtfs_load._txt_header_whitespace_check( + gtfsfiles_to_use=test_txt_files_to_use, + csv_rootpath=root_dir) + + # only check 'fix_txt' as 'fix_txt_wBOM' would need to be + # fixed by _txt_encoder_check first + df = pd.read_csv(fix_txt) + assert list(df.columns) == list(df.columns.str.strip()) + + +@pytest.mark.skipif( + sys.version_info >= (3, 0), reason="requires python < 3.0") +def test_txt_encoder_check(test_txt_files, test_txt_files_to_use): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files + + gtfs_load._txt_encoder_check( + gtfsfiles_to_use=test_txt_files_to_use, + csv_rootpath=root_dir) + + with open(fix_txt_wBOM, 'r') as f: + raw = f.read() + assert raw.startswith(codecs.BOM_UTF8) is False + + def 
test_loadgtfsfeed_to_df_wo_calendar( agency_a_feed_on_disk_wo_calendar, expected_urbanaccess_gtfs_df_keys): From 98aa4a743361f74f718da24874d9345a5807b1e1 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 09:51:00 -0800 Subject: [PATCH 30/35] updated doc string --- urbanaccess/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/urbanaccess/config.py b/urbanaccess/config.py index a21a0df..cba8f25 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -51,8 +51,9 @@ class urbanaccess_config(object): log_filename : str name of the log file txt_encoding : str - default encoding to use to read and write GTFS txt files. Must be - a valid encoding recognized by Python codecs. + default text encoding used by the GTFS files, to be passed to + Python's open() function. Must be a valid encoding recognized by + Python codecs. gtfs_api : dict dictionary of the name of the GTFS API service as the key and the GTFS API server root URL as the value to pass to the GTFS loader From acef0acca7bc53ceffb7ad9ad74793d6484ffb42 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Mon, 2 Nov 2020 14:34:21 -0800 Subject: [PATCH 31/35] fix travis build for missing pyepsg dep --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index fbaf9c8..d578087 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,6 +7,7 @@ pycodestyle # testing demo notebook jupyter cartopy # requires conda +pyepsg # building documentation numpydoc From fdcc79c0f782d91056fcf9a9f8cae3202d82ea2b Mon Sep 17 00:00:00 2001 From: Sam Maurer Date: Mon, 9 Nov 2020 11:01:21 -0800 Subject: [PATCH 32/35] Release prep --- CHANGELOG.rst | 10 ++++++++++ docs/source/conf.py | 4 ++-- docs/source/index.rst | 2 +- setup.py | 2 +- urbanaccess/__init__.py | 2 +- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0c2f8f3..50b15f2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,13 @@ +v0.2.2 +====== + +2020/11/09 + +* allows passing matplotlib axes to pandana.plot_net() +* adds flexibility to calendar/date handling +* improves GTFS downloading +* improves text encoding support + v0.2.1 ====== diff --git a/docs/source/conf.py b/docs/source/conf.py index f365eb5..a073f17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -30,8 +30,8 @@ project = u'UrbanAccess' author = u'UrbanSim Inc.' copyright = u'{}, {}'.format(datetime.now().year, author) -version = u'0.2.1' -release = u'0.2.1' +version = u'0.2.2' +release = u'0.2.2' language = None # List of patterns to ignore when looking for source files. diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b6133a..3c6cec9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,7 +3,7 @@ UrbanAccess A tool for computing GTFS transit and OSM pedestrian networks for accessibility analysis. -v0.2.1, released August 28, 2020. +v0.2.2, released November 9, 2020. 
Contents -------- diff --git a/setup.py b/setup.py index 1e75e96..297359e 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name='urbanaccess', - version='0.2.1', + version='0.2.2', license='AGPL', description=description, long_description=long_description, diff --git a/urbanaccess/__init__.py b/urbanaccess/__init__.py index c45aee2..4c015c1 100644 --- a/urbanaccess/__init__.py +++ b/urbanaccess/__init__.py @@ -9,6 +9,6 @@ from .gtfsfeeds import * from .plot import * -__version__ = "0.2.1" +__version__ = "0.2.2" version = __version__ From 46369b862d529a01c90cba5bbbbc5546fa2a2e97 Mon Sep 17 00:00:00 2001 From: Sam Maurer Date: Mon, 9 Nov 2020 11:34:59 -0800 Subject: [PATCH 33/35] More detail for changelog --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 50b15f2..6a72bc2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,8 +4,8 @@ v0.2.2 2020/11/09 * allows passing matplotlib axes to pandana.plot_net() -* adds flexibility to calendar/date handling -* improves GTFS downloading +* adds flexibility to calendar/date handling (calendar_dates.txt now supported) +* improves GTFS downloading (solves issue where requests were rejected due to missing user agent header) * improves text encoding support v0.2.1 From 395da50df06bfd02fe54cd0b82ae3fcb6f3908d6 Mon Sep 17 00:00:00 2001 From: Sam Maurer Date: Mon, 9 Nov 2020 11:38:53 -0800 Subject: [PATCH 34/35] More detail for changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6a72bc2..30d605f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,7 +3,7 @@ v0.2.2 2020/11/09 -* allows passing matplotlib axes to pandana.plot_net() +* allows passing matplotlib axes to pandana.plot.plot_net() * adds flexibility to calendar/date handling (calendar_dates.txt now supported) * improves GTFS downloading (solves issue where requests were rejected due to missing user agent header) * improves text encoding support From d3b2d2975b3655415c7ca46a73f41032e5cfde9a Mon Sep 17 00:00:00 2001 From: Sam Maurer Date: Mon, 9 Nov 2020 11:41:16 -0800 Subject: [PATCH 35/35] More detail for changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 30d605f..22de2f0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,7 +3,7 @@ v0.2.2 2020/11/09 -* allows passing matplotlib axes to pandana.plot.plot_net() +* allows passing matplotlib axes to urbanaccess.plot.plot_net() * adds flexibility to calendar/date handling (calendar_dates.txt now supported) * improves GTFS downloading (solves issue where requests were rejected due to missing user agent header) * improves text encoding support
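
A minimal usage sketch of the txt_encoding setting summarized in the changelog above, assuming the config object's attributes can be assigned directly (it exposes them as plain attributes) and using a purely illustrative feed directory; the call mirrors the gtfsfeed_to_df() arguments exercised in the unit tests earlier in this series, and is not itself part of the patches:

    # illustrative sketch only: 'data/gtfsfeed_text' is a hypothetical feed
    # folder, and the default encoding is 'utf-8'
    import urbanaccess.gtfs.load as gtfs_load
    from urbanaccess import config

    # any codec recognized by Python codecs may be supplied; 'utf-8-sig'
    # would also strip a UTF-8 BOM while reading the GTFS text files
    config.settings.txt_encoding = 'utf-8-sig'

    loaded_feeds = gtfs_load.gtfsfeed_to_df(
        gtfsfeed_path='data/gtfsfeed_text',
        validation=False,
        verbose=True,
        bbox=None,
        remove_stops_outsidebbox=False,
        append_definitions=False)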