Skip to content

Commit

Permalink
Merge pull request #80 from UDST/enhancement/default-txt-encoding-from-config
Browse files Browse the repository at this point in the history

Enhancement/default txt encoding from config
  • Loading branch information
smmaurer authored Nov 5, 2020
2 parents 5628cca + acef0ac commit 3475577
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 30 deletions.
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pycodestyle
# testing demo notebook
jupyter
cartopy # requires conda
pyepsg

# building documentation
numpydoc
Expand Down
17 changes: 13 additions & 4 deletions urbanaccess/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ def _format_check(settings):
"""

valid_keys = ['data_folder', 'logs_folder', 'log_file',
'log_console', 'log_name', 'log_filename', 'gtfs_api']
'log_console', 'log_name', 'log_filename',
'txt_encoding', 'gtfs_api']

for key in settings.keys():
if key not in valid_keys:
raise ValueError('{} not found in list of valid configuation '
raise ValueError('{} not found in list of valid configuration '
'keys'.format(key))
if not isinstance(key, str):
raise ValueError('{} must be a string'.format(key))
Expand All @@ -42,13 +43,17 @@ class urbanaccess_config(object):
logs_folder : str
location to write log files
log_file : bool
if true, save log output to a log file in logs_folder
if True, save log output to a log file in logs_folder
log_console : bool
if true, print log output to the console
if True, print log output to the console
log_name : str
name of the logger
log_filename : str
name of the log file
txt_encoding : str
default text encoding used by the GTFS files, to be passed to
Python's open() function. Must be a valid encoding recognized by
Python codecs.
gtfs_api : dict
dictionary of the name of the GTFS API service as the key and
the GTFS API server root URL as the value to pass to the GTFS loader
Expand All @@ -61,6 +66,7 @@ def __init__(self,
log_console=False,
log_name='urbanaccess',
log_filename='urbanaccess',
txt_encoding='utf-8',
gtfs_api={'gtfsdataexch': (
'http://www.gtfs-data-exchange.com/'
'api/agencies?format=csv')}):
Expand All @@ -71,6 +77,7 @@ def __init__(self,
self.log_console = log_console
self.log_name = log_name
self.log_filename = log_filename
self.txt_encoding = txt_encoding
self.gtfs_api = gtfs_api

@classmethod
Expand Down Expand Up @@ -110,6 +117,7 @@ def from_yaml(cls, configdir='configs',
log_name=yaml_config.get('log_name', 'urbanaccess'),
log_filename=yaml_config.get('log_filename',
'urbanaccess'),
txt_encoding=yaml_config.get('txt_encoding', 'utf-8'),
gtfs_api=yaml_config.get('gtfs_api', {
'gtfsdataexch':
('http://www.gtfs-data-exchange.com/'
Expand All @@ -128,6 +136,7 @@ def to_dict(self):
'log_console': self.log_console,
'log_name': self.log_name,
'log_filename': self.log_filename,
'txt_encoding': self.txt_encoding,
'gtfs_api': self.gtfs_api,
}

Expand Down
73 changes: 49 additions & 24 deletions urbanaccess/gtfs/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import time
import pandas as pd
import six
import logging as lg

from urbanaccess import config
from urbanaccess.utils import log
Expand All @@ -20,7 +21,7 @@ def _standardize_txt(csv_rootpath=os.path.join(config.settings.data_folder,
Parameters
----------
csv_rootpath : str, optional
root path where all gtfs feeds that make up a contiguous metropolitan
root path where all GTFS feeds that make up a contiguous metropolitan
area are stored
Returns
Expand Down Expand Up @@ -59,6 +60,7 @@ def _txt_encoder_check(gtfsfiles_to_use,
"""
# UnicodeDecodeError
start_time = time.time()
log('Checking GTFS text file for encoding issues...')

folderlist = [foldername for foldername in os.listdir(csv_rootpath) if
os.path.isdir(os.path.join(csv_rootpath, foldername))]
Expand All @@ -74,14 +76,16 @@ def _txt_encoder_check(gtfsfiles_to_use,
for textfile in textfilelist:
if textfile in gtfsfiles_to_use:
# Read from file
file_open = open(os.path.join(csv_rootpath, folder, textfile))
file_path = os.path.join(csv_rootpath, folder, textfile)
file_open = open(file_path)
raw = file_open.read()
file_open.close()
if raw.startswith(codecs.BOM_UTF8):
msg = 'Correcting encoding issue in: {}...'
log(msg.format(file_path))
raw = raw.replace(codecs.BOM_UTF8, '', 1)
# Write to file
file_open = open(
os.path.join(csv_rootpath, folder, textfile), 'w')
file_open = open(file_path, 'w')
file_open.write(raw)
file_open.close()

Expand All @@ -100,9 +104,9 @@ def _txt_header_whitespace_check(gtfsfiles_to_use,
Parameters
----------
gtfsfiles_to_use : list
list of gtfs feed txt files to utilize
list of GTFS feed txt files to utilize
csv_rootpath : str, optional
root path where all gtfs feeds that make up a contiguous metropolitan
root path where all GTFS feeds that make up a contiguous metropolitan
area are stored
Returns
Expand All @@ -111,6 +115,11 @@ def _txt_header_whitespace_check(gtfsfiles_to_use,
"""
start_time = time.time()

txt_encoding = config.settings.txt_encoding
msg = ('Checking GTFS text file header whitespace... '
'Reading files using encoding: {} set in configuration.')
log(msg.format(txt_encoding))

folderlist = [foldername for foldername in os.listdir(csv_rootpath) if
os.path.isdir(os.path.join(csv_rootpath, foldername))]

Expand All @@ -124,25 +133,41 @@ def _txt_header_whitespace_check(gtfsfiles_to_use,

for textfile in textfilelist:
if textfile in gtfsfiles_to_use:
file_path = os.path.join(csv_rootpath, folder, textfile)
# Read from file
with open(os.path.join(csv_rootpath, folder, textfile)) as f:
lines = f.readlines()
lines[0] = re.sub(r'\s+', '', lines[0]) + '\n'
# Write to file
try:
with open(os.path.join(csv_rootpath, folder, textfile),
'w') as f:
f.writelines(lines)
except Exception:
log('Unable to read {}. Check that file is not currently'
'being read or is not already in memory as this is '
'likely the cause of the error.'
''.format(os.path.join(csv_rootpath,
folder, textfile)))
log(
'GTFS text file header whitespace check completed. Took {:,'
'.2f} seconds'.format(
time.time() - start_time))
if six.PY2:
with open(file_path) as f:
lines = f.readlines()
else:
# read with default 'utf-8' encoding
with open(
file_path,
encoding=txt_encoding) as f:
lines = f.readlines()
line_wo_whitespace = re.sub(r'\s+', '', lines[0]) + '\n'
# only write the file if there are changes to be made
if lines[0] != line_wo_whitespace:
msg = 'Removing whitespace from header(s) in: {}...'
log(msg.format(file_path))
lines[0] = line_wo_whitespace
# Write to file
if six.PY2:
with open(
file_path, 'w') as f:
f.writelines(lines)
else:
# write with default 'utf-8' encoding
with open(
file_path, 'w',
encoding=txt_encoding) as f:
f.writelines(lines)
except Exception as e:
msg = 'Unable to process: {}. Exception: {}'
raise Exception(log(msg.format(file_path, e),
level=lg.ERROR))
log('GTFS text file header whitespace check completed. '
'Took {:,.2f} seconds'.format(time.time() - start_time))


def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
Expand All @@ -156,7 +181,7 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
Parameters
----------
gtfsfeed_path : str, optional
root path where all gtfs feeds that make up a contiguous metropolitan
root path where all GTFS feeds that make up a contiguous metropolitan
area are stored
validation : bool
if true, the validation check on stops checking for stops outside
Expand Down
4 changes: 2 additions & 2 deletions urbanaccess/gtfs/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def create_transit_net(gtfsfeeds_dfs, day,
DataFrame for the same time period stored in the
gtfsfeeds_dfs object it will be used instead of re-calculated
save_processed_gtfs : bool, optional
if true, all processed gtfs DataFrames will
if true, all processed GTFS DataFrames will
be stored to disk in a hdf5 file
save_dir : str, optional
directory to save the hdf5 file
Expand Down Expand Up @@ -216,7 +216,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df,
day in the GTFS calendar
calendar_dates_lookup : dict, optional
dictionary of the lookup column (key) as a string and corresponding
string (value) a s string or list of strings to use to subset trips
string (value) as string or list of strings to use to subset trips
using the calendar_dates DataFrame. Search will be exact. If none,
then the calendar_dates DataFrame will not be used to select trips
that are not in the calendar DataFrame. Note search will select all
Expand Down
91 changes: 91 additions & 0 deletions urbanaccess/tests/test_gtfs_load.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# coding=utf-8
import pytest
import pandas as pd
import os
import six
import codecs
import sys

import urbanaccess.gtfs.load as gtfs_load
from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df
Expand All @@ -13,6 +18,92 @@ def expected_urbanaccess_gtfs_df_keys():
return expected_keys.sort()


@pytest.fixture
def test_txt_files(tmpdir):
    """Write sample GTFS txt files into a temp dir and return their paths.

    Creates three files: one with clean headers ('agency.txt'), one with
    whitespace-padded headers ('calendar.txt'), and one with a UTF-8
    byte-order mark plus padded headers ('calendar_dates.txt'). Non-ASCII
    data rows exercise the encoding handling in the loader.
    """
    # test file that does not need to be fixed
    do_not_fix_txt = os.path.join(tmpdir.strpath, 'agency.txt')
    data = ['name,text\n', ' Circulação , áéíóúüñ¿¡ \n']
    if six.PY2:
        with open(do_not_fix_txt, 'w') as f:
            f.writelines(data)
    else:
        with open(do_not_fix_txt, 'w', encoding='utf-8') as f:
            f.writelines(data)

    # test file that does need to be fixed
    fix_txt = os.path.join(tmpdir.strpath, 'calendar.txt')
    data = [' name , text \n', ' Circulação , áéíóúüñ¿¡ \n']
    if six.PY2:
        with open(fix_txt, 'w') as f:
            f.writelines(data)
    else:
        with open(fix_txt, 'w', encoding='utf-8') as f:
            f.writelines(data)

    # test file that needs fixing and starts with a UTF-8 BOM
    fix_txt_wBOM = os.path.join(tmpdir.strpath, 'calendar_dates.txt')
    if six.PY2:
        data = [codecs.BOM_UTF8,
                ' name , text \n',
                ' Circulação , áéíóúüñ¿¡ \n']
        with open(fix_txt_wBOM, 'w') as f:
            f.writelines(data)
    else:
        # decode the BOM bytes so a real U+FEFF is encoded at the start of
        # the file; str(codecs.BOM_UTF8) would write the literal repr
        # "b'\xef\xbb\xbf'" as text rather than an actual byte-order mark
        data = [codecs.BOM_UTF8.decode('utf-8'),
                ' name , text \n',
                ' Circulação , áéíóúüñ¿¡ \n']
        with open(fix_txt_wBOM, 'w', encoding='utf-8') as f:
            f.writelines(data)

    return tmpdir.strpath, do_not_fix_txt, fix_txt, fix_txt_wBOM


@pytest.fixture
def test_txt_files_to_use():
    """Names of the GTFS txt files the loader helpers should process."""
    return ['stops.txt', 'routes.txt', 'trips.txt',
            'stop_times.txt', 'calendar.txt',
            'agency.txt', 'calendar_dates.txt']


def test_txt_standardization(test_txt_files):
    """_standardize_txt should leave every header free of padding, so the
    column names pandas reads back equal their stripped forms."""
    root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files

    gtfs_load._standardize_txt(csv_rootpath=root_dir)

    # both the padded file and the BOM-prefixed file must come back clean
    for path in (fix_txt, fix_txt_wBOM):
        cols = pd.read_csv(path).columns
        assert list(cols) == list(cols.str.strip())


def test_txt_header_whitespace_check(test_txt_files, test_txt_files_to_use):
    """_txt_header_whitespace_check alone should strip padded headers."""
    root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files

    gtfs_load._txt_header_whitespace_check(
        gtfsfiles_to_use=test_txt_files_to_use,
        csv_rootpath=root_dir)

    # only check 'fix_txt' as 'fix_txt_wBOM' would need to be
    # fixed by _txt_encoder_check first
    cols = pd.read_csv(fix_txt).columns
    assert list(cols) == list(cols.str.strip())


@pytest.mark.skipif(
    sys.version_info >= (3, 0), reason="requires python < 3.0")
def test_txt_encoder_check(test_txt_files, test_txt_files_to_use):
    """Under Python 2, _txt_encoder_check should remove a leading UTF-8 BOM."""
    root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files

    gtfs_load._txt_encoder_check(
        gtfsfiles_to_use=test_txt_files_to_use,
        csv_rootpath=root_dir)

    with open(fix_txt_wBOM, 'r') as f:
        contents = f.read()
    # BOM must be gone after the check rewrites the file
    assert not contents.startswith(codecs.BOM_UTF8)


def test_loadgtfsfeed_to_df_wo_calendar(
agency_a_feed_on_disk_wo_calendar,
expected_urbanaccess_gtfs_df_keys):
Expand Down

0 comments on commit 3475577

Please sign in to comment.