From 9854509191aa3d5c898f5134c79ce0f70b72f4b6 Mon Sep 17 00:00:00 2001
From: Jeff Newman
Date: Mon, 6 Feb 2023 18:17:46 -0600
Subject: [PATCH 001/419] recode mazs in maz-to-maz table

---
 activitysim/core/los.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/activitysim/core/los.py b/activitysim/core/los.py
index e69ed587f..366f260d6 100644
--- a/activitysim/core/los.py
+++ b/activitysim/core/los.py
@@ -294,6 +294,10 @@ def load_data(self):
 
                 df = pd.read_csv(config.data_file_path(file_name, mandatory=True))
 
+                # recode MAZs if needed
+                df["OMAZ"] = recode_based_on_table(df["OMAZ"], "land_use")
+                df["DMAZ"] = recode_based_on_table(df["DMAZ"], "land_use")
+
                 df["i"] = df.OMAZ * self.maz_ceiling + df.DMAZ
                 df.set_index("i", drop=True, inplace=True, verify_integrity=True)
                 logger.debug(

From b4ac903de608d0e0ca08000494a8df537d9839c4 Mon Sep 17 00:00:00 2001
From: Jeff Newman
Date: Tue, 7 Feb 2023 10:56:06 -0600
Subject: [PATCH 002/419] basic implementation of parquet pipeline

---
 activitysim/core/pipeline.py | 98 ++++++++++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 22 deletions(-)

diff --git a/activitysim/core/pipeline.py b/activitysim/core/pipeline.py
index fe3ceca45..9931fb378 100644
--- a/activitysim/core/pipeline.py
+++ b/activitysim/core/pipeline.py
@@ -4,6 +4,7 @@
 import logging
 import os
 from builtins import map, next, object
+from pathlib import Path
 
 import pandas as pd
 from orca import orca
@@ -37,7 +38,7 @@ class Pipeline(object):
     def __init__(self):
         self.init_state()
 
-    def init_state(self):
+    def init_state(self, pipeline_file_format="parquet"):
 
         # most recent checkpoint
         self.last_checkpoint = {}
@@ -72,7 +73,7 @@ def is_open():
 def is_readonly():
     if is_open():
         store = get_pipeline_store()
-        if store and store._mode == "r":
+        if store and not isinstance(store, Path) and store._mode == "r":
             return True
     return False

@@ -99,7 +100,11 @@ def close_open_files():
 
 def open_pipeline_store(overwrite=False, mode="a"):
     """
-    Open the pipeline checkpoint store
+    Open the pipeline checkpoint store.
+
+    If the pipeline_file_name setting ends in ".h5", then the pandas
+    HDFStore file format is used, otherwise pipeline files are stored
+    as parquet files organized in regular file system directories.
 
     Parameters
     ----------
@@ -125,23 +130,36 @@ def open_pipeline_store(overwrite=False, mode="a"):
         inject.get_injectable("pipeline_file_name")
     )
 
-    if overwrite:
-        try:
-            if os.path.isfile(pipeline_file_path):
-                logger.debug("removing pipeline store: %s" % pipeline_file_path)
-                os.unlink(pipeline_file_path)
-        except Exception as e:
-            print(e)
-            logger.warning("Error removing %s: %s" % (pipeline_file_path, e))
+    if pipeline_file_path.endswith(".h5"):
+
+        if overwrite:
+            try:
+                if os.path.isfile(pipeline_file_path):
+                    logger.debug("removing pipeline store: %s" % pipeline_file_path)
+                    os.unlink(pipeline_file_path)
+            except Exception as e:
+                print(e)
+                logger.warning("Error removing %s: %s" % (pipeline_file_path, e))
 
-    _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode)
+        _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode)
+
+    else:
+        _PIPELINE.pipeline_store = Path(pipeline_file_path)
 
     logger.debug(f"opened pipeline_store {pipeline_file_path}")
 
 
 def get_pipeline_store():
     """
-    Return the open pipeline hdf5 checkpoint store or return None if it not been opened
+    Get the pipeline store.
+
+    If the pipeline filename ends in ".h5" then the legacy HDF5 pipeline
+    is used, otherwise the faster parquet format is used, and the value
+    returned here is just the path to the pipeline directory.
+
+    Returns
+    -------
+    pd.HDFStore or Path
     """
     return _PIPELINE.pipeline_store

@@ -181,7 +199,12 @@ def read_df(table_name, checkpoint_name=None):
     """
 
     store = get_pipeline_store()
-    df = store[pipeline_table_key(table_name, checkpoint_name)]
+    if isinstance(store, Path):
+        df = pd.read_parquet(
+            store.joinpath(table_name, f"{checkpoint_name}.parquet"),
+        )
+    else:
+        df = store[pipeline_table_key(table_name, checkpoint_name)]
 
     return df

@@ -193,7 +216,11 @@ def write_df(df, table_name, checkpoint_name=None):
     We store multiple versions of all simulation tables, for every checkpoint in which they change,
     so we need to know both the table_name and the checkpoint_name to label the saved table
 
-    The only exception is the checkpoints dataframe, which just has a table_name
+    The only exception is the checkpoints dataframe, which just has a table_name,
+    although when using the parquet storage format this file is stored as "None.parquet"
+    to maintain a simple consistent file directory structure.
+
+    If the
 
     Parameters
     ----------
@@ -209,10 +236,28 @@ def write_df(df, table_name, checkpoint_name=None):
     df.columns = df.columns.astype(str)
 
     store = get_pipeline_store()
-
-    store[pipeline_table_key(table_name, checkpoint_name)] = df
-
-    store.flush()
+    if isinstance(store, Path):
+        store.joinpath(table_name).mkdir(parents=True, exist_ok=True)
+        df.to_parquet(store.joinpath(table_name, f"{checkpoint_name}.parquet"))
+    else:
+        complib = config.setting("pipeline_complib", None)
+        if complib is None or len(df.columns) == 0:
+            # tables with no columns can't be compressed successfully, so to
+            # avoid them getting just lost and dropped they are instead written
+            # in fixed format with no compression, which should be just fine
+            # since they have no data anyhow.
+            store.put(
+                pipeline_table_key(table_name, checkpoint_name),
+                df,
+            )
+        else:
+            store.put(
+                pipeline_table_key(table_name, checkpoint_name),
+                df,
+                "table",
+                complib=complib,
+            )
+        store.flush()
 
 
 def rewrap(table_name, df=None):
@@ -615,7 +660,8 @@ def close_pipeline():
 
     close_open_files()
 
-    _PIPELINE.pipeline_store.close()
+    if not isinstance(_PIPELINE.pipeline_store, Path):
+        _PIPELINE.pipeline_store.close()
 
     _PIPELINE.init_state()

@@ -789,12 +835,20 @@ def get_checkpoints():
     store = get_pipeline_store()
 
     if store is not None:
-        df = store[CHECKPOINT_TABLE_NAME]
+        if isinstance(store, Path):
+            df = pd.read_parquet(store.joinpath(CHECKPOINT_TABLE_NAME, "None.parquet"))
+        else:
+            df = store[CHECKPOINT_TABLE_NAME]
     else:
         pipeline_file_path = config.pipeline_file_path(
             orca.get_injectable("pipeline_file_name")
         )
-        df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME)
+        if pipeline_file_path.endswith(".h5"):
+            df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME)
+        else:
+            df = pd.read_parquet(
+                Path(pipeline_file_path).joinpath(CHECKPOINT_TABLE_NAME, "None.parquet")
+            )
 
     # non-table columns first (column order in df is random because created from a dict)
     table_names = [name for name in df.columns.values if name not in NON_TABLE_COLUMNS]

From d9f0157fa4a270c16dad271e19d53db25de6d3e2 Mon Sep 17 00:00:00 2001
From: Jeff Newman
Date: Tue, 7 Feb 2023 14:16:13 -0600
Subject: [PATCH 003/419] fix overrun caused by dtype optimization

---
 activitysim/core/los.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/activitysim/core/los.py b/activitysim/core/los.py
index 366f260d6..377e106e7 100644
--- a/activitysim/core/los.py
+++ b/activitysim/core/los.py
@@ -298,7 +298,9 @@ def load_data(self):
                 df["OMAZ"] = recode_based_on_table(df["OMAZ"], "land_use")
                 df["DMAZ"] = recode_based_on_table(df["DMAZ"], "land_use")
 
-                df["i"] = df.OMAZ * self.maz_ceiling + df.DMAZ
+                df["i"] = df.OMAZ.astype(np.int32) * self.maz_ceiling.astype(
+                    np.int32
+                ) + df.DMAZ.astype(np.int32)
                 df.set_index("i", drop=True, inplace=True, verify_integrity=True)
                 logger.debug(
                     f"loading maz_to_maz table {file_name} with {len(df)} rows"

From 05c951df7fc1bedf9a77987d644d32d3888aeeab Mon Sep 17 00:00:00 2001
From: Jeff Newman
Date: Tue, 7 Feb 2023 16:52:40 -0600
Subject: [PATCH 004/419] pipeline is a real class

---
 activitysim/core/pipeline.py          | 1430 ++++++++++++-------------
 activitysim/core/tracing.py           |   10 +-
 activitysim/core/workflow/__init__.py |  228 ++++
 3 files changed, 938 insertions(+), 730 deletions(-)
 create mode 100644 activitysim/core/workflow/__init__.py

diff --git a/activitysim/core/pipeline.py b/activitysim/core/pipeline.py
index fe3ceca45..d1f202613 100644
--- a/activitysim/core/pipeline.py
+++ b/activitysim/core/pipeline.py
@@ -3,11 +3,12 @@
 import datetime as dt
 import logging
 import os
-from builtins import map, next, object
+from builtins import map, next
 
 import pandas as pd
-from orca import orca
+from pypyr.context import Context
 
+from ..core.workflow import run_named_step
 from .
import config, inject, mem, random, tracing, util from .tracing import print_elapsed_time @@ -33,8 +34,27 @@ NO_CHECKPOINT_PREFIX = "_" -class Pipeline(object): +def split_arg(s, sep, default=""): + """ + split str s in two at first sep, returning empty string as second result if no sep + """ + r = s.split(sep, 2) + r = list(map(str.strip, r)) + + arg = r[0] + + if len(r) == 1: + val = default + else: + val = r[1] + val = {"true": True, "false": False}.get(val.lower(), val) + + return arg, val + + +class Pipeline: def __init__(self): + self.context = Context() self.init_state() def init_state(self): @@ -53,926 +73,882 @@ def init_state(self): self.pipeline_store = None - self.is_open = False + self._is_open = False - tracing.initialize_traceable_tables() + self.context.update(tracing.initialize_traceable_tables()) - def rng(self): + self._TABLES = set() + def rng(self): return self._rng + @property + def is_open(self): + return self._is_open -_PIPELINE = Pipeline() - + @is_open.setter + def is_open(self, x): + self._is_open = bool(x) -def is_open(): - return _PIPELINE.is_open + def is_readonly(self): + if self.is_open: + store = self.get_pipeline_store() + if store and store._mode == "r": + return True + return False + def pipeline_table_key(self, table_name, checkpoint_name): + if checkpoint_name: + key = f"{table_name}/{checkpoint_name}" + else: + key = f"/{table_name}" + return key + + def close_on_exit(self, file, name): + assert name not in self.open_files + self.open_files[name] = file + + def close_open_files(self): + for name, file in self.open_files.items(): + print("Closing %s" % name) + file.close() + self.open_files.clear() + + def open_pipeline_store(self, overwrite=False, mode="a"): + """ + Open the pipeline checkpoint store + + Parameters + ---------- + overwrite : bool + delete file before opening (unless resuming) + mode : {'a', 'w', 'r', 'r+'}, default 'a' + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. 
+ """ + + if self.pipeline_store is not None: + raise RuntimeError("Pipeline store is already open!") -def is_readonly(): - if is_open(): - store = get_pipeline_store() - if store and store._mode == "r": - return True - return False + pipeline_file_path = config.pipeline_file_path( + inject.get_injectable("pipeline_file_name") + ) + if overwrite: + try: + if os.path.isfile(pipeline_file_path): + logger.debug("removing pipeline store: %s" % pipeline_file_path) + os.unlink(pipeline_file_path) + except Exception as e: + print(e) + logger.warning("Error removing %s: %s" % (pipeline_file_path, e)) -def pipeline_table_key(table_name, checkpoint_name): - if checkpoint_name: - key = f"{table_name}/{checkpoint_name}" - else: - key = f"/{table_name}" - return key + self.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode) + logger.debug(f"opened pipeline_store {pipeline_file_path}") -def close_on_exit(file, name): - assert name not in _PIPELINE.open_files - _PIPELINE.open_files[name] = file + def get_pipeline_store(self): + """ + Return the open pipeline hdf5 checkpoint store or return None if it not been opened + """ + return self.pipeline_store + def get_rn_generator(self): + """ + Return the singleton random number object -def close_open_files(): - for name, file in _PIPELINE.open_files.items(): - print("Closing %s" % name) - file.close() - _PIPELINE.open_files.clear() + Returns + ------- + activitysim.random.Random + """ + return self.rng() + def read_df(self, table_name, checkpoint_name=None): + """ + Read a pandas dataframe from the pipeline store. -def open_pipeline_store(overwrite=False, mode="a"): - """ - Open the pipeline checkpoint store - - Parameters - ---------- - overwrite : bool - delete file before opening (unless resuming) - mode : {'a', 'w', 'r', 'r+'}, default 'a' - ``'r'`` - Read-only; no data can be modified. - ``'w'`` - Write; a new file is created (an existing file with the same - name would be deleted). - ``'a'`` - Append; an existing file is opened for reading and writing, - and if the file does not exist it is created. - ``'r+'`` - It is similar to ``'a'``, but the file must already exist. - """ + We store multiple versions of all simulation tables, for every checkpoint in which they change, + so we need to know both the table_name and the checkpoint_name of hte desired table. 
- if _PIPELINE.pipeline_store is not None: - raise RuntimeError("Pipeline store is already open!") + The only exception is the checkpoints dataframe, which just has a table_name - pipeline_file_path = config.pipeline_file_path( - inject.get_injectable("pipeline_file_name") - ) + An error will be raised by HDFStore if the table is not found - if overwrite: - try: - if os.path.isfile(pipeline_file_path): - logger.debug("removing pipeline store: %s" % pipeline_file_path) - os.unlink(pipeline_file_path) - except Exception as e: - print(e) - logger.warning("Error removing %s: %s" % (pipeline_file_path, e)) + Parameters + ---------- + table_name : str + checkpoint_name : str - _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode) + Returns + ------- + df : pandas.DataFrame + the dataframe read from the store - logger.debug(f"opened pipeline_store {pipeline_file_path}") + """ + store = self.get_pipeline_store() + df = store[self.pipeline_table_key(table_name, checkpoint_name)] -def get_pipeline_store(): - """ - Return the open pipeline hdf5 checkpoint store or return None if it not been opened - """ - return _PIPELINE.pipeline_store + return df + def write_df(self, df, table_name, checkpoint_name=None): + """ + Write a pandas dataframe to the pipeline store. -def get_rn_generator(): - """ - Return the singleton random number object + We store multiple versions of all simulation tables, for every checkpoint in which they change, + so we need to know both the table_name and the checkpoint_name to label the saved table - Returns - ------- - activitysim.random.Random - """ - return _PIPELINE.rng() + The only exception is the checkpoints dataframe, which just has a table_name + Parameters + ---------- + df : pandas.DataFrame + dataframe to store + table_name : str + also conventionally the injected table name + checkpoint_name : str + the checkpoint at which the table was created/modified + """ -def read_df(table_name, checkpoint_name=None): - """ - Read a pandas dataframe from the pipeline store. - - We store multiple versions of all simulation tables, for every checkpoint in which they change, - so we need to know both the table_name and the checkpoint_name of hte desired table. - - The only exception is the checkpoints dataframe, which just has a table_name - - An error will be raised by HDFStore if the table is not found - - Parameters - ---------- - table_name : str - checkpoint_name : str + # coerce column names to str as unicode names will cause PyTables to pickle them + df.columns = df.columns.astype(str) - Returns - ------- - df : pandas.DataFrame - the dataframe read from the store + store = self.get_pipeline_store() - """ + store[self.pipeline_table_key(table_name, checkpoint_name)] = df - store = get_pipeline_store() - df = store[pipeline_table_key(table_name, checkpoint_name)] + store.flush() - return df + def add_table(self, name, content): + self._TABLES.add(name) + self.context.update({name: content}) + def is_table(self, name): + return name in self._TABLES -def write_df(df, table_name, checkpoint_name=None): - """ - Write a pandas dataframe to the pipeline store. 
+ def rewrap(self, table_name, df=None): + """ + Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table - We store multiple versions of all simulation tables, for every checkpoint in which they change, - so we need to know both the table_name and the checkpoint_name to label the saved table + if df is None, then get the dataframe from orca (table_name should be registered, or + an error will be thrown) which may involve evaluating added columns, etc. - The only exception is the checkpoints dataframe, which just has a table_name + If the orca table already exists, deregister it along with any associated columns before + re-registering it. - Parameters - ---------- - df : pandas.DataFrame - dataframe to store - table_name : str - also conventionally the injected table name - checkpoint_name : str - the checkpoint at which the table was created/modified - """ + The net result is that the dataframe is a registered orca DataFrameWrapper table with no + computed or added columns. - # coerce column names to str as unicode names will cause PyTables to pickle them - df.columns = df.columns.astype(str) + Parameters + ---------- + table_name + df - store = get_pipeline_store() + Returns + ------- + the underlying df of the rewrapped table + """ - store[pipeline_table_key(table_name, checkpoint_name)] = df + logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None))) - store.flush() + if self.is_table(table_name): + if df is None: + # # logger.debug("rewrap - orca.get_table(%s)" % (table_name,)) + # t = orca.get_table(table_name) + # df = t.to_frame() + df = self.context.get(table_name) + else: + # logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name,)) + # don't trigger function call of TableFuncWrapper + # t = orca.get_raw_table(table_name) + df = self.context.get(table_name) -def rewrap(table_name, df=None): - """ - Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table - - if df is None, then get the dataframe from orca (table_name should be registered, or - an error will be thrown) which may involve evaluating added columns, etc. - - If the orca table already exists, deregister it along with any associated columns before - re-registering it. - - The net result is that the dataframe is a registered orca DataFrameWrapper table with no - computed or added columns. - - Parameters - ---------- - table_name - df - - Returns - ------- - the underlying df of the rewrapped table - """ - - logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None))) - - if orca.is_table(table_name): + assert df is not None - if df is None: - # logger.debug("rewrap - orca.get_table(%s)" % (table_name,)) - t = orca.get_table(table_name) - df = t.to_frame() - else: - # logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name,)) - # don't trigger function call of TableFuncWrapper - t = orca.get_raw_table(table_name) + self.add_table(table_name, df) - t.clear_cached() + return df - for column_name in orca.list_columns_for_table(table_name): - # logger.debug("pop %s.%s: %s" % (table_name, column_name, t.column_type(column_name))) - # fixme - orca._COLUMNS.pop((table_name, column_name), None) + def add_checkpoint(self, checkpoint_name): + """ + Create a new checkpoint with specified name, write all data required to restore the simulation + to its current state. 
- # remove from orca's table list - orca._TABLES.pop(table_name, None) + Detect any changed tables , re-wrap them and write the current version to the pipeline store. + Write the current state of the random number generator. - assert df is not None + Parameters + ---------- + checkpoint_name : str + """ + timestamp = dt.datetime.now() - orca.add_table(table_name, df) + logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp)) - return df + for table_name in self.registered_tables(): + # if we have not already checkpointed it or it has changed + # FIXME - this won't detect if the orca table was modified + if ( + table_name not in self.last_checkpoint + or table_name in self.replaced_tables + ): + df = self.get_table(table_name).to_frame() + else: + continue -def add_checkpoint(checkpoint_name): - """ - Create a new checkpoint with specified name, write all data required to restore the simulation - to its current state. + logger.debug( + "add_checkpoint '%s' table '%s' %s" + % (checkpoint_name, table_name, util.df_size(df)) + ) + self.write_df(df, table_name, checkpoint_name) - Detect any changed tables , re-wrap them and write the current version to the pipeline store. - Write the current state of the random number generator. + # remember which checkpoint it was last written + self.last_checkpoint[table_name] = checkpoint_name - Parameters - ---------- - checkpoint_name : str - """ - timestamp = dt.datetime.now() - - logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp)) - - for table_name in registered_tables(): - - # if we have not already checkpointed it or it has changed - # FIXME - this won't detect if the orca table was modified - if len(orca.list_columns_for_table(table_name)): - # rewrap the changed orca table as a unitary DataFrame-backed DataFrameWrapper table - df = rewrap(table_name) - elif ( - table_name not in _PIPELINE.last_checkpoint - or table_name in _PIPELINE.replaced_tables - ): - df = orca.get_table(table_name).to_frame() - else: - continue + self.replaced_tables.clear() - logger.debug( - "add_checkpoint '%s' table '%s' %s" - % (checkpoint_name, table_name, util.df_size(df)) - ) - write_df(df, table_name, checkpoint_name) + self.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name + self.last_checkpoint[TIMESTAMP] = timestamp - # remember which checkpoint it was last written - _PIPELINE.last_checkpoint[table_name] = checkpoint_name + # append to the array of checkpoint history + self.checkpoints.append(self.last_checkpoint.copy()) - _PIPELINE.replaced_tables.clear() + # create a pandas dataframe of the checkpoint history, one row per checkpoint + checkpoints = pd.DataFrame(self.checkpoints) - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name - _PIPELINE.last_checkpoint[TIMESTAMP] = timestamp + # convert empty values to str so PyTables doesn't pickle object types + for c in checkpoints.columns: + checkpoints[c] = checkpoints[c].fillna("") - # append to the array of checkpoint history - _PIPELINE.checkpoints.append(_PIPELINE.last_checkpoint.copy()) + # write it to the store, overwriting any previous version (no way to simply extend) + self.write_df(checkpoints, CHECKPOINT_TABLE_NAME) - # create a pandas dataframe of the checkpoint history, one row per checkpoint - checkpoints = pd.DataFrame(_PIPELINE.checkpoints) + def registered_tables(self): + """ + Return a list of the names of all currently registered dataframe tables + """ + return [ + name + for name in self._TABLES + if isinstance(self.context.get(name, None), 
(pd.DataFrame,)) + ] - # convert empty values to str so PyTables doesn't pickle object types - for c in checkpoints.columns: - checkpoints[c] = checkpoints[c].fillna("") + def checkpointed_tables(self): + """ + Return a list of the names of all checkpointed tables + """ - # write it to the store, overwriting any previous version (no way to simply extend) - write_df(checkpoints, CHECKPOINT_TABLE_NAME) + return [ + name + for name, checkpoint_name in self.last_checkpoint.items() + if checkpoint_name and name not in NON_TABLE_COLUMNS + ] + def load_checkpoint(self, checkpoint_name): + """ + Load dataframes and restore random number channel state from pipeline hdf5 file. + This restores the pipeline state that existed at the specified checkpoint in a prior simulation. + This allows us to resume the simulation after the specified checkpoint -def registered_tables(): - """ - Return a list of the names of all currently registered dataframe tables - """ - return [name for name in orca.list_tables() if orca.table_type(name) == "dataframe"] + Parameters + ---------- + checkpoint_name : str + model_name of checkpoint to load (resume_after argument to open_pipeline) + """ + logger.info("load_checkpoint %s" % (checkpoint_name)) -def checkpointed_tables(): - """ - Return a list of the names of all checkpointed tables - """ + checkpoints = self.read_df(CHECKPOINT_TABLE_NAME) - return [ - name - for name, checkpoint_name in _PIPELINE.last_checkpoint.items() - if checkpoint_name and name not in NON_TABLE_COLUMNS - ] + if checkpoint_name == LAST_CHECKPOINT: + checkpoint_name = checkpoints[CHECKPOINT_NAME].iloc[-1] + logger.info("loading checkpoint '%s'" % checkpoint_name) + try: + # truncate rows after target checkpoint + i = checkpoints[checkpoints[CHECKPOINT_NAME] == checkpoint_name].index[0] + checkpoints = checkpoints.loc[:i] + + # if the store is not open in read-only mode, + # write it to the store to ensure so any subsequent checkpoints are forgotten + if not self.is_readonly(): + self.write_df(checkpoints, CHECKPOINT_TABLE_NAME) + + except IndexError: + msg = "Couldn't find checkpoint '%s' in checkpoints" % (checkpoint_name,) + print(checkpoints[CHECKPOINT_NAME]) + logger.error(msg) + raise RuntimeError(msg) + + # convert pandas dataframe back to array of checkpoint dicts + checkpoints = checkpoints.to_dict(orient="records") + + # drop tables with empty names + for checkpoint in checkpoints: + for key in list(checkpoint.keys()): + if key not in NON_TABLE_COLUMNS and not checkpoint[key]: + del checkpoint[key] + + # patch _CHECKPOINTS array of dicts + self.checkpoints = checkpoints + + # patch _CHECKPOINTS dict with latest checkpoint info + self.last_checkpoint.clear() + self.last_checkpoint.update(self.checkpoints[-1]) + + logger.info( + "load_checkpoint %s timestamp %s" + % (checkpoint_name, self.last_checkpoint["timestamp"]) + ) -def load_checkpoint(checkpoint_name): - """ - Load dataframes and restore random number channel state from pipeline hdf5 file. - This restores the pipeline state that existed at the specified checkpoint in a prior simulation. 
- This allows us to resume the simulation after the specified checkpoint - - Parameters - ---------- - checkpoint_name : str - model_name of checkpoint to load (resume_after argument to open_pipeline) - """ + tables = self.checkpointed_tables() - logger.info("load_checkpoint %s" % (checkpoint_name)) - - checkpoints = read_df(CHECKPOINT_TABLE_NAME) - - if checkpoint_name == LAST_CHECKPOINT: - checkpoint_name = checkpoints[CHECKPOINT_NAME].iloc[-1] - logger.info("loading checkpoint '%s'" % checkpoint_name) - - try: - # truncate rows after target checkpoint - i = checkpoints[checkpoints[CHECKPOINT_NAME] == checkpoint_name].index[0] - checkpoints = checkpoints.loc[:i] - - # if the store is not open in read-only mode, - # write it to the store to ensure so any subsequent checkpoints are forgotten - if not is_readonly(): - write_df(checkpoints, CHECKPOINT_TABLE_NAME) - - except IndexError: - msg = "Couldn't find checkpoint '%s' in checkpoints" % (checkpoint_name,) - print(checkpoints[CHECKPOINT_NAME]) - logger.error(msg) - raise RuntimeError(msg) - - # convert pandas dataframe back to array of checkpoint dicts - checkpoints = checkpoints.to_dict(orient="records") - - # drop tables with empty names - for checkpoint in checkpoints: - for key in list(checkpoint.keys()): - if key not in NON_TABLE_COLUMNS and not checkpoint[key]: - del checkpoint[key] - - # patch _CHECKPOINTS array of dicts - _PIPELINE.checkpoints = checkpoints - - # patch _CHECKPOINTS dict with latest checkpoint info - _PIPELINE.last_checkpoint.clear() - _PIPELINE.last_checkpoint.update(_PIPELINE.checkpoints[-1]) - - logger.info( - "load_checkpoint %s timestamp %s" - % (checkpoint_name, _PIPELINE.last_checkpoint["timestamp"]) - ) - - tables = checkpointed_tables() - - loaded_tables = {} - for table_name in tables: - # read dataframe from pipeline store - df = read_df(table_name, checkpoint_name=_PIPELINE.last_checkpoint[table_name]) - logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) - # register it as an orca table - rewrap(table_name, df) - loaded_tables[table_name] = df - if table_name == "land_use" and "_original_zone_id" in df.columns: - # The presence of _original_zone_id indicates this table index was - # decoded to zero-based, so we need to disable offset - # processing for legacy skim access. 
- # TODO: this "magic" column name should be replaced with a mechanism - # to write and recover particular settings from the pipeline - # store, but we don't have that mechanism yet - config.override_setting("offset_preprocessing", True) - - # register for tracing in order that tracing.register_traceable_table wants us to register them - traceable_tables = inject.get_injectable("traceable_tables", []) - - for table_name in traceable_tables: - if table_name in loaded_tables: - tracing.register_traceable_table(table_name, loaded_tables[table_name]) - - # add tables of known rng channels - rng_channels = inject.get_injectable("rng_channels", []) - if rng_channels: - logger.debug("loading random channels %s" % rng_channels) - for table_name in rng_channels: + loaded_tables = {} + for table_name in tables: + # read dataframe from pipeline store + df = self.read_df( + table_name, checkpoint_name=self.last_checkpoint[table_name] + ) + logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) + # register it as an orca table + self.rewrap(table_name, df) + loaded_tables[table_name] = df + if table_name == "land_use" and "_original_zone_id" in df.columns: + # The presence of _original_zone_id indicates this table index was + # decoded to zero-based, so we need to disable offset + # processing for legacy skim access. + # TODO: this "magic" column name should be replaced with a mechanism + # to write and recover particular settings from the pipeline + # store, but we don't have that mechanism yet + config.override_setting("offset_preprocessing", True) + + # register for tracing in order that tracing.register_traceable_table wants us to register them + traceable_tables = inject.get_injectable("traceable_tables", []) + + for table_name in traceable_tables: if table_name in loaded_tables: - logger.debug("adding channel %s" % (table_name,)) - _PIPELINE.rng().add_channel(table_name, loaded_tables[table_name]) - - -def split_arg(s, sep, default=""): - """ - split str s in two at first sep, returning empty string as second result if no sep - """ - r = s.split(sep, 2) - r = list(map(str.strip, r)) - - arg = r[0] - - if len(r) == 1: - val = default - else: - val = r[1] - val = {"true": True, "false": False}.get(val.lower(), val) + tracing.register_traceable_table(table_name, loaded_tables[table_name]) + + # add tables of known rng channels + rng_channels = inject.get_injectable("rng_channels", []) + if rng_channels: + logger.debug("loading random channels %s" % rng_channels) + for table_name in rng_channels: + if table_name in loaded_tables: + logger.debug("adding channel %s" % (table_name,)) + self.rng().add_channel(table_name, loaded_tables[table_name]) + + def run_model(self, model_name): + """ + Run the specified model and add checkpoint for model_name + + Since we use model_name as checkpoint name, the same model may not be run more than once. + + Parameters + ---------- + model_name : str + model_name is assumed to be the name of a registered orca step + """ + + if not self.is_open: + raise RuntimeError("Pipeline not initialized! Did you call open_pipeline?") + + # can't run same model more than once + if model_name in [ + checkpoint[CHECKPOINT_NAME] for checkpoint in self.checkpoints + ]: + raise RuntimeError("Cannot run model '%s' more than once" % model_name) + + self.rng().begin_step(model_name) + + # check for args + if "." 
in model_name: + step_name, arg_string = model_name.split(".", 1) + args = dict( + (k, v) + for k, v in ( + split_arg(item, "=", default=True) for item in arg_string.split(";") + ) + ) + else: + step_name = model_name + args = {} - return arg, val + # check for no_checkpoint prefix + if step_name[0] == NO_CHECKPOINT_PREFIX: + step_name = step_name[1:] + checkpoint = False + else: + checkpoint = self.intermediate_checkpoint(model_name) + inject.set_step_args(args) -def run_model(model_name): - """ - Run the specified model and add checkpoint for model_name + mem.trace_memory_info(f"pipeline.run_model {model_name} start") - Since we use model_name as checkpoint name, the same model may not be run more than once. + t0 = print_elapsed_time() + logger.info(f"#run_model running step {step_name}") - Parameters - ---------- - model_name : str - model_name is assumed to be the name of a registered orca step - """ - - if not is_open(): - raise RuntimeError("Pipeline not initialized! Did you call open_pipeline?") + instrument = config.setting("instrument", None) + if instrument is not None: + try: + from pyinstrument import Profiler + except ImportError: + instrument = False + if isinstance(instrument, (list, set, tuple)): + if step_name not in instrument: + instrument = False + else: + instrument = True - # can't run same model more than once - if model_name in [ - checkpoint[CHECKPOINT_NAME] for checkpoint in _PIPELINE.checkpoints - ]: - raise RuntimeError("Cannot run model '%s' more than once" % model_name) + if instrument: + from pyinstrument import Profiler - _PIPELINE.rng().begin_step(model_name) + with Profiler() as profiler: + run_named_step(step_name, self.context) + out_file = config.profiling_file_path(f"{step_name}.html") + with open(out_file, "wt") as f: + f.write(profiler.output_html()) + else: + run_named_step(step_name, self.context) - # check for args - if "." 
in model_name: - step_name, arg_string = model_name.split(".", 1) - args = dict( - (k, v) - for k, v in ( - split_arg(item, "=", default=True) for item in arg_string.split(";") - ) + t0 = print_elapsed_time( + "#run_model completed step '%s'" % model_name, t0, debug=True ) - else: - step_name = model_name - args = {} - - # check for no_checkpoint prefix - if step_name[0] == NO_CHECKPOINT_PREFIX: - step_name = step_name[1:] - checkpoint = False - else: - checkpoint = intermediate_checkpoint(model_name) - - inject.set_step_args(args) - - mem.trace_memory_info(f"pipeline.run_model {model_name} start") + mem.trace_memory_info(f"pipeline.run_model {model_name} finished") - t0 = print_elapsed_time() - logger.info(f"#run_model running step {step_name}") + inject.set_step_args(None) - instrument = config.setting("instrument", None) - if instrument is not None: - try: - from pyinstrument import Profiler - except ImportError: - instrument = False - if isinstance(instrument, (list, set, tuple)): - if step_name not in instrument: - instrument = False + self.rng().end_step(model_name) + if checkpoint: + self.add_checkpoint(model_name) else: - instrument = True - - if instrument: - with Profiler() as profiler: - orca.run([step_name]) - out_file = config.profiling_file_path(f"{step_name}.html") - with open(out_file, "wt") as f: - f.write(profiler.output_html()) - else: - orca.run([step_name]) + logger.info("##### skipping %s checkpoint for %s" % (step_name, model_name)) - t0 = print_elapsed_time( - "#run_model completed step '%s'" % model_name, t0, debug=True - ) - mem.trace_memory_info(f"pipeline.run_model {model_name} finished") + def open_pipeline(self, resume_after=None, mode="a"): + """ + Start pipeline, either for a new run or, if resume_after, loading checkpoint from pipeline. - inject.set_step_args(None) + If resume_after, then we expect the pipeline hdf5 file to exist and contain + checkpoints from a previous run, including a checkpoint with name specified in resume_after - _PIPELINE.rng().end_step(model_name) - if checkpoint: - add_checkpoint(model_name) - else: - logger.info("##### skipping %s checkpoint for %s" % (step_name, model_name)) + Parameters + ---------- + resume_after : str or None + name of checkpoint to load from pipeline store + mode : {'a', 'w', 'r', 'r+'}, default 'a' + same as for typical opening of H5Store. Ignored unless resume_after + is not None. This is here to allow read-only pipeline for benchmarking. + """ + if self.is_open: + raise RuntimeError("Pipeline is already open!") -def open_pipeline(resume_after=None, mode="a"): - """ - Start pipeline, either for a new run or, if resume_after, loading checkpoint from pipeline. - - If resume_after, then we expect the pipeline hdf5 file to exist and contain - checkpoints from a previous run, including a checkpoint with name specified in resume_after - - Parameters - ---------- - resume_after : str or None - name of checkpoint to load from pipeline store - mode : {'a', 'w', 'r', 'r+'}, default 'a' - same as for typical opening of H5Store. Ignored unless resume_after - is not None. This is here to allow read-only pipeline for benchmarking. 
- """ - - if is_open(): - raise RuntimeError("Pipeline is already open!") - - _PIPELINE.init_state() - _PIPELINE.is_open = True + self.init_state() + self.is_open = True + + self.get_rn_generator().set_base_seed(inject.get_injectable("rng_base_seed", 0)) + + if resume_after: + # open existing pipeline + logger.debug("open_pipeline - open existing pipeline") + self.open_pipeline_store(overwrite=False, mode=mode) + try: + self.load_checkpoint(resume_after) + except KeyError as err: + if "checkpoints" in err.args[0]: + # no checkpoints initialized, fall back to restart + self.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME + self.add_checkpoint(INITIAL_CHECKPOINT_NAME) + else: + raise + else: + # open new, empty pipeline + logger.debug("open_pipeline - new, empty pipeline") + self.open_pipeline_store(overwrite=True) + # - not sure why I thought we needed this? + # could have exogenous tables or prng instantiation under some circumstance?? + self.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME + # empty table, in case they have turned off all checkpointing + self.add_checkpoint(INITIAL_CHECKPOINT_NAME) - get_rn_generator().set_base_seed(inject.get_injectable("rng_base_seed", 0)) + logger.debug("open_pipeline complete") - if resume_after: - # open existing pipeline - logger.debug("open_pipeline - open existing pipeline") - open_pipeline_store(overwrite=False, mode=mode) - try: - load_checkpoint(resume_after) - except KeyError as err: - if "checkpoints" in err.args[0]: - # no checkpoints initialized, fall back to restart - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME - add_checkpoint(INITIAL_CHECKPOINT_NAME) - else: - raise - else: - # open new, empty pipeline - logger.debug("open_pipeline - new, empty pipeline") - open_pipeline_store(overwrite=True) - # - not sure why I thought we needed this? - # could have exogenous tables or prng instantiation under some circumstance?? - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME - # empty table, in case they have turned off all checkpointing - add_checkpoint(INITIAL_CHECKPOINT_NAME) + def last_checkpoint(self): + """ - logger.debug("open_pipeline complete") + Returns + ------- + last_checkpoint: str + name of last checkpoint + """ + assert self.is_open, f"Pipeline is not open." -def last_checkpoint(): - """ + return self.last_checkpoint[CHECKPOINT_NAME] - Returns - ------- - last_checkpoint: str - name of last checkpoint - """ + def close_pipeline(self): + """ + Close any known open files + """ - assert is_open(), f"Pipeline is not open." + assert self.is_open, f"Pipeline is not open." - return _PIPELINE.last_checkpoint[CHECKPOINT_NAME] + self.close_open_files() + self.pipeline_store.close() -def close_pipeline(): - """ - Close any known open files - """ + self.init_state() - assert is_open(), f"Pipeline is not open." 
+ logger.debug("close_pipeline") - close_open_files() + def intermediate_checkpoint(self, checkpoint_name=None): - _PIPELINE.pipeline_store.close() + checkpoints = config.setting("checkpoints", True) - _PIPELINE.init_state() + if checkpoints is True or checkpoints is False: + return checkpoints - logger.debug("close_pipeline") + assert isinstance( + checkpoints, list + ), f"setting 'checkpoints'' should be True or False or a list" + return checkpoint_name in checkpoints -def intermediate_checkpoint(checkpoint_name=None): + def run(self, models, resume_after=None, memory_sidecar_process=None): + """ + run the specified list of models, optionally loading checkpoint and resuming after specified + checkpoint. - checkpoints = config.setting("checkpoints", True) + Since we use model_name as checkpoint name, the same model may not be run more than once. - if checkpoints is True or checkpoints is False: - return checkpoints + If resume_after checkpoint is specified and a model with that name appears in the models list, + then we only run the models after that point in the list. This allows the user always to pass + the same list of models, but specify a resume_after point if desired. - assert isinstance( - checkpoints, list - ), f"setting 'checkpoints'' should be True or False or a list" + Parameters + ---------- + models : [str] + list of model_names + resume_after : str or None + model_name of checkpoint to load checkpoint and AFTER WHICH to resume model run + memory_sidecar_process : MemorySidecar, optional + Subprocess that monitors memory usage - return checkpoint_name in checkpoints + returns: + nothing, but with pipeline open + """ + t0 = print_elapsed_time() -def run(models, resume_after=None, memory_sidecar_process=None): - """ - run the specified list of models, optionally loading checkpoint and resuming after specified - checkpoint. - - Since we use model_name as checkpoint name, the same model may not be run more than once. - - If resume_after checkpoint is specified and a model with that name appears in the models list, - then we only run the models after that point in the list. This allows the user always to pass - the same list of models, but specify a resume_after point if desired. - - Parameters - ---------- - models : [str] - list of model_names - resume_after : str or None - model_name of checkpoint to load checkpoint and AFTER WHICH to resume model run - memory_sidecar_process : MemorySidecar, optional - Subprocess that monitors memory usage - - returns: - nothing, but with pipeline open - """ + self.open_pipeline(resume_after) + t0 = print_elapsed_time("open_pipeline", t0) - t0 = print_elapsed_time() + if resume_after == LAST_CHECKPOINT: + resume_after = self.last_checkpoint[CHECKPOINT_NAME] - open_pipeline(resume_after) - t0 = print_elapsed_time("open_pipeline", t0) + if resume_after: + logger.info("resume_after %s" % resume_after) + if resume_after in models: + models = models[models.index(resume_after) + 1 :] - if resume_after == LAST_CHECKPOINT: - resume_after = _PIPELINE.last_checkpoint[CHECKPOINT_NAME] + mem.trace_memory_info("pipeline.run before preload_injectables") - if resume_after: - logger.info("resume_after %s" % resume_after) - if resume_after in models: - models = models[models.index(resume_after) + 1 :] + # preload any bulky injectables (e.g. 
skims) not in pipeline + if inject.get_injectable("preload_injectables", None): + if memory_sidecar_process: + memory_sidecar_process.set_event("preload_injectables") + t0 = print_elapsed_time("preload_injectables", t0) - mem.trace_memory_info("pipeline.run before preload_injectables") + mem.trace_memory_info("pipeline.run after preload_injectables") - # preload any bulky injectables (e.g. skims) not in pipeline - if inject.get_injectable("preload_injectables", None): - if memory_sidecar_process: - memory_sidecar_process.set_event("preload_injectables") - t0 = print_elapsed_time("preload_injectables", t0) + t0 = print_elapsed_time() + for model in models: + if memory_sidecar_process: + memory_sidecar_process.set_event(model) + t1 = print_elapsed_time() + self.run_model(model) + mem.trace_memory_info(f"pipeline.run after {model}") - mem.trace_memory_info("pipeline.run after preload_injectables") + tracing.log_runtime(model_name=model, start_time=t1) - t0 = print_elapsed_time() - for model in models: if memory_sidecar_process: - memory_sidecar_process.set_event(model) - t1 = print_elapsed_time() - run_model(model) - mem.trace_memory_info(f"pipeline.run after {model}") - - tracing.log_runtime(model_name=model, start_time=t1) - - if memory_sidecar_process: - memory_sidecar_process.set_event("finalizing") - - # add checkpoint with final tables even if not intermediate checkpointing - if not intermediate_checkpoint(): - add_checkpoint(FINAL_CHECKPOINT_NAME) - - mem.trace_memory_info("pipeline.run after run_models") + memory_sidecar_process.set_event("finalizing") - t0 = print_elapsed_time("run_model (%s models)" % len(models), t0) + # add checkpoint with final tables even if not intermediate checkpointing + if not self.intermediate_checkpoint(): + self.add_checkpoint(FINAL_CHECKPOINT_NAME) - # don't close the pipeline, as the user may want to read intermediate results from the store + mem.trace_memory_info("pipeline.run after run_models") + t0 = print_elapsed_time("run_model (%s models)" % len(models), t0) -def get_table(table_name, checkpoint_name=None): - """ - Return pandas dataframe corresponding to table_name - - if checkpoint_name is None, return the current (most recent) version of the table. - The table can be a checkpointed table or any registered orca table (e.g. function table) + # don't close the pipeline, as the user may want to read intermediate results from the store - if checkpoint_name is specified, return table as it was at that checkpoint - (the most recently checkpointed version of the table at or before checkpoint_name) + def get_table(self, table_name, checkpoint_name=None): + """ + Return pandas dataframe corresponding to table_name - Parameters - ---------- - table_name : str - checkpoint_name : str or None + if checkpoint_name is None, return the current (most recent) version of the table. + The table can be a checkpointed table or any registered orca table (e.g. function table) - Returns - ------- - df : pandas.DataFrame - """ + if checkpoint_name is specified, return table as it was at that checkpoint + (the most recently checkpointed version of the table at or before checkpoint_name) - assert is_open(), f"Pipeline is not open." + Parameters + ---------- + table_name : str + checkpoint_name : str or None - # orca table not in checkpoints (e.g. 
a merged table) - if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name): - if checkpoint_name is not None: - raise RuntimeError( - "get_table: checkpoint_name ('%s') not supported" - "for non-checkpointed table '%s'" % (checkpoint_name, table_name) - ) + Returns + ------- + df : pandas.DataFrame + """ - return orca.get_table(table_name).to_frame() + assert self.is_open, f"Pipeline is not open." - # if they want current version of table, no need to read from pipeline store - if checkpoint_name is None: + # orca table not in checkpoints (e.g. a merged table) + if table_name not in self.last_checkpoint and self.is_table(table_name): + if checkpoint_name is not None: + raise RuntimeError( + "get_table: checkpoint_name ('%s') not supported" + "for non-checkpointed table '%s'" % (checkpoint_name, table_name) + ) - if table_name not in _PIPELINE.last_checkpoint: - raise RuntimeError("table '%s' never checkpointed." % table_name) + return self.context.get(table_name) - if not _PIPELINE.last_checkpoint[table_name]: - raise RuntimeError("table '%s' was dropped." % table_name) + # if they want current version of table, no need to read from pipeline store + if checkpoint_name is None: - # return orca.get_table(table_name).local - return orca.get_table(table_name).to_frame() + if table_name not in self.last_checkpoint: + raise RuntimeError("table '%s' never checkpointed." % table_name) - # find the requested checkpoint - checkpoint = next( - (x for x in _PIPELINE.checkpoints if x["checkpoint_name"] == checkpoint_name), - None, - ) - if checkpoint is None: - raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) + if not self.last_checkpoint[table_name]: + raise RuntimeError("table '%s' was dropped." % table_name) - # find the checkpoint that table was written to store - last_checkpoint_name = checkpoint.get(table_name, None) + # return orca.get_table(table_name).local + return self.context.get(table_name) - if not last_checkpoint_name: - raise RuntimeError( - "table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name) + # find the requested checkpoint + checkpoint = next( + (x for x in self.checkpoints if x["checkpoint_name"] == checkpoint_name), + None, ) + if checkpoint is None: + raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) - # if this version of table is same as current - if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name: - return orca.get_table(table_name).to_frame() - - return read_df(table_name, last_checkpoint_name) - - -def get_checkpoints(): - """ - Get pandas dataframe of info about all checkpoints stored in pipeline - - pipeline doesn't have to be open - - Returns - ------- - checkpoints_df : pandas.DataFrame + # find the checkpoint that table was written to store + last_checkpoint_name = checkpoint.get(table_name, None) - """ - - store = get_pipeline_store() - - if store is not None: - df = store[CHECKPOINT_TABLE_NAME] - else: - pipeline_file_path = config.pipeline_file_path( - orca.get_injectable("pipeline_file_name") - ) - df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME) - - # non-table columns first (column order in df is random because created from a dict) - table_names = [name for name in df.columns.values if name not in NON_TABLE_COLUMNS] + if not last_checkpoint_name: + raise RuntimeError( + "table '%s' not in checkpoint '%s'." 
% (table_name, checkpoint_name) + ) - df = df[NON_TABLE_COLUMNS + table_names] + # if this version of table is same as current + if self.last_checkpoint.get(table_name, None) == last_checkpoint_name: + return self.context.get(table_name) - return df + return self.read_df(table_name, last_checkpoint_name) + def get_checkpoints(self): + """ + Get pandas dataframe of info about all checkpoints stored in pipeline -def replace_table(table_name, df): - """ - Add or replace a orca table, removing any existing added orca columns + pipeline doesn't have to be open - The use case for this function is a method that calls to_frame on an orca table, modifies - it and then saves the modified. + Returns + ------- + checkpoints_df : pandas.DataFrame - orca.to_frame returns a copy, so no changes are saved, and adding multiple column with - add_column adds them in an indeterminate order. + """ - Simply replacing an existing the table "behind the pipeline's back" by calling orca.add_table - risks pipeline to failing to detect that it has changed, and thus not checkpoint the changes. + store = self.get_pipeline_store() - Parameters - ---------- - table_name : str - orca/pipeline table name - df : pandas DataFrame - """ + if store is not None: + df = store[CHECKPOINT_TABLE_NAME] + else: + pipeline_file_path = config.pipeline_file_path( + self.context.get_formatted("pipeline_file_name") + ) + df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME) - assert is_open(), f"Pipeline is not open." + # non-table columns first (column order in df is random because created from a dict) + table_names = [ + name for name in df.columns.values if name not in NON_TABLE_COLUMNS + ] - if df.columns.duplicated().any(): - logger.error( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) + df = df[NON_TABLE_COLUMNS + table_names] - raise RuntimeError( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) + return df - rewrap(table_name, df) + def replace_table(self, table_name, df): + """ + Add or replace a orca table, removing any existing added orca columns - _PIPELINE.replaced_tables[table_name] = True + The use case for this function is a method that calls to_frame on an orca table, modifies + it and then saves the modified. + orca.to_frame returns a copy, so no changes are saved, and adding multiple column with + add_column adds them in an indeterminate order. -def extend_table(table_name, df, axis=0): - """ - add new table or extend (add rows) to an existing table + Simply replacing an existing the table "behind the pipeline's back" by calling orca.add_table + risks pipeline to failing to detect that it has changed, and thus not checkpoint the changes. - Parameters - ---------- - table_name : str - orca/inject table name - df : pandas DataFrame - """ + Parameters + ---------- + table_name : str + orca/pipeline table name + df : pandas DataFrame + """ - assert is_open(), f"Pipeline is not open." + assert self.is_open, f"Pipeline is not open." 
- assert axis in [0, 1] + if df.columns.duplicated().any(): + logger.error( + "replace_table: dataframe '%s' has duplicate columns: %s" + % (table_name, df.columns[df.columns.duplicated()]) + ) - if orca.is_table(table_name): + raise RuntimeError( + "replace_table: dataframe '%s' has duplicate columns: %s" + % (table_name, df.columns[df.columns.duplicated()]) + ) - table_df = orca.get_table(table_name).to_frame() + self.rewrap(table_name, df) - if axis == 0: - # don't expect indexes to overlap - assert len(table_df.index.intersection(df.index)) == 0 - missing_df_str_columns = [ - c - for c in table_df.columns - if c not in df.columns and table_df[c].dtype == "O" - ] - else: - # expect indexes be same - assert table_df.index.equals(df.index) - new_df_columns = [c for c in df.columns if c not in table_df.columns] - df = df[new_df_columns] + self.replaced_tables[table_name] = True - # preserve existing column order - df = pd.concat([table_df, df], sort=False, axis=axis) + def extend_table(self, table_name, df, axis=0): + """ + add new table or extend (add rows) to an existing table - # backfill missing df columns that were str (object) type in table_df - if axis == 0: - for c in missing_df_str_columns: - df[c] = df[c].fillna("") + Parameters + ---------- + table_name : str + orca/inject table name + df : pandas DataFrame + """ - replace_table(table_name, df) + assert self.is_open, f"Pipeline is not open." - return df + assert axis in [0, 1] + if self.is_table(table_name): -def drop_table(table_name): + table_df = self.get_table(table_name) - assert is_open(), f"Pipeline is not open." + if axis == 0: + # don't expect indexes to overlap + assert len(table_df.index.intersection(df.index)) == 0 + missing_df_str_columns = [ + c + for c in table_df.columns + if c not in df.columns and table_df[c].dtype == "O" + ] + else: + # expect indexes be same + assert table_df.index.equals(df.index) + new_df_columns = [c for c in df.columns if c not in table_df.columns] + df = df[new_df_columns] + missing_df_str_columns = [] - if orca.is_table(table_name): + # preserve existing column order + df = pd.concat([table_df, df], sort=False, axis=axis) - logger.debug("drop_table dropping orca table '%s'" % table_name) + # backfill missing df columns that were str (object) type in table_df + if axis == 0: + for c in missing_df_str_columns: + df[c] = df[c].fillna("") - # don't trigger function call of TableFuncWrapper - t = orca.get_raw_table(table_name) - t.clear_cached() + self.replace_table(table_name, df) - for column_name in orca.list_columns_for_table(table_name): - # logger.debug("pop %s.%s: %s" % (table_name, column_name, t.column_type(column_name))) - orca._COLUMNS.pop((table_name, column_name), None) + return df - # remove from orca's table list - orca._TABLES.pop(table_name, None) + def drop_table(self, table_name): - if table_name in _PIPELINE.replaced_tables: + assert self.is_open, f"Pipeline is not open." 
- logger.debug("drop_table forgetting replaced_tables '%s'" % table_name) - del _PIPELINE.replaced_tables[table_name] + if self.is_table(table_name): - if table_name in _PIPELINE.last_checkpoint: + logger.debug("drop_table dropping orca table '%s'" % table_name) + self.context.pop(table_name, None) + self._TABLES.pop(table_name, None) - logger.debug("drop_table removing table %s from last_checkpoint" % table_name) + if table_name in self.replaced_tables: - _PIPELINE.last_checkpoint[table_name] = "" + logger.debug("drop_table forgetting replaced_tables '%s'" % table_name) + del self.replaced_tables[table_name] + if table_name in self.last_checkpoint: -def is_table(table_name): - return orca.is_table(table_name) + logger.debug( + "drop_table removing table %s from last_checkpoint" % table_name + ) + self.last_checkpoint[table_name] = "" -def cleanup_pipeline(): - """ - Cleanup pipeline after successful run + def cleanup_pipeline(self): + """ + Cleanup pipeline after successful run - Open main pipeline if not already open (will be closed if multiprocess) - Create a single-checkpoint pipeline file with latest version of all checkpointed tables, - Delete main pipeline and any subprocess pipelines + Open main pipeline if not already open (will be closed if multiprocess) + Create a single-checkpoint pipeline file with latest version of all checkpointed tables, + Delete main pipeline and any subprocess pipelines - Called if cleanup_pipeline_after_run setting is True + Called if cleanup_pipeline_after_run setting is True - Returns - ------- - nothing, but with changed state: pipeline file that was open on call is closed and deleted + Returns + ------- + nothing, but with changed state: pipeline file that was open on call is closed and deleted - """ - # we don't expect to be called unless cleanup_pipeline_after_run setting is True - assert config.setting("cleanup_pipeline_after_run", False) + """ + # we don't expect to be called unless cleanup_pipeline_after_run setting is True + assert config.setting("cleanup_pipeline_after_run", False) - if not is_open(): - open_pipeline("_") + if not self.is_open: + self.open_pipeline("_") - assert is_open(), f"Pipeline is not open." + assert self.is_open, f"Pipeline is not open." 
- FINAL_PIPELINE_FILE_NAME = ( - f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" - ) - FINAL_CHECKPOINT_NAME = "final" + FINAL_PIPELINE_FILE_NAME = ( + f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" + ) + FINAL_CHECKPOINT_NAME = "final" - final_pipeline_file_path = config.build_output_file_path(FINAL_PIPELINE_FILE_NAME) + final_pipeline_file_path = config.build_output_file_path( + FINAL_PIPELINE_FILE_NAME + ) - # keep only the last row of checkpoints and patch the last checkpoint name - checkpoints_df = get_checkpoints().tail(1).copy() - checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME + # keep only the last row of checkpoints and patch the last checkpoint name + checkpoints_df = self.get_checkpoints().tail(1).copy() + checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME - with pd.HDFStore(final_pipeline_file_path, mode="w") as final_pipeline_store: + with pd.HDFStore(final_pipeline_file_path, mode="w") as final_pipeline_store: - for table_name in checkpointed_tables(): - # patch last checkpoint name for all tables - checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME + for table_name in self.checkpointed_tables(): + # patch last checkpoint name for all tables + checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME - table_df = get_table(table_name) - logger.debug( - f"cleanup_pipeline - adding table {table_name} {table_df.shape}" - ) + table_df = self.get_table(table_name) + logger.debug( + f"cleanup_pipeline - adding table {table_name} {table_df.shape}" + ) - final_pipeline_store[table_name] = table_df + final_pipeline_store[table_name] = table_df - final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df + final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df - close_pipeline() + self.close_pipeline() - logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") - tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) + logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") + tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index 44707c0ae..0d9958d6e 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -16,6 +16,7 @@ from activitysim.core import inject +from ..core.workflow import workflow_step from . 
import config # Configurations @@ -243,14 +244,17 @@ def print_summary(label, df, describe=False, value_counts=False): logger.info("%s summary:\n%s" % (label, df.describe())) -def initialize_traceable_tables(): +@workflow_step +def initialize_traceable_tables(traceable_table_ids=None): - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) + if traceable_table_ids is None: + traceable_table_ids = {} if len(traceable_table_ids) > 0: logger.debug( f"initialize_traceable_tables resetting table_ids for {list(traceable_table_ids.keys())}" ) - inject.add_injectable("traceable_table_ids", {}) + # ORCA# inject.add_injectable("traceable_table_ids", {}) + return {"traceable_table_ids": {}} def register_traceable_table(table_name, df): diff --git a/activitysim/core/workflow/__init__.py b/activitysim/core/workflow/__init__.py new file mode 100644 index 000000000..fc247a097 --- /dev/null +++ b/activitysim/core/workflow/__init__.py @@ -0,0 +1,228 @@ +import importlib +import importlib.machinery +import importlib.util +import logging +from inspect import getfullargspec +from typing import Mapping + +from pypyr.context import Context +from pypyr.errors import KeyNotInContextError + +_STEP_LIBRARY = {} + + +def get_formatted_or_raw(self: Context, key: str): + try: + return self.get_formatted(key) + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +def get_formatted_or_default(self: Context, key: str, default): + try: + return self.get_formatted(key) + except (KeyNotInContextError, KeyError): + return default + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +def error_logging(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as err: + logging.error(f"===== ERROR IN {func.__name__} =====") + logging.exception(f"{err}") + logging.error(f"===== / =====") + raise + + return wrapper + + +def _new_module(mod_name): + spec = importlib.machinery.ModuleSpec(mod_name, None) + return importlib.util.module_from_spec(spec) + + +def _create_module(mod_name, content): + mod = _new_module(mod_name) + for k, v in content.items(): + setattr(mod, k, v) + return mod + + +def _create_step(step_name, step_func): + _create_module(f"{__package__}.{step_name}", {"run_step": step_func}) + _STEP_LIBRARY[step_name] = step_func + + +def run_named_step(name, context): + context.update(_STEP_LIBRARY[name](context)) + return context + + +class workflow_step: + """ + Decorator for functions that update a context variable. + + The decorator will generate a `run_step` function in the same module, + wrapped with additional arguments and appropriately annotated for use + with the pypyr workflow model. The original function also remains + available to import and use without changes. + + When called as a step inside a pypyr workflow, the following context + variables are potentially accessed: + + report : xmle.Reporter + The active report into which new figures or tables are added. + caption : str + A caption for the item being processed. This is used both in + writing out the output (if any) in the report and for logging + step progression during a run. + caption_type : str + The caption type (typically, 'fig' for figures or 'tab' + for tables). + progress_tag : str + Use this instead of `caption` to log step progression during a run. 
+ + If the function returns values that should update the context, that + can be done in one of three ways: + + - Set `updates_context` to True and return a `dict`, and use that + dict to update the context directly. + - Return a single object, and set `returns_names` to a string + giving the name that object should take in the context. + - Return a sequence of objects, and set `returns_names` to a + matching sequence of names that those objects should take + in the context. + + Otherwise, the return value is appended to the report. To declare that + there is no return value and no reporting should be done, you must + explicitly annotate the function with a return value of `-> None`. + + Important: there can be only one `workstep` in + each module. If you need more than one, make another separate module. + + Parameters + ---------- + wrapped_func : Callable + returns_names : str or tuple[str], optional + updates_context : bool, default False + + Returns + ------- + wrapped_func : Callable + The original wrapped function + + """ + + def __new__(cls, wrapped_func=None, *, step_name=None): + """ + Initialize a work step wrapper. + + Parameters + ---------- + wrapped_func : Callable + The function being decorated. + """ + if isinstance(wrapped_func, str): + # the step_name is provided instead of the wrapped func + step_name = wrapped_func + wrapped_func = None + if step_name is None and wrapped_func is not None: + step_name = wrapped_func.__name__ + self = super().__new__(cls) + self._step_name = step_name + if wrapped_func is not None: + return self(wrapped_func) + else: + return self + + def __call__(self, wrapped_func): + """ + Initialize a workflow_step wrapper. + + Parameters + ---------- + wrapped_func : Callable + The function being decorated. It should return a dictionary + of context updates. 
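[Editor's note] A hedged usage sketch of the decorator as implemented in this patch (later patches in this series reorganize the workflow module, so names may move): required arguments are looked up in the pypyr context by name, defaults are filled from the context when present, and the returned mapping updates the context. The step name and keys below are hypothetical.

    from pypyr.context import Context
    from activitysim.core.workflow import _STEP_LIBRARY, workflow_step

    @workflow_step
    def label_households(households_sample_size, prefix="hh"):
        # the required arg is pulled from the context; the returned mapping is
        # merged back into the context when the step runs inside a workflow
        return {"sample_label": f"{prefix}-{households_sample_size}"}

    # the decorator hands the original function back unchanged
    assert label_households(100) == {"sample_label": "hh-100"}

    # invoking the registered step directly, as pypyr would, round-trips the context
    context = Context({"households_sample_size": 100})
    _STEP_LIBRARY["label_households"](context)
    assert context["sample_label"] == "hh-100"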
+ """ + ( + _args, + _varargs, + _varkw, + _defaults, + _kwonlyargs, + _kwonlydefaults, + _annotations, + ) = getfullargspec(wrapped_func) + + def run_step(context: Context = None) -> None: + caption = get_formatted_or_default(context, "caption", None) + progress_tag = get_formatted_or_default(context, "progress_tag", caption) + # if progress_tag is not None: + # reset_progress_step(description=progress_tag) + + return_type = _annotations.get("return", "") + + caption_type = get_formatted_or_default(context, "caption_type", "fig") + caption_maker = get_formatted_or_default(context, caption_type, None) + # parse and run function itself + if _defaults is None: + ndefault = 0 + _required_args = _args + else: + ndefault = len(_defaults) + _required_args = _args[:-ndefault] + args = [] + for arg in _required_args: + context.assert_key_has_value(key=arg, caller=wrapped_func.__module__) + try: + args.append(get_formatted_or_raw(context, arg)) + except Exception as err: + raise ValueError(f"extracting {arg} from context") from err + if ndefault: + for arg, default in zip(_args[-ndefault:], _defaults): + args.append(get_formatted_or_default(context, arg, default)) + kwargs = {} + for karg in _kwonlyargs: + if karg in _kwonlydefaults: + kwargs[karg] = get_formatted_or_default( + context, karg, _kwonlydefaults[karg] + ) + else: + context.assert_key_has_value( + key=karg, caller=wrapped_func.__module__ + ) + try: + kwargs[karg] = get_formatted_or_raw(context, karg) + except Exception as err: + raise ValueError(f"extracting {karg} from context") from err + if _varkw: + kwargs.update(context) + for arg in _required_args: + if arg in kwargs: + kwargs.pop(arg) + outcome = error_logging(wrapped_func)(*args, **kwargs) + if not isinstance(outcome, Mapping): + raise ValueError( + f"{wrapped_func.__name__} is marked as updates_context, " + f"it should return a mapping" + ) + context.update(outcome) + + # module = importlib.import_module(wrapped_func.__module__) + # if hasattr(module, "run_step"): + # raise ValueError( + # f"{wrapped_func.__module__}.run_step exists, there can be only one per module" + # ) + # setattr(module, "run_step", run_step) + _create_step(self._step_name, run_step) + + return wrapped_func From f7ffa264d31c16baac530a0c3fc5fc46440937d4 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Thu, 9 Feb 2023 16:17:00 -0600 Subject: [PATCH 005/419] whales --- .pre-commit-config.yaml | 8 +- AAA-BreakingChanges.md | 3 + activitysim/abm/misc.py | 63 ++- activitysim/abm/models/accessibility.py | 18 +- activitysim/abm/models/initialize.py | 110 ++-- activitysim/abm/models/summarize.py | 6 +- activitysim/abm/tables/accessibility.py | 25 +- .../abm/tables/disaggregate_accessibility.py | 37 +- activitysim/abm/tables/households.py | 53 +- activitysim/abm/tables/landuse.py | 23 +- activitysim/abm/tables/persons.py | 96 +++- activitysim/abm/tables/shadow_pricing.py | 60 +- activitysim/abm/tables/size_terms.py | 12 +- activitysim/abm/tables/skims.py | 19 +- activitysim/abm/tables/time_windows.py | 34 +- activitysim/abm/test/conftest.py | 2 +- .../abm/test/test_misc/test_summarize.py | 2 +- activitysim/benchmarking/componentwise.py | 30 +- activitysim/cli/run.py | 240 +++++--- activitysim/core/assign.py | 20 +- activitysim/core/chunk.py | 101 +++- activitysim/core/config.py | 37 +- activitysim/core/configuration/__init__.py | 1 + activitysim/core/configuration/filesystem.py | 535 ++++++++++++++++++ activitysim/core/configuration/top.py | 117 +++- activitysim/core/exceptions.py | 31 + 
activitysim/core/expressions.py | 32 +- activitysim/core/inject.py | 10 +- activitysim/core/input.py | 85 ++- activitysim/core/los.py | 67 +-- activitysim/core/mem.py | 18 +- activitysim/core/pipeline.py | 448 ++++++++++----- activitysim/core/test/configs/logging.yaml | 3 +- activitysim/core/tracing.py | 133 ++--- activitysim/core/workflow/__init__.py | 229 +------- activitysim/core/workflow/injectable.py | 19 + activitysim/core/workflow/steps.py | 337 +++++++++++ activitysim/core/workflow/tableset.py | 90 +++ activitysim/core/workflow/util.py | 46 ++ .../prototype_mtc/configs/logging.yaml | 2 +- 40 files changed, 2259 insertions(+), 943 deletions(-) create mode 100644 AAA-BreakingChanges.md create mode 100644 activitysim/core/configuration/filesystem.py create mode 100644 activitysim/core/exceptions.py create mode 100644 activitysim/core/workflow/injectable.py create mode 100644 activitysim/core/workflow/steps.py create mode 100644 activitysim/core/workflow/tableset.py create mode 100644 activitysim/core/workflow/util.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3a59ed152..4e21e7f74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: hooks: - id: black -- repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 +#- repo: https://github.com/PyCQA/flake8 +# rev: 5.0.4 +# hooks: +# - id: flake8 diff --git a/AAA-BreakingChanges.md b/AAA-BreakingChanges.md new file mode 100644 index 000000000..feae1ae50 --- /dev/null +++ b/AAA-BreakingChanges.md @@ -0,0 +1,3 @@ + + +- The 'run_list' key in settings.yaml is no longer supported. diff --git a/activitysim/abm/misc.py b/activitysim/abm/misc.py index 528c8db93..6f665de5e 100644 --- a/activitysim/abm/misc.py +++ b/activitysim/abm/misc.py @@ -4,7 +4,8 @@ import pandas as pd -from activitysim.core import config, inject +from ..core.pipeline import Whale +from ..core.workflow import workflow_cached_object # FIXME # warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning) @@ -13,23 +14,27 @@ logger = logging.getLogger(__name__) -@inject.injectable(cache=True) -def households_sample_size(settings, override_hh_ids): +@workflow_cached_object +def households_sample_size(whale: Whale, override_hh_ids): if override_hh_ids is None: - return settings.get("households_sample_size", 0) + return whale.settings, households_sample_size else: return 0 if override_hh_ids is None else len(override_hh_ids) -@inject.injectable(cache=True) -def override_hh_ids(settings): +@workflow_cached_object +def override_hh_ids(whale: Whale): - hh_ids_filename = settings.get("hh_ids", None) + hh_ids_filename = whale.settings.hh_ids if hh_ids_filename is None: return None - file_path = config.data_file_path(hh_ids_filename, mandatory=False) + file_path = whale.filesystem.get_data_file_path(hh_ids_filename, mandatory=False) + if not file_path: + file_path = whale.filesystem.get_config_file_path( + hh_ids_filename, mandatory=False + ) if not file_path: logger.error( "hh_ids file name '%s' specified in settings not found" % hh_ids_filename @@ -56,24 +61,24 @@ def override_hh_ids(settings): return household_ids -@inject.injectable(cache=True) -def trace_hh_id(settings): - - id = settings.get("trace_hh_id", None) - - if id and not isinstance(id, int): - logger.warning( - "setting trace_hh_id is wrong type, should be an int, but was %s" % type(id) - ) - id = None - - return id +# @workflow_object +# def trace_hh_id(whale: Whale): +# +# id = whale.settings.trace_hh_id +# +# if id and not 
isinstance(id, int): +# logger.warning( +# "setting trace_hh_id is wrong type, should be an int, but was %s" % type(id) +# ) +# id = None +# +# return id -@inject.injectable(cache=True) -def trace_od(settings): +@workflow_cached_object +def trace_od(whale: Whale): - od = settings.get("trace_od", None) + od = whale.settings.trace_od if od and not ( isinstance(od, list) and len(od) == 2 and all(isinstance(x, int) for x in od) @@ -84,13 +89,13 @@ def trace_od(settings): return od -@inject.injectable(cache=True) -def chunk_size(settings): - _chunk_size = int(settings.get("chunk_size", 0) or 0) +@workflow_cached_object +def chunk_size(whale: Whale): + _chunk_size = int(whale.settings.chunk_size or 0) return _chunk_size -@inject.injectable(cache=True) -def check_for_variability(settings): - return bool(settings.get("check_for_variability", False)) +@workflow_cached_object +def check_for_variability(whale: Whale): + return bool(whale.settings.check_for_variability) diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index edd928e5c..34a1e38a4 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -5,8 +5,9 @@ import numpy as np import pandas as pd -from activitysim.core import assign, chunk, config, inject, los, mem, pipeline, tracing -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from ...core import assign, chunk, config, los, pipeline, tracing +from ...core.pipeline import Whale +from ...core.workflow import workflow_step logger = logging.getLogger(__name__) @@ -112,8 +113,15 @@ def compute_accessibilities_for_zones( return accessibility_df -@inject.step() -def compute_accessibility(land_use, accessibility, network_los, chunk_size, trace_od): +@workflow_step +def compute_accessibility( + whale: Whale, + land_use: pd.DataFrame, + accessibility: pd.DataFrame, + network_los, + chunk_size, + trace_od, +): """ Compute accessibility for each zone in land use file using expressions from accessibility_spec @@ -177,4 +185,4 @@ def compute_accessibility(land_use, accessibility, network_los, chunk_size, trac logger.info(f"{trace_label} computed accessibilities {accessibility_df.shape}") # - write table to pipeline - pipeline.replace_table("accessibility", accessibility_df) + whale.add_table("accessibility", accessibility_df) diff --git a/activitysim/abm/models/initialize.py b/activitysim/abm/models/initialize.py index fd9f47bb8..2c8c10655 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -6,13 +6,11 @@ import pandas as pd -from activitysim.abm.tables import shadow_pricing, disaggregate_accessibility -from activitysim.core import chunk, config, expressions, inject, mem, pipeline, tracing -from activitysim.core.steps.output import ( - track_skim_usage, - write_data_dictionary, - write_tables, -) +from ...core import chunk, config, expressions, inject, mem, pipeline, tracing +from ...core.pipeline import Whale +from ...core.steps.output import track_skim_usage, write_data_dictionary, write_tables +from ...core.workflow import workflow_step +from ..tables import disaggregate_accessibility, shadow_pricing # We are using the naming conventions in the mtc_asim.h5 example # file for our default list. 
This provides backwards compatibility @@ -31,11 +29,24 @@ logger = logging.getLogger(__name__) -def annotate_tables(model_settings, trace_label): +def annotate_tables(whale, model_settings, trace_label, chunk_sizer): + """ + + Parameters + ---------- + whale : Whale + model_settings : + trace_label : str + chunk_sizer : ChunkSizer + + Returns + ------- + + """ trace_label = tracing.extend_trace_label(trace_label, "annotate_tables") - chunk.log_rss(trace_label) + chunk_sizer.log_rss(trace_label) annotate_tables = model_settings.get("annotate_tables", []) @@ -51,18 +62,16 @@ def annotate_tables(model_settings, trace_label): t0 = tracing.print_elapsed_time() for table_info in annotate_tables: - tablename = table_info["tablename"] - chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}") + chunk_sizer.log_rss(f"{trace_label}.pre-get_table.{tablename}") - df = inject.get_table(tablename).to_frame() - chunk.log_df(trace_label, tablename, df) + df = whale.get_dataframe(tablename) + chunk_sizer.log_df(trace_label, tablename, df) # - rename columns column_map = table_info.get("column_map", None) if column_map: - warnings.warn( f"Setting 'column_map' has been changed to 'rename_columns'. " f"Support for 'column_map' in annotate_tables will be removed in future versions.", @@ -79,61 +88,71 @@ def annotate_tables(model_settings, trace_label): f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}" ) expressions.assign_columns( - df=df, model_settings=annotate, trace_label=trace_label + whale, df=df, model_settings=annotate, trace_label=trace_label ) - chunk.log_df(trace_label, tablename, df) + chunk_sizer.log_df(trace_label, tablename, df) # - write table to pipeline - pipeline.replace_table(tablename, df) + whale.add_table(tablename, df) del df - chunk.log_df(trace_label, tablename, None) + chunk_sizer.log_df(trace_label, tablename, None) -@inject.step() -def initialize_landuse(): +@workflow_step +def initialize_landuse(whale): + """ + Initialize the land use table. - trace_label = "initialize_landuse" + Parameters + ---------- + whale : Whale - with chunk.chunk_log(trace_label, base=True): + Returns + ------- + ? 
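[Editor's note] For readers following the annotate_tables refactor above, a sketch of the settings structure it consumes, expressed as the dict read_settings_file would return. Only keys visible in the code are shown; the spec name is hypothetical, and this is not part of the patch.

    model_settings = {
        "annotate_tables": [
            {"tablename": "land_use", "annotate": {"SPEC": "annotate_landuse"}},
        ],
    }
    # inside a step, with `whale` and a `chunk_sizer` in scope:
    # annotate_tables(whale, model_settings, "initialize_landuse", chunk_sizer)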
+ """ + trace_label = "initialize_landuse" + settings_filename = "initialize_landuse.yaml" - model_settings = config.read_model_settings( - "initialize_landuse.yaml", mandatory=True + with chunk.chunk_log( + trace_label, base=True, settings=whale.settings + ) as chunk_sizer: + model_settings = whale.filesystem.read_settings_file( + settings_filename, mandatory=True ) - annotate_tables(model_settings, trace_label) + annotate_tables(whale, model_settings, trace_label, chunk_sizer) # instantiate accessibility (must be checkpointed to be be used to slice accessibility) - accessibility = pipeline.get_table("accessibility") - chunk.log_df(trace_label, "accessibility", accessibility) + accessibility = whale.get_dataframe("accessibility") + chunk_sizer.log_df(trace_label, "accessibility", accessibility) -@inject.step() -def initialize_households(): - +@workflow_step +def initialize_households(whale): trace_label = "initialize_households" - with chunk.chunk_log(trace_label, base=True): - - chunk.log_rss(f"{trace_label}.inside-yield") + with whale.chunk_log(trace_label, base=True) as chunk_sizer: + chunk_sizer.log_rss(f"{trace_label}.inside-yield") - households = inject.get_table("households").to_frame() + households = whale.get_dataframe("households") assert not households._is_view - chunk.log_df(trace_label, "households", households) + chunk_sizer.log_df(trace_label, "households", households) del households - chunk.log_df(trace_label, "households", None) + chunk_sizer.log_df(trace_label, "households", None) - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") assert not persons._is_view - chunk.log_df(trace_label, "persons", persons) + chunk_sizer.log_df(trace_label, "persons", persons) del persons - chunk.log_df(trace_label, "persons", None) + chunk_sizer.log_df(trace_label, "persons", None) - model_settings = config.read_model_settings( + model_settings = whale.filesystem.read_settings_file( "initialize_households.yaml", mandatory=True ) - annotate_tables(model_settings, trace_label) + annotate_tables(whale, model_settings, trace_label, chunk_sizer) # - initialize shadow_pricing size tables after annotating household and person tables # since these are scaled to model size, they have to be created while single-process @@ -141,12 +160,12 @@ def initialize_households(): add_size_tables = model_settings.get("add_size_tables", True) if add_size_tables: # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning) - suffixes = inject.get_injectable("disaggregate_suffixes") - shadow_pricing.add_size_tables(suffixes) + suffixes = disaggregate_accessibility.disaggregate_suffixes(whale) + shadow_pricing.add_size_tables(whale, suffixes) # - preload person_windows - person_windows = inject.get_table("person_windows").to_frame() - chunk.log_df(trace_label, "person_windows", person_windows) + person_windows = whale.get_dataframe("person_windows") + chunk_sizer.log_df(trace_label, "person_windows", person_windows) @inject.injectable(cache=True) @@ -176,7 +195,6 @@ def preload_injectables(): # FIXME undocumented feature if config.setting("write_raw_tables"): - # write raw input tables as csv (before annotation) csv_dir = config.output_file_path("raw_tables") if not os.path.exists(csv_dir): diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index 66b5cf958..d4479cfb4 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -9,11 +9,13 @@ from 
activitysim.abm.models.trip_matrices import annotate_trips from activitysim.core import config, expressions, inject, pipeline +from ...core.los import Network_LOS + logger = logging.getLogger(__name__) def wrap_skims( - network_los: pipeline.Pipeline, trips_merged: pd.DataFrame + network_los: Network_LOS, trips_merged: pd.DataFrame ) -> dict[str, object]: """ Retrieve skim wrappers for merged trips. @@ -200,7 +202,7 @@ def manual_breaks( @inject.step() def summarize( - network_los: pipeline.Pipeline, + network_los: Network_LOS, persons: pd.DataFrame, persons_merged: pd.DataFrame, households: pd.DataFrame, diff --git a/activitysim/abm/tables/accessibility.py b/activitysim/abm/tables/accessibility.py index 6869da736..a45cbac8c 100644 --- a/activitysim/abm/tables/accessibility.py +++ b/activitysim/abm/tables/accessibility.py @@ -4,14 +4,16 @@ import pandas as pd -from activitysim.core import inject from activitysim.core.input import read_input_table +from ...core.pipeline import Whale +from ...core.workflow import workflow_table + logger = logging.getLogger(__name__) -@inject.table() -def accessibility(land_use): +@workflow_table +def accessibility(whale: Whale): """ If 'accessibility' is in input_tables list, then read it in, otherwise create skeleton table with same index as landuse. @@ -23,7 +25,8 @@ def accessibility(land_use): otherwise it will simply be replaced when accessibility model is run """ - accessibility_df = read_input_table("accessibility", required=False) + land_use = whale.get_dataframe("land_use") + accessibility_df = read_input_table(whale, "accessibility", required=False) if accessibility_df is None: accessibility_df = pd.DataFrame(index=land_use.index) @@ -33,24 +36,22 @@ def accessibility(land_use): else: try: assert accessibility_df.sort_index().index.equals( - land_use.to_frame().sort_index().index + land_use.sort_index().index ), f"loaded accessibility table index does not match index of land_use table" except AssertionError: - land_use_index = land_use.to_frame().index - if f"_original_{land_use_index.name}" in land_use.to_frame(): - land_use_zone_ids = land_use.to_frame()[ - f"_original_{land_use_index.name}" - ] + land_use_index = land_use.index + if f"_original_{land_use_index.name}" in land_use: + land_use_zone_ids = land_use[f"_original_{land_use_index.name}"] remapper = dict(zip(land_use_zone_ids, land_use_zone_ids.index)) accessibility_df.index = accessibility_df.index.map(remapper.get) assert accessibility_df.sort_index().index.equals( - land_use.to_frame().sort_index().index + land_use.sort_index().index ), f"loaded accessibility table index does not match index of land_use table" else: raise logger.info("loaded land_use %s" % (accessibility_df.shape,)) # replace table function with dataframe - inject.add_table("accessibility", accessibility_df) + whale.add_table("accessibility", accessibility_df) return accessibility_df diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index 4c4eb9ad4..63a2e8685 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -6,9 +6,11 @@ import numpy as np import pandas as pd import pandas.api.types as ptypes - from sklearn.naive_bayes import CategoricalNB -from activitysim.core import inject, config, pipeline, util, input + +from activitysim.core import config, inject, input, pipeline, util + +from ...core.workflow import workflow_cached_object, workflow_step, workflow_table logger = 
logging.getLogger(__name__) @@ -82,14 +84,14 @@ def nearest_node(oz, zones_df): return matched_df.loc[_idx] -@inject.injectable() -def disaggregate_suffixes(): +@workflow_cached_object +def disaggregate_suffixes(whale): return {"SUFFIX": None, "ROOTS": []} -@inject.table() -def maz_centroids(): - df = input.read_input_table("maz_centroids") +@workflow_table +def maz_centroids(whale): + df = input.read_input_table(whale, "maz_centroids") if not df.index.is_monotonic_increasing: df = df.sort_index() @@ -102,11 +104,13 @@ def maz_centroids(): return df -@inject.table() -def proto_disaggregate_accessibility(): +@workflow_table +def proto_disaggregate_accessibility(whale): # Read existing accessibilities, but is not required to enable model compatibility - df = input.read_input_table("proto_disaggregate_accessibility", required=False) + df = input.read_input_table( + whale, "proto_disaggregate_accessibility", required=False + ) # If no df, return empty dataframe to skip this model if not df: @@ -119,21 +123,26 @@ def proto_disaggregate_accessibility(): logger.info("loaded proto_disaggregate_accessibility %s" % (df.shape,)) # replace table function with dataframe - inject.add_table("proto_disaggregate_accessibility", df) + whale.add_table("proto_disaggregate_accessibility", df) return df -@inject.table() -def disaggregate_accessibility(persons, households, land_use, accessibility): +@workflow_table +def disaggregate_accessibility(whale): """ This step initializes pre-computed disaggregate accessibility and merges it onto the full synthetic population. Function adds merged all disaggregate accessibility tables to the pipeline but returns nothing. """ + persons = whale.get_dataframe("persons") + households = whale.get_dataframe("households") + land_use = whale.get_dataframe("land_use") + accessibility = whale.get_dataframe("accessibility") + # If disaggregate_accessibilities do not exist in the pipeline, it will try loading csv of that name - proto_accessibility_df = pipeline.get_table("proto_disaggregate_accessibility") + proto_accessibility_df = whale.get_dataframe("proto_disaggregate_accessibility") # If there is no table, skip. 
We do this first to skip as fast as possible if proto_accessibility_df.empty: diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index e0a42f63b..53943ccca 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -6,16 +6,22 @@ import pandas as pd -from activitysim.core import inject, mem, pipeline, tracing -from activitysim.core.input import read_input_table +from ...core import inject, mem, pipeline, tracing +from ...core.input import read_input_table +from ...core.pipeline import Whale +from ...core.workflow import workflow_table +from ..misc import override_hh_ids logger = logging.getLogger(__name__) -@inject.table() -def households(households_sample_size, override_hh_ids, trace_hh_id): +@workflow_table +def households(whale: Whale): + households_sample_size = whale.settings.households_sample_size + _override_hh_ids = override_hh_ids(whale) + _trace_hh_id = whale.settings.trace_hh_id - df_full = read_input_table("households") + df_full = read_input_table(whale, "households") tot_households = df_full.shape[0] logger.info("full household list contains %s households" % tot_households) @@ -23,30 +29,30 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): households_sliced = False # only using households listed in override_hh_ids - if override_hh_ids is not None: + if _override_hh_ids is not None: # trace_hh_id will not used if it is not in list of override_hh_ids logger.info( - "override household list containing %s households" % len(override_hh_ids) + "override household list containing %s households" % len(_override_hh_ids) ) - df = df_full[df_full.index.isin(override_hh_ids)] + df = df_full[df_full.index.isin(_override_hh_ids)] households_sliced = True - if df.shape[0] < len(override_hh_ids): + if df.shape[0] < len(_override_hh_ids): logger.info( "found %s of %s households in override household list" - % (df.shape[0], len(override_hh_ids)) + % (df.shape[0], len(_override_hh_ids)) ) if df.shape[0] == 0: raise RuntimeError("No override households found in store") # if we are tracing hh exclusively - elif trace_hh_id and households_sample_size == 1: + elif _trace_hh_id and households_sample_size == 1: # df contains only trace_hh (or empty if not in full store) - df = tracing.slice_ids(df_full, trace_hh_id) + df = tracing.slice_ids(df_full, _trace_hh_id) households_sliced = True # if we need a subset of full store @@ -66,27 +72,30 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): if the pipeline rng's base_seed is changed """ - prng = pipeline.get_rn_generator().get_external_rng("sample_households") + prng = whale.get_rn_generator().get_external_rng("sample_households") df = df_full.take( prng.choice(len(df_full), size=households_sample_size, replace=False) ) households_sliced = True # if tracing and we missed trace_hh in sample, but it is in full store - if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index: + if ( + _trace_hh_id + and _trace_hh_id not in df.index + and _trace_hh_id in df_full.index + ): # replace first hh in sample with trace_hh logger.debug( "replacing household %s with %s in household sample" - % (df.index[0], trace_hh_id) + % (df.index[0], _trace_hh_id) ) - df_hh = df_full.loc[[trace_hh_id]] + df_hh = df_full.loc[[_trace_hh_id]] df = pd.concat([df_hh, df[1:]]) else: df = df_full - # persons table - inject.add_injectable("households_sliced", households_sliced) + whale.set("households_sliced", households_sliced) if 
"sample_rate" not in df.columns: if households_sample_size == 0: @@ -102,12 +111,12 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): logger.debug("households.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("households", df) + whale.add_table("households", df) - pipeline.get_rn_generator().add_channel("households", df) + whale.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table("households", df) - if trace_hh_id: + tracing.register_traceable_table(whale, "households", df) + if _trace_hh_id: tracing.trace_df(df, "raw.households", warn_if_empty=True) return df diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py index 1161bfe5d..9865803b8 100644 --- a/activitysim/abm/tables/landuse.py +++ b/activitysim/abm/tables/landuse.py @@ -6,15 +6,16 @@ from activitysim.core import config, inject from activitysim.core.input import read_input_table -logger = logging.getLogger(__name__) +from ...core.workflow import workflow_table +logger = logging.getLogger(__name__) -@inject.table() -def land_use(): - df = read_input_table("land_use") +@workflow_table +def land_use(whale): + df = read_input_table(whale, "land_use") - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow if sharrow_enabled: # when using sharrow, the land use file must be organized (either in raw # form or via recoding) so that the index is zero-based and contiguous @@ -33,20 +34,16 @@ def land_use(): buffer = io.StringIO() df.info(buf=buffer) logger.debug("land_use.info:\n" + buffer.getvalue()) - - # replace table function with dataframe - inject.add_table("land_use", df) - return df inject.broadcast("land_use", "households", cast_index=True, onto_on="home_zone_id") -@inject.table() -def land_use_taz(): +@workflow_table +def land_use_taz(whale): - df = read_input_table("land_use_taz") + df = read_input_table(whale, "land_use_taz") if not df.index.is_monotonic_increasing: df = df.sort_index() @@ -57,6 +54,6 @@ def land_use_taz(): logger.debug("land_use_taz.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("land_use_taz", df) + whale.tableset.store_data("land_use_taz", df) return df diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index a3d3804cf..0652efcac 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -5,27 +5,29 @@ import pandas as pd -from activitysim.core import inject, pipeline, tracing -from activitysim.core.input import read_input_table +from ...core import inject, pipeline, tracing +from ...core.input import read_input_table +from ...core.workflow import workflow_table logger = logging.getLogger(__name__) -def read_raw_persons(households): +def read_raw_persons(whale, households): - df = read_input_table("persons") + df = read_input_table(whale, "persons") - if inject.get_injectable("households_sliced", False): + if whale.get_injectable("households_sliced", False): # keep only persons in the sampled households df = df[df.household_id.isin(households.index)] return df -@inject.table() -def persons(households, trace_hh_id): - - df = read_raw_persons(households) +@workflow_table +def persons(whale): + households = whale.get_dataframe("households") + trace_hh_id = whale.settings.trace_hh_id + df = read_raw_persons(whale, households) logger.info("loaded persons %s" % (df.shape,)) buffer = io.StringIO() @@ -33,11 +35,11 @@ def persons(households, trace_hh_id): 
logger.debug("persons.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("persons", df) + whale.add_table("persons", df) - pipeline.get_rn_generator().add_channel("persons", df) + whale.get_rn_generator().add_channel("persons", df) - tracing.register_traceable_table("persons", df) + tracing.register_traceable_table(whale, "persons", df) if trace_hh_id: tracing.trace_df(df, "raw.persons", warn_if_empty=True) @@ -72,20 +74,62 @@ def persons(households, trace_hh_id): # another common merge for persons -@inject.table() -def persons_merged( - persons, households, land_use, accessibility, disaggregate_accessibility -): - - if not disaggregate_accessibility.to_frame().empty: - tables = [ +# @inject.table() +# def persons_merged( +# persons, households, land_use, accessibility, disaggregate_accessibility +# ): +# +# if not disaggregate_accessibility.to_frame().empty: +# tables = [ +# persons, +# households, +# land_use, +# accessibility, +# disaggregate_accessibility, +# ] +# else: +# tables = [persons, households, land_use, accessibility] +# +# return inject.merge_tables(persons.name, tables=tables) + + +@workflow_table +def persons_merged(whale): + + land_use = whale.get_dataframe("land_use") + households = whale.get_dataframe("households") + accessibility = whale.get_dataframe("accessibility") + persons = whale.get_dataframe("persons") + disaggregate_accessibility = whale.get_dataframe("disaggregate_accessibility") + + households = pd.merge( + households, + land_use, + left_on="home_zone_id", + right_index=True, + suffixes=("_households", "_land_use"), + ) + households = pd.merge( + households, + accessibility, + left_on="home_zone_id", + right_index=True, + suffixes=("_households", "_accessibility"), + ) + persons = pd.merge( + persons, + households, + left_on="household_id", + right_index=True, + suffixes=("_persons", "_households"), + ) + if not disaggregate_accessibility.empty: + persons = pd.merge( persons, - households, - land_use, - accessibility, disaggregate_accessibility, - ] - else: - tables = [persons, households, land_use, accessibility] + left_on="person_id", + right_index=True, + suffixes=("_persons", "_disaggregate_accessibility"), + ) - return inject.merge_tables(persons.name, tables=tables) + return persons diff --git a/activitysim/abm/tables/shadow_pricing.py b/activitysim/abm/tables/shadow_pricing.py index 3a3ba7f56..c9dbb4335 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -9,9 +9,12 @@ import numpy as np import pandas as pd -from activitysim.abm.tables.size_terms import tour_destination_size_terms -from activitysim.core import config, inject, logit, tracing, util -from activitysim.core.input import read_input_table +from ...abm.tables.size_terms import tour_destination_size_terms +from ...core import config, inject, logit, tracing, util +from ...core.input import read_input_table +from ...core.pipeline import Whale +from ...core.workflow import workflow_step +from .size_terms import size_terms as get_size_terms logger = logging.getLogger(__name__) @@ -81,6 +84,7 @@ def size_table_name(model_selector): class ShadowPriceCalculator(object): def __init__( self, + whale: Whale, model_settings, num_processes, shared_data=None, @@ -106,14 +110,14 @@ def __init__( """ self.num_processes = num_processes - self.use_shadow_pricing = bool(config.setting("use_shadow_pricing")) + self.use_shadow_pricing = bool(whale.settings.use_shadow_pricing) self.saved_shadow_price_file_path = ( None # set 
by read_saved_shadow_prices if loaded ) self.model_selector = model_settings["MODEL_SELECTOR"] - if (self.num_processes > 1) and not config.setting("fail_fast"): + if (self.num_processes > 1) and not whale.settings.fail_fast: # if we are multiprocessing, then fail_fast should be true or we will wait forever for failed processes logger.warning( "deprecated combination of multiprocessing and not fail_fast" @@ -128,14 +132,16 @@ def __init__( self.modeled_size = None if self.use_shadow_pricing: - self.shadow_settings = config.read_model_settings("shadow_pricing.yaml") + self.shadow_settings = whale.filesystem.read_model_settings( + "shadow_pricing.yaml" + ) for k in self.shadow_settings: logger.debug( "shadow_settings %s: %s" % (k, self.shadow_settings.get(k)) ) - full_model_run = config.setting("households_sample_size") == 0 + full_model_run = whale.settings.households_sample_size == 0 if ( self.use_shadow_pricing and not full_model_run @@ -199,7 +205,9 @@ def __init__( if self.shadow_settings["LOAD_SAVED_SHADOW_PRICES"]: # read_saved_shadow_prices logs error and returns None if file not found - self.shadow_prices = self.read_saved_shadow_prices(model_settings) + self.shadow_prices = self.read_saved_shadow_prices( + whale, model_settings + ) if self.shadow_prices is None: self.max_iterations = self.shadow_settings.get("MAX_ITERATIONS", 5) @@ -270,7 +278,7 @@ def __init__( ), f"{target} is not in landuse columns: {land_use.columns}" self.target[segment] = land_use[target] - def read_saved_shadow_prices(self, model_settings): + def read_saved_shadow_prices(self, whale, model_settings): """ Read saved shadow_prices from csv file in data_dir (so-called warm start) returns None if no saved shadow price file name specified or named file not found @@ -292,7 +300,7 @@ def read_saved_shadow_prices(self, model_settings): ) if saved_shadow_price_file_name: # FIXME - where should we look for this file? - file_path = config.data_file_path( + file_path = whale.filesystem.get_data_file_path( saved_shadow_price_file_name, mandatory=False ) if file_path: @@ -1225,16 +1233,16 @@ def load_shadow_price_calculator(model_settings): # first define add_size_tables as an orca step with no scale argument at all. -@inject.step() -def add_size_tables(disaggregate_suffixes): - return _add_size_tables(disaggregate_suffixes) +@workflow_step +def add_size_tables(whale, disaggregate_suffixes): + return _add_size_tables(whale, disaggregate_suffixes) # then define _add_size_tables as a second method which also offers an optional # default argument to not scale sizes. This is used only in disaggregate # accessibility (for now) and is not called via orca. We need to do this to # avoid having to create a new orca variable for the scale argument. -def _add_size_tables(disaggregate_suffixes, scale=True): +def _add_size_tables(whale, disaggregate_suffixes, scale=True): """ inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace) @@ -1254,9 +1262,9 @@ def _add_size_tables(disaggregate_suffixes, scale=True): (size table) counts. 
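[Editor's note] A simplified sketch of the scaling idea described in the docstring above: raw segment size terms are rescaled so each segment's total matches the chooser count in the (possibly sampled) synthetic population. The real code handles additional details (shadow pricing warm starts, full-run shortcuts) that are omitted here; all numbers are illustrative.

    import pandas as pd

    raw_size = pd.DataFrame(
        {"work_low": [100.0, 300.0], "work_high": [200.0, 400.0]},
        index=[1, 2],  # zone ids
    )
    segment_chooser_counts = pd.Series({"work_low": 40, "work_high": 60})

    scale = segment_chooser_counts / raw_size.sum()
    scaled_size = raw_size * scale
    # each segment column now sums (within float precision) to its chooser count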
""" - use_shadow_pricing = bool(config.setting("use_shadow_pricing")) + use_shadow_pricing = bool(whale.settings.use_shadow_pricing) - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = whale.filesystem.read_model_settings("shadow_pricing.yaml") shadow_pricing_models = shadow_settings.get("shadow_pricing_models") if shadow_pricing_models is None: @@ -1290,7 +1298,7 @@ def _add_size_tables(disaggregate_suffixes, scale=True): for model_selector, model_name in shadow_pricing_models.items(): - model_settings = config.read_model_settings(model_name) + model_settings = whale.filesystem.read_model_settings(model_name) if suffix is not None and roots: model_settings = util.suffix_tables_in_settings( @@ -1306,19 +1314,19 @@ def _add_size_tables(disaggregate_suffixes, scale=True): chooser_table_name = model_settings["CHOOSER_TABLE_NAME"] chooser_segment_column = model_settings["CHOOSER_SEGMENT_COLUMN_NAME"] - choosers_df = inject.get_table(chooser_table_name).to_frame() + choosers_df = whale.get_dataframe(chooser_table_name) if "CHOOSER_FILTER_COLUMN_NAME" in model_settings: choosers_df = choosers_df[ choosers_df[model_settings["CHOOSER_FILTER_COLUMN_NAME"]] != 0 ] # - raw_desired_size - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + land_use = whale.get_dataframe("land_use") + size_terms = get_size_terms(whale) raw_size = tour_destination_size_terms(land_use, size_terms, model_selector) assert set(raw_size.columns) == set(segment_ids.keys()) - full_model_run = config.setting("households_sample_size") == 0 + full_model_run = whale.settings.households_sample_size == 0 scale_size_table = scale and scale_size_table @@ -1381,10 +1389,10 @@ def _add_size_tables(disaggregate_suffixes, scale=True): scaled_size.index.is_monotonic_increasing ), f"size table {size_table_name(model_selector)} not is_monotonic_increasing" - inject.add_table(size_table_name(model_selector), scaled_size, replace=True) + whale.add_table(size_table_name(model_selector), scaled_size) -def get_shadow_pricing_info(): +def get_shadow_pricing_info(whale): """ return dict with info about dtype and shapes of desired and modeled size tables @@ -1401,7 +1409,7 @@ def get_shadow_pricing_info(): land_use = inject.get_table("land_use") size_terms = inject.get_injectable("size_terms") - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = whale.filesystem.read_model_settings("shadow_pricing.yaml") # shadow_pricing_models is dict of {: } shadow_pricing_models = shadow_settings.get("shadow_pricing_models", {}) @@ -1428,7 +1436,7 @@ def get_shadow_pricing_info(): return shadow_pricing_info -def get_shadow_pricing_choice_info(): +def get_shadow_pricing_choice_info(whale): """ return dict with info about dtype and shapes of desired and modeled size tables @@ -1444,7 +1452,7 @@ def get_shadow_pricing_choice_info(): persons = read_input_table("persons") - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = whale.filesystem.read_model_settings("shadow_pricing.yaml") # shadow_pricing_models is dict of {: } shadow_pricing_models = shadow_settings.get("shadow_pricing_models", {}) diff --git a/activitysim/abm/tables/size_terms.py b/activitysim/abm/tables/size_terms.py index e31b004e5..d6710afe9 100644 --- a/activitysim/abm/tables/size_terms.py +++ b/activitysim/abm/tables/size_terms.py @@ -5,14 +5,14 @@ import numpy as np import pandas as pd -from activitysim.core import config, inject +from 
...core.workflow import workflow_cached_object logger = logging.getLogger(__name__) -@inject.injectable(cache=True) -def size_terms(): - f = config.config_file_path("destination_choice_size_terms.csv") +@workflow_cached_object +def size_terms(whale): + f = whale.filesystem.get_config_file_path("destination_choice_size_terms.csv") return pd.read_csv(f, comment="#", index_col="segment") @@ -57,7 +57,7 @@ def tour_destination_size_terms(land_use, size_terms, model_selector): Parameters ---------- - land_use - pipeline table + land_use - pd.DataFrame size_terms - pipeline table model_selector - str @@ -78,8 +78,6 @@ def tour_destination_size_terms(land_use, size_terms, model_selector): ... """ - land_use = land_use.to_frame() - # don't count on land_use being sorted by index if not land_use.index.is_monotonic_increasing: land_use = land_use.sort_index() diff --git a/activitysim/abm/tables/skims.py b/activitysim/abm/tables/skims.py index 39440b29f..85cf719b1 100644 --- a/activitysim/abm/tables/skims.py +++ b/activitysim/abm/tables/skims.py @@ -6,6 +6,9 @@ from activitysim.core import config, inject, los from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from ...core.pipeline import Whale +from ...core.workflow import workflow_cached_object + logger = logging.getLogger(__name__) """ @@ -13,8 +16,8 @@ """ -@inject.injectable(cache=True) -def network_los_preload(): +@workflow_cached_object +def network_los_preload(whale) -> los.Network_LOS: # when multiprocessing with shared data mp_tasks has to call network_los methods # allocate_shared_skim_buffers() and load_shared_data() BEFORE network_los.load_data() @@ -24,21 +27,21 @@ def network_los_preload(): return nw_los -@inject.injectable(cache=True) -def network_los(network_los_preload): +@workflow_cached_object +def network_los(whale, network_los_preload: los.Network_LOS) -> los.Network_LOS: logger.debug("loading network_los injectable") network_los_preload.load_data() return network_los_preload -@inject.injectable(cache=True) -def skim_dict(network_los): +@workflow_cached_object +def skim_dict(whale, network_los): return network_los.get_default_skim_dict() -@inject.injectable() -def log_settings(): +@workflow_cached_object +def log_settings(whale): # abm settings to log on startup return [ diff --git a/activitysim/abm/tables/time_windows.py b/activitysim/abm/tables/time_windows.py index 1a6279173..36c95e927 100644 --- a/activitysim/abm/tables/time_windows.py +++ b/activitysim/abm/tables/time_windows.py @@ -6,16 +6,20 @@ import numpy as np import pandas as pd -from activitysim.core import config, inject -from activitysim.core import timetable as tt +from ...core import config, inject +from ...core import timetable as tt +from ...core.pipeline import Whale +from ...core.workflow import workflow_cached_object, workflow_table logger = logging.getLogger(__name__) -@inject.injectable(cache=True) -def tdd_alts(): +@workflow_cached_object +def tdd_alts(whale) -> pd.DataFrame: # right now this file just contains the start and end hour - file_path = config.config_file_path("tour_departure_and_duration_alternatives.csv") + file_path = whale.filesystem.get_config_file_path( + "tour_departure_and_duration_alternatives.csv" + ) df = pd.read_csv(file_path) df["duration"] = df.end - df.start @@ -26,9 +30,8 @@ def tdd_alts(): return df -@inject.injectable(cache=True) -def tdd_alt_segments(): - +@workflow_cached_object +def tdd_alt_segments(whale: Whale) -> pd.DataFrame: # tour_purpose,time_period,start,end # work,EA,3,5 # work,AM,6,8 @@ -36,12 
+39,11 @@ def tdd_alt_segments(): # school,PM,15,17 # school,EV,18,22 - file_path = config.config_file_path( + file_path = whale.filesystem.get_config_file_path( "tour_departure_and_duration_segments.csv", mandatory=False ) if file_path: - df = pd.read_csv(file_path, comment="#") # - NARROW @@ -54,18 +56,18 @@ def tdd_alt_segments(): return df -@inject.table() -def person_windows(persons, tdd_alts): - +@workflow_table +def person_windows( + whale: Whale, + persons: pd.DataFrame, + tdd_alts: pd.DataFrame, +) -> pd.DataFrame: df = tt.create_timetable_windows(persons, tdd_alts) - inject.add_table("person_windows", df) - return df @inject.injectable() def timetable(person_windows, tdd_alts): - logging.debug("@inject timetable") return tt.TimeTable(person_windows.to_frame(), tdd_alts, person_windows.name) diff --git a/activitysim/abm/test/conftest.py b/activitysim/abm/test/conftest.py index f14a69149..98b6f0eea 100644 --- a/activitysim/abm/test/conftest.py +++ b/activitysim/abm/test/conftest.py @@ -11,7 +11,7 @@ @pytest.fixture(scope="module") def initialize_pipeline( module: str, tables: dict[str, str], initialize_network_los: bool -) -> pipeline.Pipeline: +) -> pipeline.Whale: test_dir = os.path.join("test", module) configs_dir = os.path.join(test_dir, "configs") data_dir = os.path.join(test_dir, "data") diff --git a/activitysim/abm/test/test_misc/test_summarize.py b/activitysim/abm/test/test_misc/test_summarize.py index 6f76e1f2c..cbbae9807 100644 --- a/activitysim/abm/test/test_misc/test_summarize.py +++ b/activitysim/abm/test/test_misc/test_summarize.py @@ -48,7 +48,7 @@ def initialize_network_los() -> bool: return True -def test_summarize(initialize_pipeline: pipeline.Pipeline, caplog): +def test_summarize(initialize_pipeline: pipeline.Whale, caplog): # Run summarize model caplog.set_level(logging.DEBUG) pipeline.run(models=["summarize"]) diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index bb3c909b0..f3c1cf812 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -11,7 +11,7 @@ from ..cli.create import get_example from ..cli.run import INJECTABLES, config, pipeline from ..core import inject, tracing -from ..core.pipeline import open_pipeline, run_model +from ..core.pipeline import Whale from . import workspace logger = logging.getLogger(__name__) @@ -56,6 +56,7 @@ def component_logging(component_name): def setup_component( + whale, component_name, working_dir=".", preload_injectables=(), @@ -124,7 +125,7 @@ def setup_component( # components. Instead, those benchmarks are generated in # aggregate during setup and then extracted from logs later. else: - open_pipeline(resume_after, mode="r") + whale.open_pipeline(resume_after, mode="r") for k in preload_injectables: if inject.get_injectable(k, None) is not None: @@ -154,7 +155,7 @@ def setup_component( logger.info("setup_component completed: %s", component_name) -def run_component(component_name): +def run_component(whale, component_name): logger.info("run_component: %s", component_name) try: if config.setting("multiprocess", False): @@ -166,7 +167,7 @@ def run_component(component_name): # components. Instead, those benchmarks are generated in # aggregate during setup and then extracted from logs later. 
else: - run_model(component_name) + whale.run_model(component_name) except Exception as err: logger.exception("run_component exception: %s", component_name) raise @@ -175,21 +176,21 @@ def run_component(component_name): return 0 -def teardown_component(component_name): +def teardown_component(whale, component_name): logger.info("teardown_component: %s", component_name) # use the pipeline module to clear out all the orca tables, so # the next benchmark run has a clean slate. # anything needed should be reloaded from the pipeline checkpoint file - pipeline_tables = pipeline.registered_tables() + pipeline_tables = whale.registered_tables() for table_name in pipeline_tables: logger.info("dropping table %s", table_name) - pipeline.drop_table(table_name) + whale.drop_table(table_name) if config.setting("multiprocess", False): raise NotImplementedError("multiprocess benchmarking is not yet implemented") else: - pipeline.close_pipeline() + whale.close_pipeline() logger.critical( "teardown_component completed: %s\n\n%s\n\n", component_name, "~" * 88 ) @@ -197,6 +198,7 @@ def teardown_component(component_name): def pre_run( + whale, model_working_dir, configs_dirs=None, data_dir="data", @@ -301,8 +303,8 @@ def pre_run( logger.info("run multi-process complete simulation") else: logger.info("run single process simulation") - pipeline.run(models=config.setting("models")) - pipeline.close_pipeline() + whale.run(models=config.setting("models")) + whale.close_pipeline() tracing.print_elapsed_time("prerun required models for checkpointing", t0) @@ -324,10 +326,10 @@ def run_multiprocess(): injectables = {k: inject.get_injectable(k) for k in INJECTABLES} mp_tasks.run_multiprocess(injectables) - assert not pipeline.is_open() - - if config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() + # assert not pipeline.is_open() + # + # if config.setting("cleanup_pipeline_after_run", False): + # pipeline.cleanup_pipeline() ######## diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index 71677e8ad..89520ecdc 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -11,6 +11,9 @@ from activitysim.core import chunk, config, inject, mem, pipeline, tracing +from ..core.configuration import FileSystem, Settings +from ..core.pipeline import Whale + logger = logging.getLogger(__name__) @@ -97,15 +100,18 @@ def add_run_args(parser, multiprocess=True): metavar="(N)", nargs="?", type=int, - help="run multiprocess. Adds configs_mp settings" - " by default. Optionally give a number of processes," - " which will override the settings file.", + help="run multiprocess. Adds configs_mp settings " + "by default as the first config directory, but only if it is found" + "and is not already explicitly included elsewhere in the list of " + "configs. Optionally give a number of processes greater than 1, " + "which will override the number of processes written in settings file.", ) -def validate_injectable(name): +def validate_injectable(whale: Whale, name, make_if_missing=False): try: - dir_paths = inject.get_injectable(name) + dir_paths = whale.context.get_formatted(name) + # dir_paths = inject.get_injectable(name) except RuntimeError: # injectable is missing, meaning is hasn't been explicitly set # and defaults cannot be found. 
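[Editor's note] For orientation, the revised --multiprocess behavior described in the help text above amounts to the following illustrative command lines (directory names are placeholders, and flag spellings follow the long-form argument names visible in this patch):

    activitysim run --config configs --data data --output output --multiprocess
        # prepend 'configs_mp' to the configs list if it exists and is not
        # already listed, and take num_processes from the settings file
    activitysim run --config configs --data data --output output --multiprocess 8
        # as above, but force num_processes to 8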
@@ -119,15 +125,18 @@ def validate_injectable(name): for dir_path in dir_paths: if not os.path.exists(dir_path): - sys.exit("Could not find %s '%s'" % (name, os.path.abspath(dir_path))) + if make_if_missing: + os.makedirs(dir_path) + else: + sys.exit("Could not find %s '%s'" % (name, os.path.abspath(dir_path))) return dir_paths -def handle_standard_args(args, multiprocess=True): - def inject_arg(name, value, cache=False): +def handle_standard_args(whale: Whale, args, multiprocess=True): + def inject_arg(name, value): assert name in INJECTABLES - inject.add_injectable(name, value, cache=cache) + whale.context[name] = value if args.working_dir: # activitysim will look in the current working directory for @@ -155,67 +164,111 @@ def inject_arg(name, value, cache=False): inject_arg("imported_extensions", ()) # settings_file_name should be cached or else it gets squashed by config.py - if args.settings_file: - inject_arg("settings_file_name", args.settings_file, cache=True) - - if args.config: - inject_arg("configs_dir", args.config) - - if args.data: - inject_arg("data_dir", args.data) - - if args.output: - inject_arg("output_dir", args.output) + # if args.settings_file: + # inject_arg("settings_file_name", args.settings_file) + # + # if args.config: + # inject_arg("configs_dir", args.config) + # + # if args.data: + # inject_arg("data_dir", args.data) + # + # if args.output: + # inject_arg("output_dir", args.output) + + whale.filesystem = FileSystem.parse_args(args) + + # read settings file + raw_settings = whale.filesystem.read_settings_file( + whale.filesystem.settings_file_name, + mandatory=True, + include_stack=False, + ) - if multiprocess and args.multiprocess: - config_paths = validate_injectable("configs_dir") + # the settings can redefine the cache directories. + cache_dir = raw_settings.pop("cache_dir", None) + if cache_dir: + whale.filesystem.cache_dir = cache_dir + whale.settings = Settings.parse_obj(raw_settings) - if not os.path.exists("configs_mp"): - logger.warning("could not find 'configs_mp'. skipping...") - else: - logger.info("adding 'configs_mp' to config_dir list...") - config_paths.insert(0, "configs_mp") - inject_arg("configs_dir", config_paths) + extra_settings = set(whale.settings.__dict__) - set(Settings.__fields__) - config.override_setting("multiprocess", True) - if args.multiprocess > 0: - config.override_setting("num_processes", args.multiprocess) + if extra_settings: + warnings.warn( + "Writing arbitrary model values as top-level key in settings.yaml " + "is deprecated, make them sub-keys of `other_settings` instead.", + DeprecationWarning, + ) + logger.warning(f"Found the following unexpected settings:") + if whale.settings.other_settings is None: + whale.settings.other_settings = {} + for k in extra_settings: + logger.warning(f" - {k}") + whale.settings.other_settings[k] = getattr(whale.settings, k) + delattr(whale.settings, k) + + if args.multiprocess: + if "configs_mp" not in whale.filesystem.configs_dir: + # when triggering multiprocessing from command arguments, + # add 'configs_mp' as the first config directory, but only + # if it exists, and it is not already explicitly included + # in the set of config directories. + if not whale.filesystem.get_working_subdir("configs_mp").exists(): + logger.warning("could not find 'configs_mp'. 
skipping...") + else: + logger.info("adding 'configs_mp' to config_dir list...") + whale.filesystem.configs_dir = ( + "configs_mp", + ) + whale.filesystem.configs_dir + + whale.settings.multiprocess = True + if args.multiprocess > 1: + # setting --multiprocess to just 1 implies using the number of + # processes discovered in the configs file, while setting to more + # than 1 explicitly overrides that setting + whale.settings.num_processes = args.multiprocess if args.chunk_size: - config.override_setting("chunk_size", int(args.chunk_size)) + whale.settings.chunk_size = int(args.chunk_size) + # config.override_setting("chunk_size", int(args.chunk_size)) if args.chunk_training_mode is not None: - config.override_setting("chunk_training_mode", args.chunk_training_mode) + whale.settings.chunk_training_mode = args.chunk_training_mode + # config.override_setting("chunk_training_mode", args.chunk_training_mode) if args.households_sample_size is not None: - config.override_setting("households_sample_size", args.households_sample_size) + whale.settings.households_sample_size = args.households_sample_size + # config.override_setting("households_sample_size", args.households_sample_size) - for injectable in ["configs_dir", "data_dir", "output_dir"]: - validate_injectable(injectable) + # for injectable in ["configs_dir", "data_dir", "output_dir"]: + # validate_injectable( + # whale, injectable, make_if_missing=(injectable == "output_dir") + # ) if args.pipeline: - inject.add_injectable("pipeline_file_name", args.pipeline) + whale.filesystem.pipeline_file_name = args.pipeline if args.resume: - config.override_setting("resume_after", args.resume) + whale.settings.resume_after = args.resume + return whale -def cleanup_output_files(): - tracing.delete_trace_files() +def cleanup_output_files(whale: Whale): + tracing.delete_trace_files(whale) csv_ignore = [] - if config.setting("memory_profile", False): + if whale.settings.memory_profile: # memory profiling is opened potentially before `cleanup_output_files` # is called, but we want to leave any (newly created) memory profiling # log files that may have just been created. - mem_prof_log = config.log_file_path("memory_profile.csv") + mem_prof_log = config.log_file_path("memory_profile.csv", whale=whale) csv_ignore.append(mem_prof_log) - tracing.delete_output_files("h5") - tracing.delete_output_files("csv", ignore=csv_ignore) - tracing.delete_output_files("txt") - tracing.delete_output_files("yaml") - tracing.delete_output_files("prof") - tracing.delete_output_files("omx") + tracing.delete_output_files(whale, "h5") + tracing.delete_output_files(whale, "csv", ignore=csv_ignore) + tracing.delete_output_files(whale, "txt") + tracing.delete_output_files(whale, "yaml") + tracing.delete_output_files(whale, "prof") + tracing.delete_output_files(whale, "omx") def run(args): @@ -229,6 +282,8 @@ def run(args): int: sys.exit exit code """ + whale = pipeline.Whale() + # register abm steps and other abm-specific injectables # by default, assume we are running activitysim.abm # other callers (e.g. 
populationsim) will have to arrange to register their own steps and injectables @@ -238,17 +293,15 @@ def run(args): from activitysim import abm # noqa: F401 tracing.config_logger(basic=True) - handle_standard_args(args) # possibly update injectables + whale = handle_standard_args(whale, args) # possibly update injectables - if config.setting("rotate_logs", False): - config.rotate_log_directory() + if whale.settings.rotate_logs: + config.rotate_log_directory(whale=whale) - if config.setting("memory_profile", False) and not config.setting( - "multiprocess", False - ): + if whale.settings.memory_profile and not whale.settings.multiprocess: # Memory sidecar is only useful for single process runs # multiprocess runs log memory usage without blocking in the controlling process. - mem_prof_log = config.log_file_path("memory_profile.csv") + mem_prof_log = config.log_file_path("memory_profile.csv", whale=whale) from ..core.memory_sidecar import MemorySidecar memory_sidecar_process = MemorySidecar(mem_prof_log) @@ -256,49 +309,51 @@ def run(args): memory_sidecar_process = None # legacy support for run_list setting nested 'models' and 'resume_after' settings - if config.setting("run_list"): - warnings.warn( - "Support for 'run_list' settings group will be removed.\n" - "The run_list.steps setting is renamed 'models'.\n" - "The run_list.resume_after setting is renamed 'resume_after'.\n" - "Specify both 'models' and 'resume_after' directly in settings config file.", - FutureWarning, - ) - run_list = config.setting("run_list") - if "steps" in run_list: - assert not config.setting( - "models" - ), f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" - config.override_setting("models", run_list["steps"]) - - if "resume_after" in run_list: - assert not config.setting( - "resume_after" - ), f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" - config.override_setting("resume_after", run_list["resume_after"]) + # if config.setting("run_list"): + # warnings.warn( + # "Support for 'run_list' settings group will be removed.\n" + # "The run_list.steps setting is renamed 'models'.\n" + # "The run_list.resume_after setting is renamed 'resume_after'.\n" + # "Specify both 'models' and 'resume_after' directly in settings config file.", + # FutureWarning, + # ) + # run_list = config.setting("run_list") + # if "steps" in run_list: + # assert not config.setting( + # "models" + # ), f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" + # config.override_setting("models", run_list["steps"]) + # + # if "resume_after" in run_list: + # assert not config.setting( + # "resume_after" + # ), f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" 
+ # config.override_setting("resume_after", run_list["resume_after"]) # If you provide a resume_after argument to pipeline.run # the pipeline manager will attempt to load checkpointed tables from the checkpoint store # and resume pipeline processing on the next submodel step after the specified checkpoint - resume_after = config.setting("resume_after", None) + resume_after = whale.settings.resume_after # cleanup if not resuming if not resume_after: - cleanup_output_files() + cleanup_output_files(whale) elif config.setting("cleanup_trace_files_on_resume", False): - tracing.delete_trace_files() + tracing.delete_trace_files(whale) - tracing.config_logger(basic=False) # update using possibly new logging configs - config.filter_warnings() + tracing.config_logger( + basic=False, whale=whale + ) # update using possibly new logging configs + config.filter_warnings(whale) logging.captureWarnings(capture=True) # directories for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info("SETTING %s: %s" % (k, inject.get_injectable(k, None))) + logger.info("SETTING %s: %s" % (k, getattr(whale.filesystem, k, None))) - log_settings = inject.get_injectable("log_settings", {}) + log_settings = whale.settings.log_settings for k in log_settings: - logger.info("SETTING %s: %s" % (k, config.setting(k))) + logger.info("SETTING %s: %s" % (k, getattr(whale.settings, k, None))) # OMP_NUM_THREADS: openmp # OPENBLAS_NUM_THREADS: openblas @@ -335,7 +390,7 @@ def run(args): t0 = tracing.print_elapsed_time() try: - if config.setting("multiprocess", False): + if whale.settings.multiprocess: logger.info("run multiprocess simulation") from activitysim.core import mp_tasks @@ -343,24 +398,24 @@ def run(args): injectables = {k: inject.get_injectable(k) for k in INJECTABLES} mp_tasks.run_multiprocess(injectables) - assert not pipeline.is_open() + assert not whale.is_open - if config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() + if whale.settings.cleanup_pipeline_after_run: + whale.cleanup_pipeline() else: logger.info("run single process simulation") - pipeline.run( - models=config.setting("models"), + whale.run( + models=whale.settings.models, resume_after=resume_after, memory_sidecar_process=memory_sidecar_process, ) - if config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() # has side effect of closing open pipeline + if whale.settings.cleanup_pipeline_after_run: + whale.cleanup_pipeline() # has side effect of closing open pipeline else: - pipeline.close_pipeline() + whale.close_pipeline() mem.log_global_hwm() # main process except Exception: @@ -385,7 +440,6 @@ def run(args): if __name__ == "__main__": - from activitysim import abm # register injectables # noqa: F401 parser = argparse.ArgumentParser() diff --git a/activitysim/core/assign.py b/activitysim/core/assign.py index f8e73adc6..5ab604e61 100644 --- a/activitysim/core/assign.py +++ b/activitysim/core/assign.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, config, pipeline, util +from activitysim.core import chunk, pipeline, util logger = logging.getLogger(__name__) @@ -135,7 +135,7 @@ def write(self, msg): ) -def local_utilities(): +def local_utilities(whale): """ Dict of useful modules and functions to provides as locals for use in eval of expressions @@ -150,12 +150,12 @@ def local_utilities(): "np": np, "reindex": util.reindex, "reindex_i": util.reindex_i, - "setting": config.setting, + "setting": lambda *arg: 
whale.settings._get_attr(*arg), "other_than": util.other_than, - "rng": pipeline.get_rn_generator(), + "rng": whale.get_rn_generator(), } - utility_dict.update(config.get_global_constants()) + utility_dict.update(whale.get_global_constants()) return utility_dict @@ -173,6 +173,7 @@ def is_temp(target): def assign_variables( + whale, assignment_expressions, df, locals_dict, @@ -218,8 +219,9 @@ def assign_variables( variables : pandas.DataFrame Will have the index of `df` and columns named by target and containing the result of evaluating expression - trace_df : pandas.DataFrame or None + trace_results : pandas.DataFrame or None a dataframe containing the eval result values for each assignment expression + trace_assigned_locals : dict or None """ np_logger = NumpyLogger(logger) @@ -250,7 +252,7 @@ def to_series(x): trace_assigned_locals = OrderedDict() # avoid touching caller's passed-in locals_d parameter (they may be looping) - _locals_dict = local_utilities() + _locals_dict = local_utilities(whale) if locals_dict is not None: _locals_dict.update(locals_dict) if df_alias: @@ -279,7 +281,7 @@ def to_series(x): from activitysim.core import pipeline try: - random_draws = pipeline.get_rn_generator().normal_for_df( + random_draws = whale.get_rn_generator().normal_for_df( df, broadcast=True, size=n_randoms ) except RuntimeError: @@ -297,7 +299,7 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): _locals_dict["rng_lognormal"] = rng_lognormal - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 6634b479d..788aa41ba 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -710,14 +710,22 @@ def run(self): log_rss(self.trace_label) -class ChunkSizer(object): +class ChunkSizer: """ """ - def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): + def __init__( + self, + chunk_tag, + trace_label, + num_choosers=0, + chunk_size=0, + chunk_training_mode="disabled", + ): self.depth = len(CHUNK_SIZERS) + 1 + self.chunk_training_mode = chunk_training_mode - if chunk_training_mode() != MODE_CHUNKLESS: + if self.chunk_training_mode != MODE_CHUNKLESS: if chunk_metric() == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) else: @@ -739,8 +747,8 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): ) # give parent a complementary log_rss reading entering sub context else: self.rss, self.uss = 0, 0 - chunk_size = 0 - config.override_setting("chunk_size", 0) + # config.override_setting("chunk_size", 0) + return self.chunk_tag = chunk_tag self.trace_label = trace_label @@ -764,7 +772,7 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): self.cum_overhead = {m: 0 for m in METRICS} # if production mode, to reduce volatility, initialize cum_overhead and cum_rows from cache - if chunk_training_mode() in [MODE_ADAPTIVE, MODE_PRODUCTION]: + if self.chunk_training_mode in [MODE_ADAPTIVE, MODE_PRODUCTION]: cached_history = _HISTORIAN.cached_history_for_chunk_tag(self.chunk_tag) if cached_history: self.cum_overhead = {m: cached_history[m] for m in METRICS} @@ -788,8 +796,11 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): def close(self): + if self.chunk_training_mode == MODE_CHUNKLESS: + return + if ((self.depth == 1) or 
WRITE_SUBCHUNK_HISTORY) and ( - chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + self.chunk_training_mode not in (MODE_PRODUCTION, MODE_CHUNKLESS) ): _HISTORIAN.write_history(self.history, self.chunk_tag) @@ -850,7 +861,7 @@ def initial_rows_per_chunk(self): ) estimated_number_of_chunks = None - if chunk_training_mode() == MODE_PRODUCTION: + if self.chunk_training_mode == MODE_PRODUCTION: warnings.warn( "ActivitySim is running with a chunk_training_mode of " f"'production' but initial_row_size is zero in {self.trace_label}" @@ -883,7 +894,7 @@ def adaptive_rows_per_chunk(self, i): prev_rss = self.rss prev_uss = self.uss - if chunk_training_mode() != MODE_PRODUCTION: + if self.chunk_training_mode != MODE_PRODUCTION: if chunk_metric() == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) @@ -897,7 +908,7 @@ def adaptive_rows_per_chunk(self, i): rows_remaining = self.num_choosers - prev_rows_processed - if chunk_training_mode() == MODE_PRODUCTION: + if self.chunk_training_mode == MODE_PRODUCTION: # since overhead changes we don't necessarily want the same number of rows per chunk every time # but we do use the row_size from cache which we trust is stable # which is stored in self.initial_row_size because initial_rows_per_chunk used it for the first chunk @@ -973,7 +984,7 @@ def adaptive_rows_per_chunk(self, i): # input() - if chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS): + if self.chunk_training_mode not in (MODE_PRODUCTION, MODE_CHUNKLESS): self.cum_rows += self.rows_per_chunk return self.rows_per_chunk, estimated_number_of_chunks @@ -982,7 +993,7 @@ def adaptive_rows_per_chunk(self, i): def ledger(self): # don't do anything in chunkless mode - if chunk_training_mode() == MODE_CHUNKLESS: + if self.chunk_training_mode == MODE_CHUNKLESS: yield return @@ -1039,16 +1050,68 @@ def ledger(self): CHUNK_LEDGERS.pop() self.chunk_ledger = None + def log_rss(self, trace_label, force=False): + + if self.chunk_training_mode == MODE_CHUNKLESS: + # no memory tracing at all in chunkless mode + return + + assert len(CHUNK_LEDGERS) > 0, f"log_rss called without current chunker." + + hwm_trace_label = f"{trace_label}.log_rss" + + if self.chunk_training_mode == MODE_PRODUCTION: + # FIXME - this trace_memory_info call slows things down a lot so it is turned off for now + # trace_ticks = 0 if force else mem.MEM_TRACE_TICK_LEN + # mem.trace_memory_info(hwm_trace_label, trace_ticks=trace_ticks) + return + + rss, uss = mem.trace_memory_info(hwm_trace_label) + + # check local hwm for all ledgers + with ledger_lock: + for c in CHUNK_LEDGERS: + c.check_local_hwm(hwm_trace_label, rss, uss, total_bytes=None) + + def log_df(self, trace_label, table_name, df): + + if self.chunk_training_mode in (MODE_PRODUCTION, MODE_CHUNKLESS): + return + + assert len(CHUNK_LEDGERS) > 0, f"log_df called without current chunker." 
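With chunk_training_mode now carried on the ChunkSizer instance, memory logging becomes a method call on the active sizer and is skipped entirely in chunkless or production mode. A sketch of how a chunked loop is expected to use these methods (chunk_sizer and choosers_chunk are placeholders for whatever the caller yields and builds):

    chunk_sizer.log_rss(trace_label)
    chunk_sizer.log_df(trace_label, "choosers", choosers_chunk)
    # ... do the chunk's work ...
    chunk_sizer.log_df(trace_label, "choosers", None)  # signal that the table was released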
+ + op = "del" if df is None else "add" + hwm_trace_label = f"{trace_label}.{op}.{table_name}" + + rss, uss = mem.trace_memory_info(hwm_trace_label) + + cur_chunker = CHUNK_LEDGERS[-1] + + # registers this df and recalc total_bytes + cur_chunker.log_df(table_name, df) + + total_bytes = sum([c.total_bytes for c in CHUNK_LEDGERS]) + + # check local hwm for all ledgers + with ledger_lock: + for c in CHUNK_LEDGERS: + c.check_local_hwm(hwm_trace_label, rss, uss, total_bytes) + @contextmanager -def chunk_log(trace_label, chunk_tag=None, base=False): +def chunk_log(trace_label, chunk_tag=None, base=False, settings=None): # With `base=True` this method can be used to instantiate # a ChunkSizer class object without actually chunking. This # avoids breaking the assertion below. - if chunk_training_mode() == MODE_CHUNKLESS: - yield + if settings is None: + _chunk_training_mode = chunk_training_mode() + else: + _chunk_training_mode = settings.chunk_training_mode + + if _chunk_training_mode == MODE_CHUNKLESS: + yield ChunkSizer("chunkless", trace_label, 0, 0, _chunk_training_mode) return assert base == (len(CHUNK_SIZERS) == 0) @@ -1059,15 +1122,17 @@ def chunk_log(trace_label, chunk_tag=None, base=False): num_choosers = 0 chunk_size = 0 - chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) + chunk_sizer = ChunkSizer( + chunk_tag, trace_label, num_choosers, chunk_size, _chunk_training_mode + ) chunk_sizer.initial_rows_per_chunk() with chunk_sizer.ledger(): - yield + yield chunk_sizer - if chunk_training_mode() != MODE_CHUNKLESS: + if _chunk_training_mode != MODE_CHUNKLESS: chunk_sizer.adaptive_rows_per_chunk(1) chunk_sizer.close() diff --git a/activitysim/core/config.py b/activitysim/core/config.py index 7024d0512..1557c2457 100644 --- a/activitysim/core/config.py +++ b/activitysim/core/config.py @@ -10,7 +10,10 @@ import yaml -from activitysim.core import inject, util +from ..core import inject, util +from .exceptions import SettingsFileNotFoundError +from .pipeline import Whale +from .workflow.util import get_formatted_or_default logger = logging.getLogger(__name__) @@ -387,9 +390,16 @@ def trace_file_path(file_name): return file_path -def log_file_path(file_name, prefix=True): +def log_file_path(file_name, prefix=True, whale: Whale = None): - output_dir = inject.get_injectable("output_dir") + if whale is not None: + output_dir = whale.filesystem.get_output_dir() + prefix = prefix and get_formatted_or_default( + whale.context, "log_file_prefix", None + ) + else: + output_dir = inject.get_injectable("output_dir") + prefix = prefix and inject.get_injectable("log_file_prefix", None) # - check if running asv and if so, log to commit-specific subfolder asv_commit = os.environ.get("ASV_COMMIT", None) @@ -402,7 +412,6 @@ def log_file_path(file_name, prefix=True): output_dir = os.path.join(output_dir, "log") # - check for optional process name prefix - prefix = prefix and inject.get_injectable("log_file_prefix", None) if prefix: file_name = "%s-%s" % (prefix, file_name) @@ -429,9 +438,12 @@ def open_log_file(file_name, mode, header=None, prefix=False): return f -def rotate_log_directory(): +def rotate_log_directory(whale=None): - output_dir = inject.get_injectable("output_dir") + if whale is not None: + output_dir = whale.context.get_formatted("output_dir") + else: + output_dir = inject.get_injectable("output_dir") log_dir = os.path.join(output_dir, "log") if not os.path.exists(log_dir): return @@ -489,7 +501,7 @@ def read_settings_file( ---------- file_name mandatory: booelan - if 
true, raise SettingsFileNotFound exception if no settings file, otherwise return empty dict + if true, raise SettingsFileNotFoundError if no settings file, otherwise return empty dict include_stack: boolean or list only used for recursive calls to provide list of files included so far to detect cycles @@ -626,7 +638,7 @@ def backfill_settings(settings, backfill): settings["source_file_paths"] = source_file_paths if mandatory and not settings: - raise SettingsFileNotFound(file_name, configs_dir_list) + raise SettingsFileNotFoundError(file_name, configs_dir_list) # Adds proto_ suffix for disaggregate accessibilities if args.SUFFIX is not None and args.ROOTS: @@ -666,12 +678,17 @@ def base_settings_file_path(file_name): raise RuntimeError("base_settings_file %s not found" % file_name) -def filter_warnings(): +def filter_warnings(whale=None): """ set warning filter to 'strict' if specified in settings """ - if setting("strict", False): # noqa: E402 + if whale is None: + strict = setting("strict", False) + else: + strict = whale.settings.treat_warnings_as_errors + + if strict: # noqa: E402 warnings.filterwarnings("error", category=Warning) warnings.filterwarnings( "default", category=PendingDeprecationWarning, module="future" diff --git a/activitysim/core/configuration/__init__.py b/activitysim/core/configuration/__init__.py index 5cdd2f69c..724752432 100644 --- a/activitysim/core/configuration/__init__.py +++ b/activitysim/core/configuration/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +from .filesystem import FileSystem from .network import * from .top import * diff --git a/activitysim/core/configuration/filesystem.py b/activitysim/core/configuration/filesystem.py new file mode 100644 index 000000000..4ba64be87 --- /dev/null +++ b/activitysim/core/configuration/filesystem.py @@ -0,0 +1,535 @@ +import glob +import logging +import os +import time +from pathlib import Path + +import yaml +from pydantic import DirectoryPath, validator + +from ..exceptions import SettingsFileNotFoundError +from ..util import parse_suffix_args, suffix_tables_in_settings +from .base import PydanticBase + +logger = logging.getLogger(__name__) + + +class FileSystem(PydanticBase): + """ + Manage finding and loading files for ActivitySim's command line interface. + """ + + working_dir: DirectoryPath = None + """ + Name of the working directory. + + All other directories (configs, data, output, cache), when given as relative + paths, are assumed to be relative to this working directory. If it is not + provided, the usual Python current working directory is used. + """ + + configs_dir: tuple[Path] = ("configs",) + """ + Name[s] of the config directory. + """ + + @validator("configs_dir") + def configs_dirs_must_exist(cls, configs_dir, values): + working_dir = values.get("working_dir", None) or Path.cwd() + for c in configs_dir: + c_full = working_dir.joinpath(c) + if not c_full.exists(): + raise ValueError(f"config directory {c_full} does not exist") + + data_dir: tuple[Path] = ("data",) + """ + Name of the data directory. + """ + + @validator("data_dir") + def data_dirs_must_exist(cls, data_dir, values): + working_dir = values.get("working_dir", None) or Path.cwd() + for d in data_dir: + d_full = working_dir.joinpath(d) + if not d_full.exists(): + raise ValueError(f"data directory {d_full} does not exist") + + output_dir: Path = "output" + """ + Name of the output directory. + + This directory will be created on access if it does not exist. 
+ """ + + profile_dir: Path = None + """ + Name of the output directory for pyinstrument profiling files. + + If not given, a unique time-stamped directory will be created inside + the usual output directory. + """ + + cache_dir: Path = None + """ + Name of the output directory for cache files. + + If not given, a directory named "cache" will be created inside + the usual output directory. + """ + + settings_file_name: str = "settings.yaml" + + pipeline_file_name: str = "pipeline" + """ + The name for the base pipeline file or directory. + + To use the HDF5 pipeline file format, include a '.h5' file extension. + Otherwise, the default parquet file format is used. + """ + + @classmethod + def parse_args(cls, args): + self = cls() + + def _parse_arg(name, x): + v = getattr(args, x, None) + if v is not None: + setattr(self, name, v) + + _parse_arg("working_dir", "working_dir") + _parse_arg("settings_file_name", "settings_file") + _parse_arg("configs_dir", "config") + _parse_arg("data_dir", "data") + _parse_arg("output_dir", "output") + + return self + + def get_working_subdir(self, subdir) -> Path: + if self.working_dir: + return self.working_dir.joinpath(subdir) + else: + return Path(subdir) + + def get_output_dir(self, subdir=None) -> Path: + """ + Get an output directory, creating it if needed. + + Parameters + ---------- + subdir : Path-like, optional + If given, get this subdirectory of the output_dir. + + Returns + ------- + Path + """ + out = self.get_working_subdir(self.output_dir) + if subdir is not None: + out = out.joinpath(subdir) + if not out.exists(): + out.mkdir(parents=True) + return out + + def get_pipeline_filepath(self) -> Path: + """ + Get the complete path to the pipeline file or directory. + + Returns + ------- + Path + """ + return self.get_output_dir().joinpath(self.pipeline_file_name) + + def get_profiling_file_path(self, file_name) -> Path: + """ + Get the complete path to a profile output file. + + Parameters + ---------- + file_name : str + Base name of the profiling output file. + + Returns + ------- + Path + """ + if self.profile_dir is None: + profile_dir = self.get_output_dir( + time.strftime("profiling--%Y-%m-%d--%H-%M-%S") + ) + profile_dir.mkdir(parents=True, exist_ok=True) + self.profile_dir = profile_dir + return self.profile_dir.joinpath(file_name) + + def get_log_file_path(self, file_name) -> Path: + """ + Get the complete path to a log file. + + Parameters + ---------- + file_name : str + Base name of the log file. + + Returns + ------- + Path + """ + + output_dir = self.get_output_dir() + + # - check if running asv and if so, log to commit-specific subfolder + asv_commit = os.environ.get("ASV_COMMIT", None) + if asv_commit: + output_dir = os.path.join(output_dir, f"log-{asv_commit}") + os.makedirs(output_dir, exist_ok=True) + + # - check for optional log subfolder + if os.path.exists(os.path.join(output_dir, "log")): + output_dir = os.path.join(output_dir, "log") + + file_path = os.path.join(output_dir, file_name) + + return Path(file_path) + + def get_cache_dir(self, subdir=None) -> Path: + """ + Get the cache directory, creating it if needed. + + The cache directory is used to store: + - skim memmaps created by skim+dict_factories + - tvpb tap_tap table cache + - pre-compiled sharrow modules + + + Parameters + ---------- + subdir : Path-like, optional + If given, get this subdirectory of the output_dir. 
+ + Returns + ------- + Path + """ + if self.cache_dir is None: + out = self.get_output_dir("cache") + else: + out = self.get_working_subdir(self.cache_dir) + if subdir is not None: + out = out.joinpath(subdir) + if not out.exists(): + out.mkdir(parents=True) + + # create a git-ignore in the cache dir if it does not exist. + # this helps prevent accidentally committing cache contents to git + gitignore = out.joinpath(".gitignore") + if not gitignore.exists(): + gitignore.write_text("/**") + + return out + + def _cascading_input_file_path( + self, file_name, dir_list_injectable_name, mandatory=True, allow_glob=False + ) -> Path: + """ + Find the first matching file among a group of directories. + + Parameters + ---------- + file_name : Path-like + The name of the file to match. + dir_list_injectable_name : {'configs_dir', 'data_dir'} + The group of directories to search. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. + + Returns + ------- + Path or None + """ + + dir_paths = getattr(self, dir_list_injectable_name) + dir_paths = [dir_paths] if isinstance(dir_paths, str) else dir_paths + + file_path = None + if file_name is not None: + for dir in dir_paths: + p = os.path.join(dir, file_name) + if os.path.isfile(p): + file_path = p + break + + if allow_glob and len(glob.glob(p)) > 0: + file_path = p + break + + if mandatory and not file_path: + raise FileNotFoundError( + "file_path %s: file '%s' not in %s" + % (dir_list_injectable_name, file_name, dir_paths) + ) + + return Path(file_path) if file_path else None + + def get_configs_dir(self) -> tuple[Path]: + """ + Get the configs directories. + + Returns + ------- + tuple[Path] + """ + return tuple(self.get_working_subdir(i) for i in self.configs_dir) + + def get_config_file_path(self, file_name, mandatory=True, allow_glob=False) -> Path: + """ + Find the first matching file among config directories. + + Parameters + ---------- + file_name : Path-like + The name of the file to match. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. + + Returns + ------- + Path or None + """ + return self._cascading_input_file_path( + file_name, "configs_dir", mandatory, allow_glob + ) + + def get_data_file_path(self, file_name, mandatory=True, allow_glob=False) -> Path: + """ + Find the first matching file among data directories. + + Parameters + ---------- + file_name : Path-like + The name of the file to match. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. 
+ + Returns + ------- + Path or None + """ + return self._cascading_input_file_path( + file_name, "data_dir", mandatory, allow_glob + ) + + def open_log_file(self, file_name, mode, header=None, prefix=False): + if prefix: + file_name = f"{prefix}-{file_name}" + file_path = self.get_log_file_path(file_name) + + want_header = header and not os.path.exists(file_path) + + f = open(file_path, mode) + + if want_header: + assert mode in [ + "a", + "w", + ], f"open_log_file: header requested but mode was {mode}" + print(header, file=f) + + return f + + def read_settings_file( + self, + file_name, + mandatory=True, + include_stack=False, + configs_dir_list=None, + validator_class=None, + ): + """ + Load settings from one or more yaml files. + + This method will look for first occurrence of a yaml file named + in the directories in configs_dir list, and + read settings from that yaml file. + + Settings file may contain directives that affect which file settings + are returned: + + - inherit_settings (boolean) + If found and set to true, this method will backfill settings + in the current file with values from the next settings file + in configs_dir list (if any) + - include_settings: string + Read settings from specified include_file in place of the current + file. To avoid confusion, this directive must appear ALONE in the + target file, without any additional settings or directives. + + Parameters + ---------- + file_name : str + mandatory : boolean, default True + If true, raise SettingsFileNotFoundError if no matching settings file + is found in any config directory, otherwise this method will return + an empty dict or an all-default instance of the validator class. + include_stack : boolean or list + Only used for recursive calls, provides a list of files included + so far to detect and prevent cycles. + validator_class : pydantic.BaseModel, optional + This model is used to validate the loaded settings. 
+ + Returns + ------- + dict or validator_class + """ + + def backfill_settings(settings, backfill): + new_settings = backfill.copy() + new_settings.update(settings) + return new_settings + + if configs_dir_list is None: + configs_dir_list = self.get_configs_dir() + assert len(configs_dir_list) == len( + set(configs_dir_list) + ), f"repeating file names not allowed in config_dir list: {configs_dir_list}" + + args = parse_suffix_args(file_name) + file_name = args.filename + + assert isinstance(args.ROOTS, list) + assert (args.SUFFIX is not None and args.ROOTS) or ( + args.SUFFIX is None and not args.ROOTS + ), ( + "Expected to find both 'ROOTS' and 'SUFFIX' in %s, missing one" + % args.filename + ) + + if not file_name.lower().endswith(".yaml"): + file_name = "%s.yaml" % (file_name,) + + inheriting = False + settings = {} + if isinstance(include_stack, list): + source_file_paths = include_stack.copy() + else: + source_file_paths = [] + for dir in configs_dir_list: + file_path = os.path.join(dir, file_name) + if os.path.exists(file_path): + if inheriting: + # we must be inheriting + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, file_path) + ) + inheriting = True + + assert ( + file_path not in source_file_paths + ), f"read_settings_file - recursion in reading 'file_path' after loading: {source_file_paths}" + + with open(file_path) as f: + s = yaml.load(f, Loader=yaml.SafeLoader) + if s is None: + s = {} + + settings = backfill_settings(settings, s) + + # maintain a list of files we read from to improve error message when an expected setting is not found + source_file_paths += [file_path] + + include_file_name = s.get("include_settings", False) + if include_file_name: + # FIXME - prevent users from creating borgesian garden of branching paths? + # There is a lot of opportunity for confusion if this feature were over-used + # Maybe we insist that a file with an include directive is the 'end of the road' + # essentially the current settings firle is an alias for the included file + if len(s) > 1: + logger.error( + "'include_settings' must appear alone in settings file." + ) + additional_settings = list( + set(s.keys()).difference({"include_settings"}) + ) + logger.error( + f"Unexpected additional settings: {additional_settings}" + ) + raise RuntimeError( + "'include_settings' must appear alone in settings file." 
+ ) + + logger.debug( + "including settings for %s from %s" + % (file_name, include_file_name) + ) + + # recursive call to read included file INSTEAD of the file with include_settings sepcified + s, source_file_paths = self.read_settings_file( + include_file_name, + mandatory=True, + include_stack=source_file_paths, + ) + + # FIXME backfill with the included file + settings = backfill_settings(settings, s) + + # we are done as soon as we read one file successfully + # unless if inherit_settings is set to true in this file + + if not s.get("inherit_settings", False): + break + + # if inheriting, continue and backfill settings from the next existing settings file configs_dir_list + + inherit_settings = s.get("inherit_settings") + if isinstance(inherit_settings, str): + inherit_file_name = inherit_settings + assert ( + os.path.join(dir, inherit_file_name) not in source_file_paths + ), f"circular inheritance of {inherit_file_name}: {source_file_paths}: " + # make a recursive call to switch inheritance chain to specified file + + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, inherit_file_name) + ) + s, source_file_paths = self.read_settings_file( + inherit_file_name, + mandatory=True, + include_stack=source_file_paths, + configs_dir_list=configs_dir_list, + ) + + # backfill with the inherited file + settings = backfill_settings(settings, s) + break # break the current inheritance chain (not as bad luck as breaking a chain-letter chain?...) + + if len(source_file_paths) > 0: + settings["source_file_paths"] = source_file_paths + + if mandatory and not settings: + raise SettingsFileNotFoundError(file_name, configs_dir_list) + + # Adds proto_ suffix for disaggregate accessibilities + if args.SUFFIX is not None and args.ROOTS: + settings = suffix_tables_in_settings(settings, args.SUFFIX, args.ROOTS) + + if validator_class is not None: + settings = validator_class.parse_obj(settings) + + if include_stack: + # if we were called recursively, return an updated list of source_file_paths + return settings, source_file_paths + + else: + return settings + + read_model_settings = read_settings_file diff --git a/activitysim/core/configuration/top.py b/activitysim/core/configuration/top.py index f956bb6ea..6bf0a5e74 100644 --- a/activitysim/core/configuration/top.py +++ b/activitysim/core/configuration/top.py @@ -1,3 +1,6 @@ +from pathlib import Path +from typing import Any, Literal + from .base import PydanticBase, Union @@ -9,7 +12,7 @@ class InputTable(PydanticBase): tablename: str """Name of the injected table""" - filename: str = None + filename: Path = None """ Name of the CSV or HDF5 file to read. @@ -65,9 +68,23 @@ class InputTable(PydanticBase): and retained. """ + drop_columns: list[str] = None + """ + Columns to drop once read in to memory. + + Save only the columns needed for modeling or analysis to save on memory + and file I/O. If not given, all columns in the input file will be read + and retained. + """ + h5_tablename: str = None """table name if reading from HDF5 and different from `tablename`""" + dtypes: dict[str, str] = None + """ + dtypes for loaded columns + """ + class OutputTable(PydanticBase): tablename: str @@ -179,7 +196,7 @@ class MultiprocessStep(PydanticBase): """Instructions on how to slice tables for each subprocess.""" -class Settings(PydanticBase): +class Settings(PydanticBase, extra="allow"): """ The overall settings for the ActivitySim model system. @@ -192,7 +209,7 @@ class Settings(PydanticBase): the model. 
""" - models: list[str] + models: list[str] = None """ list of model steps to run - auto ownership, tour frequency, etc. @@ -210,13 +227,13 @@ class Settings(PydanticBase): half the number of available CPU cores, plus 1. """ - multiprocess_steps: list[MultiprocessStep] + multiprocess_steps: list[MultiprocessStep] = None """A list of multiprocess steps.""" resume_after: str = None """to resume running the data pipeline after the last successful checkpoint""" - input_table_list: list[InputTable] + input_table_list: list[InputTable] = None """list of table names, indices, and column re-maps for each table in `input_store`""" input_store: str = None @@ -235,9 +252,9 @@ class Settings(PydanticBase): If omitted or set to 0, ActivitySim will simulate all households. """ - trace_hh_id: Union[int, list] = None + trace_hh_id: int = None """ - Trace household id(s) + Trace this household id If omitted, no tracing is written out """ @@ -249,7 +266,9 @@ class Settings(PydanticBase): If omitted, no tracing is written out. """ - chunk_training_mode: str = None + chunk_training_mode: Literal[ + "disabled", "training", "production", "adaptive" + ] = "disabled" """ The method to use for chunk training. @@ -271,6 +290,16 @@ class Settings(PydanticBase): See :ref:`chunk_size`. """ + keep_chunk_logs: bool = True + """ + Whether to keep chunk logs when deleting other files. + """ + + default_initial_rows_per_chunk: int = 100 + """ + Default number of rows to use in initial chunking. + """ + checkpoints: Union[bool, list] = True """ When to write checkpoint (intermediate table states) to disk. @@ -457,3 +486,75 @@ class Settings(PydanticBase): """ keep_mem_logs: bool = False + + pipeline_complib: str = None + """ + Compression library to use when storing pipeline tables in an HDF5 file. + + .. versionadded:: 1.3 + """ + + treat_warnings_as_errors: bool = False + """ + Treat most warnings as errors. + + Use of this setting is not recommended outside of rigorous testing regimes. + + .. versionadded:: 1.3 + """ + + log_settings: tuple[str] = ( + "households_sample_size", + "chunk_size", + "chunk_method", + "chunk_training_mode", + "multiprocess", + "num_processes", + "resume_after", + "trace_hh_id", + "memory_profile", + "instrument", + ) + """ + Setting to log on startup. + """ + + hh_ids: Path = None + """ + Load only the household ids given in this file. + + The file need only contain the desired households ids, nothing else. + If given as a relative path (or just a file name), both the data and + config directories are searched, in that order, for the matching file. + """ + + source_file_paths: list[Path] = None + """ + A list of source files from which these settings were loaded. + + This value should not be set by the user within the YAML settings files, + instead it is populated as those files are loaded. It is primarily + provided for debugging purposes, and does not actually affect the operation + of the model. + """ + + inherit_settings: Union[bool, Path] = None + """ + Instruction on if and how to find other files that can provide settings. + + When this value is True, all config directories are searched in order for + additional files with the same filename. If other files are found they + are also loaded, but only settings values that are not already explicitly + set are applied. Alternatives, set this to a different file name, in which + case settings from that other file are loaded (again, backfilling unset + values only). 
Once the settings files are loaded, this value does not + have any other effect on the operation of the model(s). + """ + + other_settings: dict[str, Any] = None + + def _get_attr(self, attr): + try: + return getattr(self, attr) + except: + return self.other_settings.get(attr) diff --git a/activitysim/core/exceptions.py b/activitysim/core/exceptions.py new file mode 100644 index 000000000..c25e1547a --- /dev/null +++ b/activitysim/core/exceptions.py @@ -0,0 +1,31 @@ +class PipelineError(ValueError): + """General class for errors in using a Pipeline.""" + + +class PipelineAccessError(PipelineError): + """Error trying to access a pipeline feature that is not yet initialized.""" + + +class TableTypeError(TypeError): + """Unable to return data in the format requested.""" + + +class DuplicateWorkflowNameError(ValueError): + """More than one workflow function is defined with the same name""" + + +class DuplicateWorkflowTableError(ValueError): + """More than one loadable table is defined with the same name""" + + +class DuplicateLoadableObjectError(ValueError): + """More than one loadable object is defined with the same name""" + + +class SettingsFileNotFoundError(FileNotFoundError): + def __init__(self, file_name, configs_dir): + self.file_name = file_name + self.configs_dir = configs_dir + + def __str__(self): + return repr(f"Settings file '{self.file_name}' not found in {self.configs_dir}") diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index 728b6d440..a72361740 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -2,17 +2,13 @@ # See full license in LICENSE.txt. import logging -from activitysim.core import assign, config, inject, simulate, tracing -from activitysim.core.util import ( - assign_in_place, - parse_suffix_args, - suffix_expressions_df_str, -) +from . import assign, config, simulate, tracing +from .util import assign_in_place, parse_suffix_args, suffix_expressions_df_str logger = logging.getLogger(__name__) -def compute_columns(df, model_settings, locals_dict={}, trace_label=None): +def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None): """ Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals @@ -80,7 +76,7 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): ) expressions_spec = assign.read_assignment_spec( - config.config_file_path(expressions_spec_name) + whale.filesystem.get_config_file_path(expressions_spec_name), ) if suffix is not None and roots: @@ -90,7 +86,7 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): "Expected to find some assignment expressions in %s" % expressions_spec_name ) - tables = {t: inject.get_table(t).to_frame() for t in helper_table_names} + tables = {t: whale.get_dataframe(t) for t in helper_table_names} # if df was passed in, df might be a slice, or any other table, but DF is it's local alias assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name @@ -99,18 +95,22 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): # be nice and also give it to them as df? tables["df"] = df - _locals_dict = assign.local_utilities() + _locals_dict = assign.local_utilities(whale) _locals_dict.update(locals_dict) _locals_dict.update(tables) # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES? 
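compute_columns (and assign_columns below) now take the whale as their first argument and pull helper tables, settings, and skims from it rather than from orca. A hedged example of the new call pattern, where persons_df and annotate_settings are placeholders for the caller's dataframe and the usual preprocessor settings block:

    results = compute_columns(
        whale,
        df=persons_df,
        model_settings=annotate_settings,
        trace_label="example.annotate_persons",
    )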
- if config.setting("sharrow", False): - _locals_dict["skim_dict"] = inject.get_injectable("skim_dataset_dict", None) + if whale.settings.sharrow: + _locals_dict["skim_dict"] = whale.get("skim_dataset_dict", None) else: - _locals_dict["skim_dict"] = inject.get_injectable("skim_dict", None) + _locals_dict["skim_dict"] = whale.get("skim_dict", None) results, trace_results, trace_assigned_locals = assign.assign_variables( - expressions_spec, df, _locals_dict, trace_rows=tracing.trace_targets(df) + whale, + expressions_spec, + df, + _locals_dict, + trace_rows=tracing.trace_targets(whale, df), ) if trace_results is not None: @@ -122,7 +122,7 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): return results -def assign_columns(df, model_settings, locals_dict={}, trace_label=None): +def assign_columns(whale, df, model_settings, locals_dict={}, trace_label=None): """ Evaluate expressions in context of df and assign resulting target columns to df @@ -135,7 +135,7 @@ def assign_columns(df, model_settings, locals_dict={}, trace_label=None): assert df is not None assert model_settings is not None - results = compute_columns(df, model_settings, locals_dict, trace_label) + results = compute_columns(whale, df, model_settings, locals_dict, trace_label) assign_in_place(df, results) diff --git a/activitysim/core/inject.py b/activitysim/core/inject.py index 208a5658f..e57d28584 100644 --- a/activitysim/core/inject.py +++ b/activitysim/core/inject.py @@ -104,6 +104,7 @@ def merge_tables(target, tables, columns=None): def add_step(name, func): + logger.critical(f"ADD-STEP: {name}") return orca.add_step(name, func) @@ -112,6 +113,7 @@ def add_table(table_name, table, replace=False): Add new table and raise assertion error if the table already exists. Silently replace if replace=True. """ + logger.critical(f"ADD-TABLE: {table_name}") if ( not replace and orca.is_table(table_name) @@ -126,10 +128,12 @@ def add_table(table_name, table, replace=False): # fixme remove? 
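The logger.critical calls added to the orca wrappers above appear to be migration breadcrumbs: they make any remaining add/get traffic through orca stand out in the logs while components are converted to the whale API. If they are too noisy for a particular run, they can be muted on that module's logger; a sketch:

    import logging

    logging.getLogger("activitysim.core.inject").addFilter(
        lambda rec: not rec.getMessage().startswith(("ADD-", "GET-", "DEL-"))
    )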
def add_column(table_name, column_name, column, cache=False): + logger.critical(f"ADD-COLUMN: {table_name}[{column_name}]") return orca.add_column(table_name, column_name, column, cache=cache) def add_injectable(name, injectable, cache=False): + logger.critical(f"ADD-INJECTABLE: {name}") return orca.add_injectable(name, injectable, cache=cache) @@ -158,7 +162,7 @@ def broadcast( def get_table(name, default=_NO_DEFAULT): - + logger.critical(f"GET-TABLE: {name}") if orca.is_table(name) or default == _NO_DEFAULT: return orca.get_table(name) else: @@ -171,7 +175,7 @@ def is_injectable(name): def get_injectable(name, default=_NO_DEFAULT): - + logger.critical(f"GET-INJECTABLE: {name}") if is_injectable(name) or default == _NO_DEFAULT: return orca.get_injectable(name) else: @@ -179,7 +183,7 @@ def get_injectable(name, default=_NO_DEFAULT): def remove_injectable(name): - + logger.critical(f"DEL-INJECTABLE: {name}") orca._INJECTABLES.pop(name, None) diff --git a/activitysim/core/input.py b/activitysim/core/input.py index 41bfdc1c0..ec9bd0ff7 100644 --- a/activitysim/core/input.py +++ b/activitysim/core/input.py @@ -7,17 +7,20 @@ import pandas as pd -from activitysim.core import config, inject, util +from ..core import inject, util +from ..core.configuration import FileSystem, InputTable, Settings logger = logging.getLogger(__name__) def canonical_table_index_name(table_name): - table_index_names = inject.get_injectable("canonical_table_index_names", None) + from ..abm.models.util import canonical_ids + + table_index_names = canonical_ids.CANONICAL_TABLE_INDEX_NAMES return table_index_names and table_index_names.get(table_name, None) -def read_input_table(tablename, required=True): +def read_input_table(whale, tablename, required=True): """Reads input table name and returns cleaned DataFrame. Uses settings found in input_table_list in global settings file @@ -25,21 +28,22 @@ def read_input_table(tablename, required=True): Parameters ---------- tablename : string + settings : Whale Returns ------- pandas DataFrame """ - table_list = config.setting("input_table_list") + table_list = whale.settings.input_table_list assert table_list is not None, "no input_table_list found in settings" table_info = None for info in table_list: - if info["tablename"] == tablename: + if info.tablename == tablename: table_info = info if table_info is not None: - df = read_from_table_info(table_info) + df = read_from_table_info(table_info, whale) else: if required: raise RuntimeError( @@ -50,7 +54,7 @@ def read_input_table(tablename, required=True): return df -def read_from_table_info(table_info): +def read_from_table_info(table_info: InputTable, whale): """ Read input text files and return cleaned up DataFrame. 
@@ -65,28 +69,23 @@ def read_from_table_info(table_info): +--------------+----------------------------------------------------------+ | filename | name of csv file to read (in data_dir) | +--------------+----------------------------------------------------------+ - | column_map | list of input columns to rename from_name: to_name | - +--------------+----------------------------------------------------------+ | index_col | name of column to set as dataframe index column | +--------------+----------------------------------------------------------+ - | drop_columns | list of column names of columns to drop | - +--------------+----------------------------------------------------------+ | h5_tablename | name of target table in HDF5 file | +--------------+----------------------------------------------------------+ """ - input_store = config.setting("input_store", None) - create_input_store = config.setting("create_input_store", default=False) - - tablename = table_info.get("tablename") - data_filename = table_info.get("filename", input_store) - h5_tablename = table_info.get("h5_tablename") or tablename - drop_columns = table_info.get("drop_columns", None) - column_map = table_info.get("column_map", None) - keep_columns = table_info.get("keep_columns", None) - rename_columns = table_info.get("rename_columns", None) - recode_columns = table_info.get("recode_columns", None) - csv_dtypes = table_info.get("dtypes", {}) + input_store = whale.settings.input_store + create_input_store = whale.settings.create_input_store + + tablename = table_info.tablename + data_filename = table_info.filename or input_store + h5_tablename = table_info.h5_tablename or tablename + keep_columns = table_info.keep_columns + drop_columns = table_info.drop_columns + rename_columns = table_info.rename_columns + recode_columns = table_info.recode_columns + csv_dtypes = table_info.dtypes or {} # don't require a redundant index_col directive for canonical tables # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably, @@ -97,14 +96,14 @@ def read_from_table_info(table_info): if "index_col" in table_info: # honor explicit index_col unless it conflicts with canonical name - index_col = table_info["index_col"] + index_col = table_info.index_col if canonical_index_col: if index_col: # if there is a non-empty index_col directive, it should be for canonical_table_index_name assert ( index_col == canonical_index_col - ), f"{tablename} index_col {table_info.get('index_col')} should be {index_col}" + ), f"{tablename} index_col {table_info.index_col} should be {index_col}" else: logger.info( f"Not assigning canonical index_col {tablename}.{canonical_index_col} " @@ -120,45 +119,37 @@ def read_from_table_info(table_info): assert tablename is not None, "no tablename provided" assert data_filename is not None, "no input file provided" - data_file_path = config.data_file_path(data_filename) + data_file_path = whale.filesystem.get_data_file_path(data_filename) df = _read_input_file( - data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes + str(data_file_path), h5_tablename=h5_tablename, csv_dtypes=csv_dtypes ) # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values)) logger.debug("raw %s table size: %s" % (tablename, util.df_size(df))) if create_input_store: - h5_filepath = config.output_file_path("input_data.h5") - logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) - df.to_hdf(h5_filepath, key=h5_tablename, mode="a") - - csv_dir = 
config.output_file_path("input_data") - if not os.path.exists(csv_dir): - os.makedirs(csv_dir) # make directory if needed - df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) + raise NotImplementedError("the input store functionality has been disabled") + # h5_filepath = config.output_file_path("input_data.h5") + # logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) + # df.to_hdf(h5_filepath, key=h5_tablename, mode="a") + # + # csv_dir = config.output_file_path("input_data") + # if not os.path.exists(csv_dir): + # os.makedirs(csv_dir) # make directory if needed + # df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) if drop_columns: logger.debug("dropping columns: %s" % drop_columns) df.drop(columns=drop_columns, inplace=True, errors="ignore") - if column_map: - warnings.warn( - "table_inf option 'column_map' renamed 'rename_columns'" - "Support for 'column_map' will be removed in future versions.", - FutureWarning, - ) - logger.debug("renaming columns: %s" % column_map) - df.rename(columns=column_map, inplace=True) - # rename columns first, so keep_columns can be a stable list of expected/required columns if rename_columns: logger.debug("renaming columns: %s" % rename_columns) df.rename(columns=rename_columns, inplace=True) # recode columns, can simplify data structure - if recode_columns and config.setting("recode_pipeline_columns", True): + if recode_columns and whale.settings.recode_pipeline_columns: for colname, recode_instruction in recode_columns.items(): logger.info(f"recoding column {colname}: {recode_instruction}") if recode_instruction == "zero-based": @@ -177,10 +168,10 @@ def read_from_table_info(table_info): # We need to keep track if we have recoded the land_use # table's index to zero-based, as we need to disable offset # processing for legacy skim access. - config.override_setting("offset_preprocessing", True) + whale.settings.offset_preprocessing = True else: source_table, lookup_col = recode_instruction.split(".") - parent_table = inject.get_table(source_table) + parent_table = whale.get_dataframe(source_table) try: map_col = parent_table[f"_original_{lookup_col}"] except KeyError: diff --git a/activitysim/core/los.py b/activitysim/core/los.py index 377e106e7..5f7268767 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -7,11 +7,13 @@ import numpy as np import pandas as pd -from activitysim.core import skim_dataset # noqa: F401 -from activitysim.core import config, inject, pathbuilder, skim_dictionary, tracing, util -from activitysim.core.cleaning import recode_based_on_table -from activitysim.core.skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory -from activitysim.core.skim_dictionary import NOT_IN_SKIM_ZONE_ID +from . import skim_dataset # noqa: F401 +from . import config, inject, pathbuilder, skim_dictionary, tracing, util +from .cleaning import recode_based_on_table +from .exceptions import SettingsFileNotFoundError +from .pipeline import Whale +from .skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory +from .skim_dictionary import NOT_IN_SKIM_ZONE_ID skim_factories = { "NumpyArraySkimFactory": NumpyArraySkimFactory, @@ -69,7 +71,7 @@ class Network_LOS(object): tap_tap_uid: TapTapUidCalculator """ - def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): + def __init__(self, whale, los_settings_file_name=LOS_SETTINGS_FILE_NAME): # Note: we require all skims to be of same dtype so they can share buffer - is that ok? 
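Network_LOS construction likewise now requires the whale up front, since load_settings reads network_los.yaml through whale.filesystem rather than the config module. Sketch:

    # the whale supplies both the filesystem (for network_los.yaml) and the settings
    network_los = Network_LOS(whale)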
# fixme is it ok to require skims be all the same type? if so, is this the right choice? @@ -91,7 +93,7 @@ def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): self.tvpb = None self.los_settings_file_name = los_settings_file_name - self.load_settings() + self.load_settings(whale) # dependency injection of skim factory (of type specified in skim_dict_factory setting) skim_dict_factory_name = self.setting("skim_dict_factory") @@ -144,58 +146,17 @@ def setting(self, keys, default=""): else: return default - def load_settings(self): + def load_settings(self, whale: Whale): """ Read setting file and initialize object variables (see class docstring for list of object variables) """ - try: - self.los_settings = config.read_settings_file( - self.los_settings_file_name, mandatory=True - ) - except config.SettingsFileNotFound as e: - - print( - f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings" - ) - print(f"skims_file: {config.setting('skims_file')}") - print(f"skim_time_periods: {config.setting('skim_time_periods')}") - print(f"source_file_paths: {config.setting('source_file_paths')}") - print( - f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}" - ) - - # look for legacy 'skims_file' setting in global settings file - if config.setting("skims_file"): - - warnings.warn( - "Support for 'skims_file' setting in global settings file will be removed." - "Use 'taz_skims' in network_los.yaml config file instead.", - FutureWarning, - ) - - # in which case, we also expect to find skim_time_periods in settings file - skim_time_periods = config.setting("skim_time_periods") - assert ( - skim_time_periods is not None - ), "'skim_time_periods' setting not found." - warnings.warn( - "Support for 'skim_time_periods' setting in global settings file will be removed." 
- "Put 'skim_time_periods' in network_los.yaml config file instead.", - FutureWarning, - ) - - self.los_settings = { - "taz_skims": config.setting("skims_file"), - "zone_system": ONE_ZONE, - "skim_time_periods": skim_time_periods, - } - - else: - raise e + self.los_settings = whale.filesystem.read_settings_file( + self.los_settings_file_name, mandatory=True + ) # validate skim_time_periods - self.skim_time_periods = self.setting("skim_time_periods") + self.skim_time_periods = whale.network_settings.skim_time_periods if "hours" in self.skim_time_periods: self.skim_time_periods["periods"] = self.skim_time_periods.pop("hours") warnings.warn( diff --git a/activitysim/core/mem.py b/activitysim/core/mem.py index ae832f250..5e4d723e3 100644 --- a/activitysim/core/mem.py +++ b/activitysim/core/mem.py @@ -173,7 +173,7 @@ def log_global_hwm(): ) -def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False): +def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False, whale=None): global MEM_TICK @@ -235,9 +235,19 @@ def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False): with mem_log_lock: MEM_LOG_HEADER = "process,pid,rss,full_rss,uss,event,children,time" - with config.open_log_file( - MEM_LOG_FILE_NAME, "a", header=MEM_LOG_HEADER, prefix=True - ) as log_file: + if whale is None: + log_file = config.open_log_file( + MEM_LOG_FILE_NAME, "a", header=MEM_LOG_HEADER, prefix=True + ) + else: + log_file = whale.filesystem.open_log_file( + MEM_LOG_FILE_NAME, + "a", + header=MEM_LOG_HEADER, + prefix=whale.context.get("log_file_prefix", None), + ) + + with log_file: print( f"{process_name}," f"{pid}," diff --git a/activitysim/core/pipeline.py b/activitysim/core/pipeline.py index b8f1f498e..9c24e5f02 100644 --- a/activitysim/core/pipeline.py +++ b/activitysim/core/pipeline.py @@ -1,17 +1,20 @@ # ActivitySim # See full license in LICENSE.txt. +import contextlib import datetime as dt import logging import os from builtins import map, next from pathlib import Path +from typing import Any import pandas as pd from pypyr.context import Context -from ..core.workflow import run_named_step -from . 
import config, inject, mem, random, tracing, util -from .tracing import print_elapsed_time +from ..core.configuration import FileSystem, NetworkSettings, Settings +from ..core.exceptions import PipelineAccessError +from ..core.workflow.steps import run_named_step +from ..core.workflow.util import get_formatted_or_default logger = logging.getLogger(__name__) @@ -34,6 +37,8 @@ # single character prefix for run_list model name to indicate that no checkpoint should be saved NO_CHECKPOINT_PREFIX = "_" +NO_DEFAULT = "throw error if missing" + def split_arg(s, sep, default=""): """ @@ -53,35 +58,175 @@ def split_arg(s, sep, default=""): return arg, val -class Pipeline: - def __init__(self): - self.context = Context() - self.init_state() +class Whale: + def __init__(self, context=None): + if context is None: + self.context = Context() + self.init_state() + elif isinstance(context, Context): + self.context = context + else: + raise TypeError(f"cannot init Whale with {type(context)}") def init_state(self, pipeline_file_format="parquet"): - # most recent checkpoint self.last_checkpoint = {} # array of checkpoint dicts self.checkpoints = [] - self.replaced_tables = {} + from .random import Random - self._rng = random.Random() + self.context["prng"] = Random() self.open_files = {} self.pipeline_store = None self._is_open = False + from .tracing import initialize_traceable_tables + + initialize_traceable_tables(self) + + self.context["_salient_tables"] = {} + + @property + def filesystem(self) -> FileSystem: + try: + return self.context["filesystem"] + except KeyError: + raise PipelineAccessError("filesystem not initialized for this pipeline") - self.context.update(tracing.initialize_traceable_tables()) + @filesystem.setter + def filesystem(self, fs: FileSystem): + if not isinstance(fs, FileSystem): + raise TypeError(f"filesystem must be FileSystem not {type(fs)}") + self.context["filesystem"] = fs - self._TABLES = set() + @property + def settings(self) -> Settings: + try: + return self.context["settings"] + except KeyError: + raise PipelineAccessError("settings not initialized for this pipeline") + + @settings.setter + def settings(self, s: Settings): + if not isinstance(s, Settings): + raise TypeError(f"settings must be Settings not {type(s)}") + self.context["settings"] = s + + @property + def network_settings(self) -> NetworkSettings: + try: + return self.context["network_settings"] + except KeyError: + raise PipelineAccessError( + "network_settings not initialized for this pipeline" + ) + + @network_settings.setter + def network_settings(self, s: NetworkSettings): + if not isinstance(s, NetworkSettings): + raise TypeError(f"settings must be NetworkSettings not {type(s)}") + self.context["network_settings"] = s + + _RUNNABLE_STEPS = {} + _LOADABLE_TABLES = {} + _LOADABLE_OBJECTS = {} + + @property + def known_table_names(self): + return self._LOADABLE_TABLES.keys() | self.existing_table_names + + @property + def existing_table_names(self): + return self.existing_table_status.keys() + + @property + def existing_table_status(self): + return self.context["_salient_tables"] + + def uncheckpointed_table_names(self): + uncheckpointed = [] + for tablename, table_status in self.existing_table_status.items(): + if table_status: + uncheckpointed.append(tablename) + return uncheckpointed + + def load_table(self, tablename, overwrite=False, swallow_errors=False): + """ + Load a table from disk or otherwise programmatically create it. 
+ + Parameters + ---------- + tablename : str + overwrite : bool + swallow_errors : bool + + Returns + ------- + pandas.DataFrame or xarray.Dataset + """ + if tablename in self.existing_table_names and not overwrite: + if swallow_errors: + return + raise ValueError(f"table {tablename} already loaded") + if tablename not in self._LOADABLE_TABLES: + if swallow_errors: + return + raise ValueError(f"table {tablename} has no loading function") + logger.debug(f"loading table {tablename}") + try: + t = self._LOADABLE_TABLES[tablename](self.context) + except PipelineAccessError: + if not swallow_errors: + raise + else: + t = None + if t is not None: + self.add_table(tablename, t) + return t + + def get_dataframe(self, tablename): + t = self.context.get(tablename, None) + if t is None: + t = self.load_table(tablename, swallow_errors=False) + if t is None: + raise KeyError(tablename) + if isinstance(t, pd.DataFrame): + return t + raise TypeError(f"cannot convert {tablename} to DataFrame") + + def access(self, key, initializer): + if key not in self.context: + self.context[key] = initializer + return self.context[key] + + def get(self, key, default: Any = NO_DEFAULT): + if default == NO_DEFAULT: + try: + return self.context.get_formatted(key) + except KeyError: + alt_result = getattr(self.filesystem, key, NO_DEFAULT) + if alt_result == NO_DEFAULT: + raise + else: + return alt_result + else: + return get_formatted_or_default(self.context, key, default) + + def set(self, key, value): + self.context[key] = value + + def extract(self, func): + return func(self) + + get_injectable = get # legacy function name + add_injectable = set # legacy function name def rng(self): - return self._rng + return self.context["prng"] @property def is_open(self): @@ -115,7 +260,7 @@ def close_open_files(self): file.close() self.open_files.clear() - def open_pipeline_store(self, overwrite=False, mode="a"): + def open_pipeline_store(self, pipeline_file_name, overwrite=False, mode="a"): """ Open the pipeline checkpoint store. @@ -143,11 +288,9 @@ def open_pipeline_store(self, overwrite=False, mode="a"): if self.pipeline_store is not None: raise RuntimeError("Pipeline store is already open!") - pipeline_file_path = config.pipeline_file_path( - inject.get_injectable("pipeline_file_name") - ) + pipeline_file_path = self.filesystem.get_pipeline_filepath() - if pipeline_file_path.endswith(".h5"): + if pipeline_file_path.suffix == ".h5": if overwrite: try: if os.path.isfile(pipeline_file_path): @@ -157,7 +300,7 @@ def open_pipeline_store(self, overwrite=False, mode="a"): print(e) logger.warning("Error removing %s: %s" % (pipeline_file_path, e)) - self.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode) + self.pipeline_store = pd.HDFStore(str(pipeline_file_path), mode=mode) else: self.pipeline_store = Path(pipeline_file_path) @@ -184,6 +327,17 @@ def get_rn_generator(self): """ return self.rng() + def get_global_constants(self): + """ + Read global constants from settings file + + Returns + ------- + constants : dict + dictionary of constants to add to locals for use by expressions in model spec + """ + return self.filesystem.read_settings_file("constants.yaml", mandatory=False) + def read_df(self, table_name, checkpoint_name=None): """ Read a pandas dataframe from the pipeline store. 
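The parquet-backed store keeps one subdirectory per table, with one parquet file per checkpoint of that table; the checkpoints table itself is keyed with a checkpoint name of None and so lands in "None.parquet". A rough sketch of how a checkpointed table resolves under that layout (the directory and table/checkpoint names below are illustrative placeholders, not values taken from the patch):

    from pathlib import Path

    import pandas as pd

    def read_checkpointed_table(pipeline_dir, table_name, checkpoint_name=None):
        # one subdirectory per table; one parquet file per checkpoint of that table.
        # checkpoint_name=None (used for the checkpoints table) formats to "None.parquet".
        path = Path(pipeline_dir).joinpath(table_name, f"{checkpoint_name}.parquet")
        return pd.read_parquet(path)

    # e.g. read_checkpointed_table("output/pipeline", "households", "initialize_households")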
@@ -247,7 +401,7 @@ def write_df(self, df, table_name, checkpoint_name=None): store.joinpath(table_name).mkdir(parents=True, exist_ok=True) df.to_parquet(store.joinpath(table_name, f"{checkpoint_name}.parquet")) else: - complib = config.setting("pipeline_complib", None) + complib = self.settings.pipeline_complib if complib is None or len(df.columns) == 0: # tables with no columns can't be compressed successfully, so to # avoid them getting just lost and dropped they are instead written @@ -266,56 +420,58 @@ def write_df(self, df, table_name, checkpoint_name=None): ) store.flush() - def add_table(self, name, content): - self._TABLES.add(name) + def add_table(self, name, content, salient=True): + if salient: + # mark this salient table as edited, so it can be checkpointed + # at some later time if desired. + self.existing_table_status[name] = True self.context.update({name: content}) def is_table(self, name): - return name in self._TABLES - - def rewrap(self, table_name, df=None): - """ - Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table - - if df is None, then get the dataframe from orca (table_name should be registered, or - an error will be thrown) which may involve evaluating added columns, etc. - - If the orca table already exists, deregister it along with any associated columns before - re-registering it. - - The net result is that the dataframe is a registered orca DataFrameWrapper table with no - computed or added columns. - - Parameters - ---------- - table_name - df - - Returns - ------- - the underlying df of the rewrapped table - """ - - logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None))) - - if self.is_table(table_name): - - if df is None: - # # logger.debug("rewrap - orca.get_table(%s)" % (table_name,)) - # t = orca.get_table(table_name) - # df = t.to_frame() - df = self.context.get(table_name) - else: - # logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name,)) - # don't trigger function call of TableFuncWrapper - # t = orca.get_raw_table(table_name) - df = self.context.get(table_name) - - assert df is not None - - self.add_table(table_name, df) - - return df + return name in self.existing_table_status + + # def rewrap(self, table_name, df=None): + # """ + # Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table + # + # if df is None, then get the dataframe from orca (table_name should be registered, or + # an error will be thrown) which may involve evaluating added columns, etc. + # + # If the orca table already exists, deregister it along with any associated columns before + # re-registering it. + # + # The net result is that the dataframe is a registered orca DataFrameWrapper table with no + # computed or added columns. 
+ # + # Parameters + # ---------- + # table_name + # df + # + # Returns + # ------- + # the underlying df of the rewrapped table + # """ + # + # logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None))) + # + # if self.is_table(table_name): + # if df is None: + # # # logger.debug("rewrap - orca.get_table(%s)" % (table_name,)) + # # t = orca.get_table(table_name) + # # df = t.to_frame() + # df = self.context.get(table_name) + # else: + # # logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name,)) + # # don't trigger function call of TableFuncWrapper + # # t = orca.get_raw_table(table_name) + # df = self.context.get(table_name) + # + # assert df is not None + # + # self.add_table(table_name, df) + # + # return df def add_checkpoint(self, checkpoint_name): """ @@ -333,28 +489,14 @@ def add_checkpoint(self, checkpoint_name): logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp)) - for table_name in self.registered_tables(): - - # if we have not already checkpointed it or it has changed - # FIXME - this won't detect if the orca table was modified - if ( - table_name not in self.last_checkpoint - or table_name in self.replaced_tables - ): - df = self.get_table(table_name).to_frame() - else: - continue - - logger.debug( - "add_checkpoint '%s' table '%s' %s" - % (checkpoint_name, table_name, util.df_size(df)) - ) + for table_name in self.uncheckpointed_table_names(): + df = self.get_dataframe(table_name) + logger.debug(f"add_checkpoint {checkpoint_name!r} table {table_name!r}") self.write_df(df, table_name, checkpoint_name) # remember which checkpoint it was last written self.last_checkpoint[table_name] = checkpoint_name - - self.replaced_tables.clear() + self.existing_table_status[table_name] = False self.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name self.last_checkpoint[TIMESTAMP] = timestamp @@ -376,11 +518,7 @@ def registered_tables(self): """ Return a list of the names of all currently registered dataframe tables """ - return [ - name - for name in self._TABLES - if isinstance(self.context.get(name, None), (pd.DataFrame,)) - ] + return [name for name in self.existing_table_status if name in self.context] def checkpointed_tables(self): """ @@ -460,7 +598,7 @@ def load_checkpoint(self, checkpoint_name): ) logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) # register it as an orca table - self.rewrap(table_name, df) + self.add_table(table_name, df) loaded_tables[table_name] = df if table_name == "land_use" and "_original_zone_id" in df.columns: # The presence of _original_zone_id indicates this table index was @@ -469,17 +607,19 @@ def load_checkpoint(self, checkpoint_name): # TODO: this "magic" column name should be replaced with a mechanism # to write and recover particular settings from the pipeline # store, but we don't have that mechanism yet - config.override_setting("offset_preprocessing", True) + self.settings.offset_preprocessing = True # register for tracing in order that tracing.register_traceable_table wants us to register them - traceable_tables = inject.get_injectable("traceable_tables", []) + traceable_tables = self.get_injectable("traceable_tables", []) + + from .tracing import register_traceable_table for table_name in traceable_tables: if table_name in loaded_tables: - tracing.register_traceable_table(table_name, loaded_tables[table_name]) + register_traceable_table(self, table_name, loaded_tables[table_name]) # add tables of known rng channels - rng_channels = inject.get_injectable("rng_channels", []) + 
rng_channels = self.get_injectable("rng_channels", []) if rng_channels: logger.debug("loading random channels %s" % rng_channels) for table_name in rng_channels: @@ -530,14 +670,16 @@ def run_model(self, model_name): else: checkpoint = self.intermediate_checkpoint(model_name) - inject.set_step_args(args) + self.add_injectable("step_args", args) + + self.trace_memory_info(f"pipeline.run_model {model_name} start") - mem.trace_memory_info(f"pipeline.run_model {model_name} start") + from .tracing import print_elapsed_time t0 = print_elapsed_time() logger.info(f"#run_model running step {step_name}") - instrument = config.setting("instrument", None) + instrument = self.settings.instrument if instrument is not None: try: from pyinstrument import Profiler @@ -553,19 +695,19 @@ def run_model(self, model_name): from pyinstrument import Profiler with Profiler() as profiler: - run_named_step(step_name, self.context) - out_file = config.profiling_file_path(f"{step_name}.html") + self.context = run_named_step(step_name, self.context) + out_file = self.filesystem.get_profiling_file_path(f"{step_name}.html") with open(out_file, "wt") as f: f.write(profiler.output_html()) else: - run_named_step(step_name, self.context) + self.context = run_named_step(step_name, self.context) t0 = print_elapsed_time( "#run_model completed step '%s'" % model_name, t0, debug=True ) - mem.trace_memory_info(f"pipeline.run_model {model_name} finished") + self.trace_memory_info(f"pipeline.run_model {model_name} finished") - inject.set_step_args(None) + self.add_injectable("step_args", None) self.rng().end_step(model_name) if checkpoint: @@ -595,12 +737,13 @@ def open_pipeline(self, resume_after=None, mode="a"): self.init_state() self.is_open = True - self.get_rn_generator().set_base_seed(inject.get_injectable("rng_base_seed", 0)) + self.get_rn_generator().set_base_seed(self.get("rng_base_seed", 0)) + pipeline_file_name = self.filesystem.pipeline_file_name if resume_after: # open existing pipeline logger.debug("open_pipeline - open existing pipeline") - self.open_pipeline_store(overwrite=False, mode=mode) + self.open_pipeline_store(pipeline_file_name, overwrite=False, mode=mode) try: self.load_checkpoint(resume_after) except KeyError as err: @@ -613,7 +756,7 @@ def open_pipeline(self, resume_after=None, mode="a"): else: # open new, empty pipeline logger.debug("open_pipeline - new, empty pipeline") - self.open_pipeline_store(overwrite=True) + self.open_pipeline_store(pipeline_file_name, overwrite=True) # - not sure why I thought we needed this? # could have exogenous tables or prng instantiation under some circumstance?? 
self.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME @@ -652,8 +795,7 @@ def close_pipeline(self): logger.debug("close_pipeline") def intermediate_checkpoint(self, checkpoint_name=None): - - checkpoints = config.setting("checkpoints", True) + checkpoints = self.settings.checkpoints if checkpoints is True or checkpoints is False: return checkpoints @@ -664,6 +806,11 @@ def intermediate_checkpoint(self, checkpoint_name=None): return checkpoint_name in checkpoints + def trace_memory_info(self, event): + from .mem import trace_memory_info + + return trace_memory_info(event, whale=self) + def run(self, models, resume_after=None, memory_sidecar_process=None): """ run the specified list of models, optionally loading checkpoint and resuming after specified @@ -687,6 +834,7 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): returns: nothing, but with pipeline open """ + from .tracing import print_elapsed_time t0 = print_elapsed_time() @@ -701,15 +849,15 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): if resume_after in models: models = models[models.index(resume_after) + 1 :] - mem.trace_memory_info("pipeline.run before preload_injectables") + self.trace_memory_info("pipeline.run before preload_injectables") # preload any bulky injectables (e.g. skims) not in pipeline - if inject.get_injectable("preload_injectables", None): - if memory_sidecar_process: - memory_sidecar_process.set_event("preload_injectables") - t0 = print_elapsed_time("preload_injectables", t0) + # if inject.get_injectable("preload_injectables", None): + # if memory_sidecar_process: + # memory_sidecar_process.set_event("preload_injectables") + # t0 = print_elapsed_time("preload_injectables", t0) - mem.trace_memory_info("pipeline.run after preload_injectables") + self.trace_memory_info("pipeline.run after preload_injectables") t0 = print_elapsed_time() for model in models: @@ -717,9 +865,11 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): memory_sidecar_process.set_event(model) t1 = print_elapsed_time() self.run_model(model) - mem.trace_memory_info(f"pipeline.run after {model}") + self.trace_memory_info(f"pipeline.run after {model}") + + from .tracing import log_runtime - tracing.log_runtime(model_name=model, start_time=t1) + log_runtime(self, model_name=model, start_time=t1) if memory_sidecar_process: memory_sidecar_process.set_event("finalizing") @@ -728,7 +878,7 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): if not self.intermediate_checkpoint(): self.add_checkpoint(FINAL_CHECKPOINT_NAME) - mem.trace_memory_info("pipeline.run after run_models") + self.trace_memory_info("pipeline.run after run_models") t0 = print_elapsed_time("run_model (%s models)" % len(models), t0) @@ -768,7 +918,6 @@ def get_table(self, table_name, checkpoint_name=None): # if they want current version of table, no need to read from pipeline store if checkpoint_name is None: - if table_name not in self.last_checkpoint: raise RuntimeError("table '%s' never checkpointed." 
% table_name) @@ -822,16 +971,12 @@ def get_checkpoints(self): else: df = store[CHECKPOINT_TABLE_NAME] else: - pipeline_file_path = config.pipeline_file_path( - self.context.get_formatted("pipeline_file_name") - ) - if pipeline_file_path.endswith(".h5"): + pipeline_file_path = self.filesystem.get_pipeline_filepath() + if pipeline_file_path.suffix == ".h5": df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME) else: df = pd.read_parquet( - Path(pipeline_file_path).joinpath( - CHECKPOINT_TABLE_NAME, "None.parquet" - ) + pipeline_file_path.joinpath(CHECKPOINT_TABLE_NAME, "None.parquet") ) # non-table columns first (column order in df is random because created from a dict) @@ -896,7 +1041,6 @@ def extend_table(self, table_name, df, axis=0): assert axis in [0, 1] if self.is_table(table_name): - table_df = self.get_table(table_name) if axis == 0: @@ -927,22 +1071,18 @@ def extend_table(self, table_name, df, axis=0): return df def drop_table(self, table_name): - assert self.is_open, f"Pipeline is not open." if self.is_table(table_name): - logger.debug("drop_table dropping orca table '%s'" % table_name) self.context.pop(table_name, None) self._TABLES.pop(table_name, None) if table_name in self.replaced_tables: - logger.debug("drop_table forgetting replaced_tables '%s'" % table_name) del self.replaced_tables[table_name] if table_name in self.last_checkpoint: - logger.debug( "drop_table removing table %s from last_checkpoint" % table_name ) @@ -965,19 +1105,17 @@ def cleanup_pipeline(self): """ # we don't expect to be called unless cleanup_pipeline_after_run setting is True - assert config.setting("cleanup_pipeline_after_run", False) + assert self.settings.cleanup_pipeline_after_run if not self.is_open: self.open_pipeline("_") assert self.is_open, f"Pipeline is not open." 
- FINAL_PIPELINE_FILE_NAME = ( - f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" - ) + FINAL_PIPELINE_FILE_NAME = f"final_{self.filesystem.pipeline_file_name}" FINAL_CHECKPOINT_NAME = "final" - final_pipeline_file_path = config.build_output_file_path( + final_pipeline_file_path = self.filesystem.get_output_dir().joinpath( FINAL_PIPELINE_FILE_NAME ) @@ -985,8 +1123,24 @@ def cleanup_pipeline(self): checkpoints_df = self.get_checkpoints().tail(1).copy() checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME - with pd.HDFStore(final_pipeline_file_path, mode="w") as final_pipeline_store: + if final_pipeline_file_path.suffix == ".h5": + with pd.HDFStore( + str(final_pipeline_file_path), mode="w" + ) as final_pipeline_store: + for table_name in self.checkpointed_tables(): + # patch last checkpoint name for all tables + checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME + + table_df = self.get_table(table_name) + logger.debug( + f"cleanup_pipeline - adding table {table_name} {table_df.shape}" + ) + + final_pipeline_store[table_name] = table_df + final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df + self.close_pipeline() + else: for table_name in self.checkpointed_tables(): # patch last checkpoint name for all tables checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME @@ -995,12 +1149,24 @@ def cleanup_pipeline(self): logger.debug( f"cleanup_pipeline - adding table {table_name} {table_df.shape}" ) + table_df.to_parquet( + final_pipeline_file_path.joinpath( + table_name, f"{FINAL_CHECKPOINT_NAME}.parquet" + ) + ) + checkpoints_df.to_parquet( + final_pipeline_file_path.joinpath(CHECKPOINT_TABLE_NAME, "None.parquet") + ) - final_pipeline_store[table_name] = table_df + from .tracing import delete_output_files - final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df + logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") + delete_output_files(self, "h5", ignore=[final_pipeline_file_path]) + # TODO: delete nested directory structure. + delete_output_files(self, "parquet", ignore=[final_pipeline_file_path]) - self.close_pipeline() + # @contextlib.contextmanager + def chunk_log(self, *args, **kwargs): + from .chunk import chunk_log - logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") - tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) + return chunk_log(*args, **kwargs, settings=self.settings) diff --git a/activitysim/core/test/configs/logging.yaml b/activitysim/core/test/configs/logging.yaml index 35067d008..6d02e2ed3 100644 --- a/activitysim/core/test/configs/logging.yaml +++ b/activitysim/core/test/configs/logging.yaml @@ -28,7 +28,7 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: activitysim.log mode: w formatter: simpleFormatter level: NOTSET @@ -51,4 +51,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index 0d9958d6e..d00fe1614 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -14,9 +14,7 @@ import pandas as pd import yaml -from activitysim.core import inject - -from ..core.workflow import workflow_step +from ..core.workflow.steps import workflow_cached_object, workflow_step from . 
import config # Configurations @@ -67,7 +65,7 @@ def print_elapsed_time(msg=None, t0=None, debug=False): return t1 -def log_runtime(model_name, start_time=None, timing=None, force=False): +def log_runtime(whale, model_name, start_time=None, timing=None, force=False): global timing_notes assert (start_time or timing) and not (start_time and timing) @@ -78,50 +76,53 @@ def log_runtime(model_name, start_time=None, timing=None, force=False): process_name = multiprocessing.current_process().name - if config.setting("multiprocess", False) and not force: + if whale.settings.multiprocess and not force: # when benchmarking, log timing for each processes in its own log - if config.setting("benchmarking", False): + if whale.settings.benchmarking: header = "component_name,duration" - with config.open_log_file( + with whale.filesystem.open_log_file( f"timing_log.{process_name}.csv", "a", header ) as log_file: print(f"{model_name},{timing}", file=log_file) # only continue to log runtime in global timing log for locutor - if not inject.get_injectable("locutor", False): + if not whale.get_injectable("locutor", False): return header = "process_name,model_name,seconds,minutes,notes" note = " ".join(timing_notes) - with config.open_log_file("timing_log.csv", "a", header) as log_file: + with whale.filesystem.open_log_file("timing_log.csv", "a", header) as log_file: print(f"{process_name},{model_name},{seconds},{minutes},{note}", file=log_file) timing_notes.clear() -def delete_output_files(file_type, ignore=None, subdir=None): +def delete_output_files(whale, file_type, ignore=None, subdir=None): """ - Delete files in output directory of specified type + Delete files in output directory of specified type. Parameters ---------- - output_dir: str - Directory of trace output CSVs - - Returns - ------- - Nothing + whale : Pipeline + The output directory is read from the Pipeline. + file_type : str + File extension to delete. + ignore : list[Path-like] + Specific files to leave alone. + subdir : list[Path-like], optional + Subdirectories to scrub. If not given, the top level output directory + plus the 'log' and 'trace' directories will be scrubbed. 
""" - output_dir = inject.get_injectable("output_dir") + output_dir = whale.filesystem.get_output_dir() subdir = [subdir] if subdir else None directories = subdir or ["", "log", "trace"] for subdir in directories: - dir = os.path.join(output_dir, subdir) if subdir else output_dir + dir = output_dir.joinpath(output_dir, subdir) if subdir else output_dir - if not os.path.exists(dir): + if not dir.exists(): continue if ignore: @@ -144,16 +145,12 @@ def delete_output_files(file_type, ignore=None, subdir=None): print(e) -def delete_trace_files(): +def delete_trace_files(whale): """ Delete CSV files in output_dir - - Returns - ------- - Nothing """ - delete_output_files(CSV_FILE_TYPE, subdir="trace") - delete_output_files(CSV_FILE_TYPE, subdir="log") + delete_output_files(whale, CSV_FILE_TYPE, subdir="trace") + delete_output_files(whale, CSV_FILE_TYPE, subdir="log") active_log_files = [ h.baseFilename @@ -161,10 +158,10 @@ def delete_trace_files(): if isinstance(h, logging.FileHandler) ] - delete_output_files("log", ignore=active_log_files) + delete_output_files(whale, "log", ignore=active_log_files) -def config_logger(basic=False): +def config_logger(basic=False, whale=None): """ Configure logger @@ -179,18 +176,33 @@ def config_logger(basic=False): if basic: log_config_file = None else: - log_config_file = config.config_file_path( - LOGGING_CONF_FILE_NAME, mandatory=False - ) + if whale is None: + log_config_file = config.config_file_path( + LOGGING_CONF_FILE_NAME, mandatory=False + ) + else: + log_config_file = whale.filesystem.get_config_file_path( + LOGGING_CONF_FILE_NAME, mandatory=False + ) if log_config_file: try: with open(log_config_file) as f: - config_dict = yaml.load(f, Loader=yaml.UnsafeLoader) + config_dict = yaml.load(f, Loader=yaml.SafeLoader) except Exception as e: print(f"Unable to read logging config file {log_config_file}") raise e + if "logging" in config_dict: + if "handlers" in config_dict["logging"]: + for k, v in config_dict["logging"]["handlers"].items(): + if isinstance(v, dict) and "filename" in v: + from .config import log_file_path + + old_f = v["filename"] + v["filename"] = log_file_path(v["filename"], whale=whale) + print(f"CHANGE {old_f} -> {v['filename']}") + try: config_dict = config_dict["logging"] config_dict.setdefault("version", 1) @@ -244,20 +256,13 @@ def print_summary(label, df, describe=False, value_counts=False): logger.info("%s summary:\n%s" % (label, df.describe())) -@workflow_step -def initialize_traceable_tables(traceable_table_ids=None): +@workflow_step(inplace=True) +def initialize_traceable_tables(whale): - if traceable_table_ids is None: - traceable_table_ids = {} - if len(traceable_table_ids) > 0: - logger.debug( - f"initialize_traceable_tables resetting table_ids for {list(traceable_table_ids.keys())}" - ) - # ORCA# inject.add_injectable("traceable_table_ids", {}) - return {"traceable_table_ids": {}} + whale.set("traceable_table_ids", {}) -def register_traceable_table(table_name, df): +def register_traceable_table(whale, table_name, df): """ Register traceable table @@ -275,7 +280,7 @@ def register_traceable_table(table_name, df): logger.debug(f"register_traceable_table {table_name}") - traceable_tables = inject.get_injectable("traceable_tables", []) + traceable_tables = whale.get_injectable("traceable_tables", []) if table_name not in traceable_tables: logger.error("table '%s' not in traceable_tables" % table_name) return @@ -285,8 +290,8 @@ def register_traceable_table(table_name, df): logger.error("Can't register table '%s' without index 
name" % table_name) return - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) + traceable_table_ids = whale.get_injectable("traceable_table_ids", {}) + traceable_table_indexes = whale.get_injectable("traceable_table_indexes", {}) if ( idx_name in traceable_table_indexes @@ -304,11 +309,11 @@ def register_traceable_table(table_name, df): logger.debug( "adding table %s.%s to traceable_table_indexes" % (table_name, idx_name) ) - inject.add_injectable("traceable_table_indexes", traceable_table_indexes) + whale.add_injectable("traceable_table_indexes", traceable_table_indexes) # add any new indexes associated with trace_hh_id to traceable_table_ids - trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = whale.get_injectable("trace_hh_id", None) if trace_hh_id is None: return @@ -357,7 +362,7 @@ def register_traceable_table(table_name, df): if new_traced_ids: assert not set(prior_traced_ids) & set(new_traced_ids) traceable_table_ids[table_name] = prior_traced_ids + new_traced_ids - inject.add_injectable("traceable_table_ids", traceable_table_ids) + whale.add_injectable("traceable_table_ids", traceable_table_ids) logger.debug( "register %s: added %s new ids to %s existing trace ids" @@ -442,7 +447,7 @@ def write_csv( Parameters ---------- - df: pandas.DataFrame or pandas.Series + df: pandas.DataFrame or pandas.Series or dict traced dataframe file_name: str output file name @@ -527,7 +532,7 @@ def slice_ids(df, ids, column=None): return df -def get_trace_target(df, slicer, column=None): +def get_trace_target(whale, df, slicer, column=None): """ get target ids and column or index to identify target trace rows in df @@ -569,8 +574,8 @@ def get_trace_target(df, slicer, column=None): "bad slicer '%s' for df with index '%s'" % (slicer, df.index.name) ) - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) + traceable_table_indexes = whale.access("traceable_table_indexes", {}) + traceable_table_ids = whale.access("traceable_table_ids", {}) if df.empty: target_ids = None @@ -579,14 +584,14 @@ def get_trace_target(df, slicer, column=None): table_name = traceable_table_indexes[slicer] target_ids = traceable_table_ids.get(table_name, []) elif slicer == "zone_id": - target_ids = inject.get_injectable("trace_od", []) + target_ids = whale.access("trace_od", []) return target_ids, column -def trace_targets(df, slicer=None, column=None): +def trace_targets(whale, df, slicer=None, column=None): - target_ids, column = get_trace_target(df, slicer, column) + target_ids, column = get_trace_target(whale, df, slicer, column) if target_ids is None: targets = None @@ -764,7 +769,7 @@ def interaction_trace_rows(interaction_df, choosers, sample_size=None): # slicer column name and id targets to use for chooser id added to model_design dataframe # currently we only ever slice by person_id, but that could change, so we check here... 
- traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) + traceable_table_ids = whale.get_injectable("traceable_table_ids", {}) # Determine whether actual tables or proto_ tables for disaggregate accessibilities persons_table_name = set(traceable_table_ids).intersection( @@ -906,7 +911,7 @@ def no_results(trace_label): logger.info("Skipping %s: no_results" % trace_label) -def deregister_traceable_table(table_name): +def deregister_traceable_table(whale, table_name): """ un-register traceable table @@ -919,9 +924,9 @@ def deregister_traceable_table(table_name): ------- Nothing """ - traceable_tables = inject.get_injectable("traceable_tables", []) - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) + traceable_tables = whale.get_injectable("traceable_tables", []) + traceable_table_ids = whale.get_injectable("traceable_table_ids", {}) + traceable_table_indexes = whale.get_injectable("traceable_table_indexes", {}) if table_name not in traceable_tables: logger.error("table '%s' not in traceable_tables" % table_name) @@ -934,7 +939,7 @@ def deregister_traceable_table(table_name): {k: v for k, v in traceable_table_indexes.items() if v != table_name} ) - inject.add_injectable("traceable_table_ids", traceable_table_ids) - inject.add_injectable("traceable_table_indexes", traceable_table_indexes) + whale.add_injectable("traceable_table_ids", traceable_table_ids) + whale.add_injectable("traceable_table_indexes", traceable_table_indexes) return diff --git a/activitysim/core/workflow/__init__.py b/activitysim/core/workflow/__init__.py index fc247a097..e746ba454 100644 --- a/activitysim/core/workflow/__init__.py +++ b/activitysim/core/workflow/__init__.py @@ -1,228 +1 @@ -import importlib -import importlib.machinery -import importlib.util -import logging -from inspect import getfullargspec -from typing import Mapping - -from pypyr.context import Context -from pypyr.errors import KeyNotInContextError - -_STEP_LIBRARY = {} - - -def get_formatted_or_raw(self: Context, key: str): - try: - return self.get_formatted(key) - except TypeError: - return self.get(key) - except Exception as err: - raise ValueError(f"extracting {key} from context") from err - - -def get_formatted_or_default(self: Context, key: str, default): - try: - return self.get_formatted(key) - except (KeyNotInContextError, KeyError): - return default - except TypeError: - return self.get(key) - except Exception as err: - raise ValueError(f"extracting {key} from context") from err - - -def error_logging(func): - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as err: - logging.error(f"===== ERROR IN {func.__name__} =====") - logging.exception(f"{err}") - logging.error(f"===== / =====") - raise - - return wrapper - - -def _new_module(mod_name): - spec = importlib.machinery.ModuleSpec(mod_name, None) - return importlib.util.module_from_spec(spec) - - -def _create_module(mod_name, content): - mod = _new_module(mod_name) - for k, v in content.items(): - setattr(mod, k, v) - return mod - - -def _create_step(step_name, step_func): - _create_module(f"{__package__}.{step_name}", {"run_step": step_func}) - _STEP_LIBRARY[step_name] = step_func - - -def run_named_step(name, context): - context.update(_STEP_LIBRARY[name](context)) - return context - - -class workflow_step: - """ - Decorator for functions that update a context variable. 
- - The decorator will generate a `run_step` function in the same module, - wrapped with additional arguments and appropriately annotated for use - with the pypyr workflow model. The original function also remains - available to import and use without changes. - - When called as a step inside a pypyr workflow, the following context - variables are potentially accessed: - - report : xmle.Reporter - The active report into which new figures or tables are added. - caption : str - A caption for the item being processed. This is used both in - writing out the output (if any) in the report and for logging - step progression during a run. - caption_type : str - The caption type (typically, 'fig' for figures or 'tab' - for tables). - progress_tag : str - Use this instead of `caption` to log step progression during a run. - - If the function returns values that should update the context, that - can be done in one of three ways: - - - Set `updates_context` to True and return a `dict`, and use that - dict to update the context directly. - - Return a single object, and set `returns_names` to a string - giving the name that object should take in the context. - - Return a sequence of objects, and set `returns_names` to a - matching sequence of names that those objects should take - in the context. - - Otherwise, the return value is appended to the report. To declare that - there is no return value and no reporting should be done, you must - explicitly annotate the function with a return value of `-> None`. - - Important: there can be only one `workstep` in - each module. If you need more than one, make another separate module. - - Parameters - ---------- - wrapped_func : Callable - returns_names : str or tuple[str], optional - updates_context : bool, default False - - Returns - ------- - wrapped_func : Callable - The original wrapped function - - """ - - def __new__(cls, wrapped_func=None, *, step_name=None): - """ - Initialize a work step wrapper. - - Parameters - ---------- - wrapped_func : Callable - The function being decorated. - """ - if isinstance(wrapped_func, str): - # the step_name is provided instead of the wrapped func - step_name = wrapped_func - wrapped_func = None - if step_name is None and wrapped_func is not None: - step_name = wrapped_func.__name__ - self = super().__new__(cls) - self._step_name = step_name - if wrapped_func is not None: - return self(wrapped_func) - else: - return self - - def __call__(self, wrapped_func): - """ - Initialize a workflow_step wrapper. - - Parameters - ---------- - wrapped_func : Callable - The function being decorated. It should return a dictionary - of context updates. 
- """ - ( - _args, - _varargs, - _varkw, - _defaults, - _kwonlyargs, - _kwonlydefaults, - _annotations, - ) = getfullargspec(wrapped_func) - - def run_step(context: Context = None) -> None: - caption = get_formatted_or_default(context, "caption", None) - progress_tag = get_formatted_or_default(context, "progress_tag", caption) - # if progress_tag is not None: - # reset_progress_step(description=progress_tag) - - return_type = _annotations.get("return", "") - - caption_type = get_formatted_or_default(context, "caption_type", "fig") - caption_maker = get_formatted_or_default(context, caption_type, None) - # parse and run function itself - if _defaults is None: - ndefault = 0 - _required_args = _args - else: - ndefault = len(_defaults) - _required_args = _args[:-ndefault] - args = [] - for arg in _required_args: - context.assert_key_has_value(key=arg, caller=wrapped_func.__module__) - try: - args.append(get_formatted_or_raw(context, arg)) - except Exception as err: - raise ValueError(f"extracting {arg} from context") from err - if ndefault: - for arg, default in zip(_args[-ndefault:], _defaults): - args.append(get_formatted_or_default(context, arg, default)) - kwargs = {} - for karg in _kwonlyargs: - if karg in _kwonlydefaults: - kwargs[karg] = get_formatted_or_default( - context, karg, _kwonlydefaults[karg] - ) - else: - context.assert_key_has_value( - key=karg, caller=wrapped_func.__module__ - ) - try: - kwargs[karg] = get_formatted_or_raw(context, karg) - except Exception as err: - raise ValueError(f"extracting {karg} from context") from err - if _varkw: - kwargs.update(context) - for arg in _required_args: - if arg in kwargs: - kwargs.pop(arg) - outcome = error_logging(wrapped_func)(*args, **kwargs) - if not isinstance(outcome, Mapping): - raise ValueError( - f"{wrapped_func.__name__} is marked as updates_context, " - f"it should return a mapping" - ) - context.update(outcome) - - # module = importlib.import_module(wrapped_func.__module__) - # if hasattr(module, "run_step"): - # raise ValueError( - # f"{wrapped_func.__module__}.run_step exists, there can be only one per module" - # ) - # setattr(module, "run_step", run_step) - _create_step(self._step_name, run_step) - - return wrapped_func +from .steps import workflow_cached_object, workflow_step, workflow_table diff --git a/activitysim/core/workflow/injectable.py b/activitysim/core/workflow/injectable.py new file mode 100644 index 000000000..6baa175d8 --- /dev/null +++ b/activitysim/core/workflow/injectable.py @@ -0,0 +1,19 @@ +# import orca +# import logging +# +# from ..pipeline import Whale +# from ..exceptions import DuplicateLoadableTableError +# +# logger = logging.getLogger(__name__) +# +# def _injectable(cache=False): +# def decorator(func): +# name = func.__name__ +# logger.debug(f"found loadable object {name}") +# if name in Whale._LOADABLE_OBJECTS: +# raise DuplicateLoadableTableError(name) +# Whale._LOADABLE_OBJECTS[name] = (func, cache) +# return func +# +# return decorator +# diff --git a/activitysim/core/workflow/steps.py b/activitysim/core/workflow/steps.py new file mode 100644 index 000000000..843152be6 --- /dev/null +++ b/activitysim/core/workflow/steps.py @@ -0,0 +1,337 @@ +import importlib +import importlib.machinery +import importlib.util +import logging +import time +from inspect import getfullargspec +from typing import Callable, Mapping + +from pypyr.context import Context +from pypyr.errors import KeyNotInContextError + +from ..exceptions import DuplicateWorkflowNameError, DuplicateWorkflowTableError +from .util 
import get_formatted_or_default, get_formatted_or_raw + +logger = logging.getLogger(__name__) + +_STEP_LIBRARY = {} + + +def error_logging(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as err: + logging.error(f"===== ERROR IN {func.__name__} =====") + logging.exception(f"{err}") + logging.error(f"===== / =====") + raise + + return wrapper + + +def _new_module(mod_name): + spec = importlib.machinery.ModuleSpec(mod_name, None) + return importlib.util.module_from_spec(spec) + + +def _create_module(mod_name, content): + mod = _new_module(mod_name) + for k, v in content.items(): + setattr(mod, k, v) + return mod + + +def _create_step(step_name, step_func): + # the module version of each step is for pypyr, and it always mutates + # context in-place instead of making updates to copies + _create_module(f"{__package__}.{step_name}", {"run_step": step_func}) + _STEP_LIBRARY[step_name] = step_func + + +def run_named_step(name, context): + try: + step_func = _STEP_LIBRARY[name] + except KeyError: + logger.error(f"Unknown step {name}, the known steps are:") + for n in sorted(_STEP_LIBRARY.keys()): + logger.error(f" - {n}") + raise + step_func(context) + return context + + +class workflow_step: + """ + Decorator for functions that update a context variable. + + The decorator will generate a `run_step` function in the same module, + wrapped with additional arguments and appropriately annotated for use + with the pypyr workflow model. The original function also remains + available to import and use without changes. + + When called as a step inside a pypyr workflow, the following context + variables are potentially accessed: + + report : xmle.Reporter + The active report into which new figures or tables are added. + caption : str + A caption for the item being processed. This is used both in + writing out the output (if any) in the report and for logging + step progression during a run. + caption_type : str + The caption type (typically, 'fig' for figures or 'tab' + for tables). + progress_tag : str + Use this instead of `caption` to log step progression during a run. + + If the function returns values that should update the context, that + can be done in one of three ways: + + - Set `updates_context` to True and return a `dict`, and use that + dict to update the context directly. + - Return a single object, and set `returns_names` to a string + giving the name that object should take in the context. + - Return a sequence of objects, and set `returns_names` to a + matching sequence of names that those objects should take + in the context. + + Otherwise, the return value is appended to the report. To declare that + there is no return value and no reporting should be done, you must + explicitly annotate the function with a return value of `-> None`. + + Important: there can be only one `workstep` in + each module. If you need more than one, make another separate module. + + Parameters + ---------- + wrapped_func : Callable + returns_names : str or tuple[str], optional + updates_context : bool, default False + + Returns + ------- + wrapped_func : Callable + The original wrapped function + + """ + + def __new__( + cls, + wrapped_func=None, + *, + step_name=None, + cache=False, + inplace=False, + kind="step", + ): + """ + Initialize a work step wrapper. + + Parameters + ---------- + wrapped_func : Callable + The function being decorated. + step_name : str + Use this name for the function being decorated, if not given + the existing name is used. 
+ cache : bool, default False + If true, this function is only run if the named value is not + already stored in the context. Also, the return value should + not be a mapping but instead just a single Python object that + will be stored in the context with a key given by the step_name. + """ + if wrapped_func is not None and not isinstance(wrapped_func, Callable): + raise TypeError("workflow step must decorate a callable") + if step_name is None and wrapped_func is not None: + step_name = wrapped_func.__name__ + self = super().__new__(cls) + self._step_name = step_name + self._cache = cache + self._inplace = inplace + self._kind = kind + if wrapped_func is not None: + return self(wrapped_func) + else: + return self + + def __call__(self, wrapped_func): + """ + Initialize a workflow_step wrapper. + + Parameters + ---------- + wrapped_func : Callable + The function being decorated. It should return a dictionary + of context updates. + """ + from ..pipeline import Whale + + if self._step_name is None: + self._step_name = wrapped_func.__name__ + logger.debug(f"found workflow_{self._kind}: {self._step_name}") + + # check for duplicate workflow function names + if self._step_name in Whale._LOADABLE_OBJECTS: + raise DuplicateWorkflowNameError(self._step_name) + if self._step_name in Whale._LOADABLE_TABLES: + raise DuplicateWorkflowNameError(self._step_name) + if self._step_name in Whale._RUNNABLE_STEPS: + raise DuplicateWorkflowNameError(self._step_name) + + ( + _args, + _varargs, + _varkw, + _defaults, + _kwonlyargs, + _kwonlydefaults, + _annotations, + ) = getfullargspec(wrapped_func) + if _defaults is None: + _ndefault = 0 + _required_args = _args + else: + _ndefault = len(_defaults) + _required_args = _args[:-_ndefault] + + if not _required_args or _required_args[0] != "whale": + raise TypeError( + f"the first argument of a workflow_{self._kind} must be the whale" + ) + + def run_step(context: Context = None) -> None: + if self._cache and (context is not None) and (self._step_name in context): + return context.get_formatted(self._step_name) + assert isinstance(context, Context) + whale = Whale(context) + caption = get_formatted_or_default(context, "caption", None) + progress_tag = get_formatted_or_default(context, "progress_tag", caption) + # if progress_tag is not None: + # reset_progress_step(description=progress_tag) + + return_type = _annotations.get("return", "") + + caption_type = get_formatted_or_default(context, "caption_type", "fig") + caption_maker = get_formatted_or_default(context, caption_type, None) + # parse and run function itself + args = [] + for arg in _required_args: + if arg == "whale": + args.append(whale) + else: + try: + context.assert_key_has_value( + key=arg, caller=wrapped_func.__module__ + ) + except KeyNotInContextError: + # The desired key does not yet exist. We will attempt + # to create it using the whale. 
+ if arg in whale._LOADABLE_TABLES: + arg_value = whale._LOADABLE_TABLES[arg](context) + elif arg in whale._LOADABLE_OBJECTS: + arg_value = whale._LOADABLE_OBJECTS[arg](context) + else: + raise + else: + arg_value = get_formatted_or_raw(context, arg) + try: + args.append(arg_value) + except Exception as err: + raise ValueError(f"extracting {arg} from context") from err + if _ndefault: + for arg, default in zip(_args[-_ndefault:], _defaults): + args.append(get_formatted_or_default(context, arg, default)) + kwargs = {} + for karg in _kwonlyargs: + if karg in _kwonlydefaults: + kwargs[karg] = get_formatted_or_default( + context, karg, _kwonlydefaults[karg] + ) + else: + context.assert_key_has_value( + key=karg, caller=wrapped_func.__module__ + ) + try: + kwargs[karg] = get_formatted_or_raw(context, karg) + except Exception as err: + raise ValueError(f"extracting {karg} from context") from err + if _varkw: + kwargs.update(context) + for arg in _required_args: + if arg in kwargs: + kwargs.pop(arg) + outcome = error_logging(wrapped_func)(*args, **kwargs) + if self._kind == "table": + context[self._step_name] = outcome + if "_salient_tables" not in context: + context["_salient_tables"] = {} + context["_salient_tables"][self._step_name] = time.time() + return outcome + elif self._kind == "cached_object": + context[self._step_name] = outcome + return outcome + elif self._kind == "step": + if outcome is not None: + if not isinstance(outcome, Mapping): + raise ValueError( + f"workflow step {wrapped_func.__name__} should return a mapping or None" + ) + context.update(outcome) + + _create_step(self._step_name, run_step) + + def update_with_cache(whale, *args, **kwargs): + ignore_cache = kwargs.pop("_ignore_cache_", False) + if self._step_name not in whale.context or ignore_cache: + whale.context[self._step_name] = wrapped_func(whale, *args, **kwargs) + return whale.context[self._step_name] + + if self._kind == "cached_object": + Whale._LOADABLE_OBJECTS[self._step_name] = run_step + return update_with_cache + elif self._kind == "table": + Whale._LOADABLE_TABLES[self._step_name] = run_step + return update_with_cache + elif self._kind == "step": + Whale._RUNNABLE_STEPS[self._step_name] = run_step + return wrapped_func + else: + raise ValueError(self._kind) + + +class workflow_cached_object(workflow_step): + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="cached_object" + ) + + +class workflow_table(workflow_step): + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="table" + ) + + +# def workflow_table(func): +# """ +# Decorator for functions that initialize tables. +# +# The function being decorated should have a single argument: `whale`. 
+# +# Parameters +# ---------- +# func +# +# Returns +# ------- +# func +# """ +# from ..pipeline import Whale +# name = func.__name__ +# logger.debug(f"found loadable table {name}") +# if name in Whale._LOADABLE_TABLES: +# raise DuplicateWorkflowTableError(name) +# Whale._LOADABLE_TABLES[name] = func +# return func diff --git a/activitysim/core/workflow/tableset.py b/activitysim/core/workflow/tableset.py new file mode 100644 index 000000000..58382643b --- /dev/null +++ b/activitysim/core/workflow/tableset.py @@ -0,0 +1,90 @@ +# +# import pandas as pd +# import logging +# from typing import Mapping +# from ..exceptions import PipelineAccessError, DuplicateLoadableTableError +# +# logger = logging.getLogger(__name__) +# +# +# +# +# def loadable_table(func): +# """ +# Decorator for functions that initialize tables. +# +# The function being decorated should have a single arguments: `whale`. +# +# Parameters +# ---------- +# func +# +# Returns +# ------- +# func +# """ +# from ..pipeline import Whale +# name = func.__name__ +# logger.debug(f"found loadable table {name}") +# if name in Whale._LOADABLE_TABLES: +# raise DuplicateLoadableTableError(name) +# Whale._LOADABLE_TABLES[name] = func +# return func +# +# +# +# class Tableset: +# +# def __init__(self): +# self.tables = {} +# self.unsaved_tables = set() +# self.saveable_tables = set() +# +# # def load_table(self, tablename, overwrite=False, swallow_errors=False): +# # if tablename in self.tables and not overwrite: +# # if swallow_errors: +# # return +# # raise ValueError(f"table {tablename} already loaded") +# # if tablename not in _LOADABLE_TABLES: +# # if swallow_errors: +# # return +# # raise ValueError(f"table {tablename} has no loading function") +# # if self.filesystem is None: +# # if swallow_errors: +# # return +# # raise PipelineAccessError("filesystem not attached to tableset") +# # if self.settings is None: +# # if swallow_errors: +# # return +# # raise PipelineAccessError("settings not attached to tableset") +# # logger.debug(f"loading table {tablename}") +# # t = _LOADABLE_TABLES[tablename](self, self.filesystem, self.settings) +# # self.store_data(tablename, t) +# # return t +# +# # +# # def get_frame(self, tablename): +# # t = self.tables.get(tablename, None) +# # if t is None: +# # t = self.load_table(tablename, swallow_errors=True) +# # if t is None: +# # raise KeyError(tablename) +# # if isinstance(t, pd.DataFrame): +# # return t +# # raise TypeError(f"cannot convert {tablename} to DataFrame") +# +# def store_data(self, name, data, saveable=True): +# self.tables[name] = data +# if saveable or name in self.saveable_tables: +# self.saveable_tables.add(name) +# self.unsaved_tables.add(name) +# +# def update(self, other, all_saveable=False): +# if isinstance(other, Tableset): +# for tablename, t in other.tables.items(): +# is_saveable = tablename in self.saveable_tables or tablename in other.saveable_tables +# self.store_data(tablename, t, saveable=is_saveable) +# elif isinstance(other, Mapping): +# for tablename, t in other.items(): +# is_saveable = all_saveable or (tablename in self.saveable_tables) +# self.store_data(tablename, t, saveable=is_saveable) diff --git a/activitysim/core/workflow/util.py b/activitysim/core/workflow/util.py new file mode 100644 index 000000000..d2c6e4186 --- /dev/null +++ b/activitysim/core/workflow/util.py @@ -0,0 +1,46 @@ +import logging + +from pypyr.context import Context, KeyNotInContextError + +logger = logging.getLogger(__name__) + + +def get_formatted_or_raw(self: Context, key: str): + 
try: + return self.get_formatted(key) + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +def get_formatted_or_default(self: Context, key: str, default): + try: + return self.get_formatted(key) + except (KeyNotInContextError, KeyError): + return default + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +# def _create_step(step_name, step_func): +# # the module version of each step is for pypyr, and it always mutates +# # context in-place instead of making updates to copies +# from .steps import _create_module, _STEP_LIBRARY +# _create_module(f"{__package__}.{step_name}", {"run_step": step_func}) +# _STEP_LIBRARY[step_name] = step_func +# +# +# def run_named_step(name, context): +# from .steps import _STEP_LIBRARY +# try: +# step_func = _STEP_LIBRARY[name] +# except KeyError: +# logger.error(f"Unknown step {name}, the known steps are:") +# for n in sorted(_STEP_LIBRARY.keys()): +# logger.error(f" - {n}") +# raise +# step_func(context) +# return context diff --git a/activitysim/examples/prototype_mtc/configs/logging.yaml b/activitysim/examples/prototype_mtc/configs/logging.yaml index 3b2851ddd..981584bed 100644 --- a/activitysim/examples/prototype_mtc/configs/logging.yaml +++ b/activitysim/examples/prototype_mtc/configs/logging.yaml @@ -40,7 +40,7 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: activitysim.log mode: w formatter: fileFormatter level: NOTSET From 22d592fec94caa7b07cfc18025ccf29cf7641b60 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 10 Feb 2023 15:06:03 -0600 Subject: [PATCH 006/419] refactoring --- .gitignore | 1 + activitysim/abm/misc.py | 23 +- activitysim/abm/models/accessibility.py | 42 +-- .../abm/models/atwork_subtour_destination.py | 18 +- .../abm/models/atwork_subtour_frequency.py | 39 +-- .../abm/models/atwork_subtour_mode_choice.py | 29 +-- .../abm/models/atwork_subtour_scheduling.py | 27 +- activitysim/abm/models/auto_ownership.py | 17 +- activitysim/abm/models/cdap.py | 22 +- .../abm/models/disaggregate_accessibility.py | 86 ++++--- activitysim/abm/models/free_parking.py | 19 +- activitysim/abm/models/initialize.py | 26 +- activitysim/abm/models/initialize_los.py | 38 +-- activitysim/abm/models/initialize_tours.py | 14 +- .../abm/models/joint_tour_composition.py | 27 +- .../abm/models/joint_tour_destination.py | 17 +- .../abm/models/joint_tour_frequency.py | 33 +-- .../abm/models/joint_tour_participation.py | 45 ++-- .../abm/models/joint_tour_scheduling.py | 24 +- activitysim/abm/models/location_choice.py | 122 +++++---- .../abm/models/mandatory_scheduling.py | 18 +- .../abm/models/mandatory_tour_frequency.py | 37 +-- .../abm/models/non_mandatory_destination.py | 18 +- .../abm/models/non_mandatory_scheduling.py | 13 +- .../models/non_mandatory_tour_frequency.py | 61 +++-- .../abm/models/parking_location_choice.py | 27 +- activitysim/abm/models/school_escorting.py | 35 ++- activitysim/abm/models/stop_frequency.py | 35 +-- activitysim/abm/models/summarize.py | 15 +- .../abm/models/telecommute_frequency.py | 16 +- activitysim/abm/models/tour_mode_choice.py | 44 ++-- activitysim/abm/models/tour_od_choice.py | 27 +- .../models/tour_scheduling_probabilistic.py | 24 +- .../abm/models/transit_pass_ownership.py | 16 +- .../abm/models/transit_pass_subsidy.py | 16 +- 
.../abm/models/trip_departure_choice.py | 36 +-- activitysim/abm/models/trip_destination.py | 115 ++++++--- activitysim/abm/models/trip_matrices.py | 42 +-- activitysim/abm/models/trip_mode_choice.py | 34 ++- activitysim/abm/models/trip_purpose.py | 43 ++-- .../models/trip_purpose_and_destination.py | 47 ++-- activitysim/abm/models/trip_scheduling.py | 42 +-- .../abm/models/trip_scheduling_choice.py | 36 ++- activitysim/abm/models/util/annotate.py | 25 +- activitysim/abm/models/util/canonical_ids.py | 33 ++- activitysim/abm/models/util/cdap.py | 18 +- activitysim/abm/models/util/estimation.py | 37 +-- activitysim/abm/models/util/logsums.py | 13 +- activitysim/abm/models/util/mode.py | 5 +- .../models/util/probabilistic_scheduling.py | 13 +- .../models/util/school_escort_tours_trips.py | 26 +- activitysim/abm/models/util/test/test_cdap.py | 33 +-- .../abm/models/util/tour_destination.py | 24 +- activitysim/abm/models/util/tour_od.py | 38 +-- .../abm/models/util/tour_scheduling.py | 17 +- activitysim/abm/models/util/trip.py | 4 +- .../models/util/vectorize_tour_scheduling.py | 61 ++++- activitysim/abm/models/vehicle_allocation.py | 43 ++-- activitysim/abm/models/vehicle_type_choice.py | 71 ++++-- activitysim/abm/models/work_from_home.py | 21 +- activitysim/abm/tables/accessibility.py | 8 +- .../abm/tables/disaggregate_accessibility.py | 21 +- activitysim/abm/tables/households.py | 15 +- activitysim/abm/tables/landuse.py | 15 +- activitysim/abm/tables/persons.py | 40 +-- activitysim/abm/tables/shadow_pricing.py | 74 +++--- activitysim/abm/tables/size_terms.py | 6 +- activitysim/abm/tables/skims.py | 30 +-- activitysim/abm/tables/time_windows.py | 27 +- activitysim/abm/tables/vehicles.py | 16 +- .../test_misc/test_trip_departure_choice.py | 6 +- .../test_misc/test_trip_scheduling_choice.py | 8 +- activitysim/benchmarking/componentwise.py | 9 +- activitysim/cli/run.py | 16 +- activitysim/core/assign.py | 13 +- activitysim/core/chunk.py | 10 +- activitysim/core/config.py | 207 +++++++-------- activitysim/core/configuration/filesystem.py | 52 ++++ activitysim/core/exceptions.py | 2 +- activitysim/core/expressions.py | 6 +- activitysim/core/flow.py | 41 +-- activitysim/core/input.py | 7 +- activitysim/core/interaction_sample.py | 72 ++++-- .../core/interaction_sample_simulate.py | 6 +- activitysim/core/interaction_simulate.py | 76 +++--- activitysim/core/logit.py | 14 +- activitysim/core/los.py | 101 ++++---- activitysim/core/mp_tasks.py | 14 +- activitysim/core/pathbuilder.py | 60 ++--- activitysim/core/simulate.py | 205 +++++++++------ activitysim/core/skim_dataset.py | 56 ++-- activitysim/core/skim_dict_factory.py | 12 +- activitysim/core/skim_dictionary.py | 18 +- activitysim/core/steps/output.py | 19 +- activitysim/core/test/extensions/steps.py | 28 +- activitysim/core/test/test_assign.py | 12 +- activitysim/core/test/test_logit.py | 4 +- activitysim/core/test/test_skim.py | 4 +- activitysim/core/test/test_timetable.py | 2 +- activitysim/core/timetable.py | 8 +- activitysim/core/tracing.py | 21 +- activitysim/core/workflow/__init__.py | 6 +- .../core/{pipeline.py => workflow/state.py} | 240 ++++++++++-------- activitysim/core/workflow/steps.py | 35 ++- activitysim/estimation/larch/cdap.py | 2 +- .../example_estimation/scripts/infer.py | 14 +- .../extensions/reassign_tour_purpose.py | 14 +- 107 files changed, 1977 insertions(+), 1643 deletions(-) rename activitysim/core/{pipeline.py => workflow/state.py} (87%) diff --git a/.gitignore b/.gitignore index b1c67af87..5c538adb6 100644 --- 
a/.gitignore +++ b/.gitignore @@ -77,3 +77,4 @@ _test_est **/output/ **/_generated_version.py docs/**/_generated +activitysim/examples/prototype_mtc_extended/test/*.ipynb diff --git a/activitysim/abm/misc.py b/activitysim/abm/misc.py index 6f665de5e..b5cf20c67 100644 --- a/activitysim/abm/misc.py +++ b/activitysim/abm/misc.py @@ -4,8 +4,7 @@ import pandas as pd -from ..core.pipeline import Whale -from ..core.workflow import workflow_cached_object +from activitysim.core import workflow # FIXME # warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning) @@ -14,8 +13,8 @@ logger = logging.getLogger(__name__) -@workflow_cached_object -def households_sample_size(whale: Whale, override_hh_ids): +@workflow.cached_object +def households_sample_size(whale: workflow.Whale, override_hh_ids): if override_hh_ids is None: return whale.settings, households_sample_size @@ -23,8 +22,8 @@ def households_sample_size(whale: Whale, override_hh_ids): return 0 if override_hh_ids is None else len(override_hh_ids) -@workflow_cached_object -def override_hh_ids(whale: Whale): +@workflow.cached_object +def override_hh_ids(whale: workflow.Whale): hh_ids_filename = whale.settings.hh_ids if hh_ids_filename is None: @@ -75,8 +74,8 @@ def override_hh_ids(whale: Whale): # return id -@workflow_cached_object -def trace_od(whale: Whale): +@workflow.cached_object +def trace_od(whale: workflow.Whale): od = whale.settings.trace_od @@ -89,13 +88,13 @@ def trace_od(whale: Whale): return od -@workflow_cached_object -def chunk_size(whale: Whale): +@workflow.cached_object +def chunk_size(whale: workflow.Whale): _chunk_size = int(whale.settings.chunk_size or 0) return _chunk_size -@workflow_cached_object -def check_for_variability(whale: Whale): +@workflow.cached_object +def check_for_variability(whale: workflow.Whale): return bool(whale.settings.check_for_variability) diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index 34a1e38a4..32d61abb9 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -5,14 +5,13 @@ import numpy as np import pandas as pd -from ...core import assign, chunk, config, los, pipeline, tracing -from ...core.pipeline import Whale -from ...core.workflow import workflow_step +from activitysim.core import assign, chunk, los, tracing, workflow logger = logging.getLogger(__name__) def compute_accessibilities_for_zones( + whale, accessibility_df, land_use_df, assignment_spec, @@ -20,6 +19,7 @@ def compute_accessibilities_for_zones( network_los, trace_od, trace_label, + chunk_sizer, ): orig_zones = accessibility_df.index.values @@ -50,7 +50,7 @@ def compute_accessibilities_for_zones( # merge land_use_columns into od_df logger.info(f"{trace_label}: merge land_use_columns into od_df") od_df = pd.merge(od_df, land_use_df, left_on="dest", right_index=True).sort_index() - chunk.log_df(trace_label, "od_df", od_df) + chunk_sizer.log_df(trace_label, "od_df", od_df) locals_d = { "log": np.log, @@ -68,15 +68,16 @@ def compute_accessibilities_for_zones( logger.info(f"{trace_label}: assign.assign_variables") results, trace_results, trace_assigned_locals = assign.assign_variables( + whale, assignment_spec, od_df, locals_d, trace_rows=trace_od_rows, trace_label=trace_label, - chunk_log=True, + chunk_log=chunk_sizer, ) - chunk.log_df(trace_label, "results", results) + chunk_sizer.log_df(trace_label, "results", results) logger.info(f"{trace_label}: have results") # accessibility_df = accessibility_df.copy() @@ -113,13 
+114,13 @@ def compute_accessibilities_for_zones( return accessibility_df -@workflow_step +@workflow.step def compute_accessibility( - whale: Whale, + whale: workflow.Whale, land_use: pd.DataFrame, accessibility: pd.DataFrame, - network_los, - chunk_size, + network_los: los.Network_LOS, + chunk_size: int, trace_od, ): @@ -140,23 +141,23 @@ def compute_accessibility( """ trace_label = "compute_accessibility" - model_settings = config.read_model_settings("accessibility.yaml") + model_settings = whale.filesystem.read_model_settings("accessibility.yaml") assignment_spec = assign.read_assignment_spec( - config.config_file_path("accessibility.csv") + whale.filesystem.get_config_file_path("accessibility.csv") ) - accessibility_df = accessibility.to_frame() + accessibility_df = accessibility if len(accessibility_df.columns) > 0: logger.warning( f"accessibility table is not empty. Columns:{list(accessibility_df.columns)}" ) raise RuntimeError(f"accessibility table is not empty.") - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) # only include the land_use columns needed by spec, as specified by land_use_columns model_setting land_use_columns = model_settings.get("land_use_columns", []) - land_use_df = land_use.to_frame() + land_use_df = land_use land_use_df = land_use_df[land_use_columns] logger.info( @@ -165,11 +166,17 @@ def compute_accessibility( accessibilities_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - accessibility_df, chunk_size, trace_label + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + whale, accessibility_df, chunk_size, trace_label ): accessibilities = compute_accessibilities_for_zones( + whale, chooser_chunk, land_use_df, assignment_spec, @@ -177,6 +184,7 @@ def compute_accessibility( network_los, trace_od, trace_label, + chunk_sizer, ) accessibilities_list.append(accessibilities) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 1b69cde77..9ec233800 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -2,24 +2,18 @@ # See full license in LICENSE.txt. 
import logging -import pandas as pd - -from activitysim.core import config, inject, pipeline, simulate, tracing -from activitysim.core.interaction_sample import interaction_sample -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.abm.models.util import estimation, tour_destination +from activitysim.core import config, inject, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation, tour_destination - logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def atwork_subtour_destination( - tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id ): - trace_label = "atwork_subtour_destination" model_settings_file_name = "atwork_subtour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) @@ -95,7 +89,7 @@ def atwork_subtour_destination( subtours[logsum_column_name] = choices_df["logsum"] assign_in_place(tours, subtours[[logsum_column_name]]) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) tracing.print_summary( destination_column_name, subtours[destination_column_name], describe=True @@ -104,7 +98,7 @@ def atwork_subtour_destination( if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index d42b97fdc..34354498a 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -5,22 +5,23 @@ import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.tour_frequency import process_atwork_subtours +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.tour_frequency import process_atwork_subtours +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger(__name__) -def add_null_results(trace_label, tours): +def add_null_results(whale, trace_label, tours): logger.info("Skipping %s: add_null_results", trace_label) tours["atwork_subtour_frequency"] = np.nan - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) -@inject.step() -def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def atwork_subtour_frequency( + whale: workflow.Whale, tours, persons_merged, chunk_size, trace_hh_id +): """ This model predicts the frequency of making at-work subtour tours (alternatives for this model come from a separate csv file which is @@ -35,7 +36,7 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): # - if no work_tours if len(work_tours) == 0: - add_null_results(trace_label, tours) + add_null_results(whale, trace_label, tours) return model_settings = config.read_model_settings(model_settings_file_name) @@ -43,10 +44,12 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = 
simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) alternatives = simulate.read_model_alts( - "atwork_subtour_frequency_alternatives.csv", set_index="alt" + whale, "atwork_subtour_frequency_alternatives.csv", set_index="alt" ) # merge persons into work_tours @@ -63,9 +66,11 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - expressions.assign_columns( - df=work_tours, model_settings=preprocessor_settings, trace_label=trace_label + whale, + df=work_tours, + model_settings=preprocessor_settings, + trace_label=trace_label, ) if estimator: @@ -99,7 +104,7 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): # add atwork_subtour_frequency column to tours # reindex since we are working with a subset of tours tours["atwork_subtour_frequency"] = choices.reindex(tours.index) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # - create atwork_subtours based on atwork_subtour_frequency choice names work_tours = tours[tours.tour_type == "work"] @@ -107,10 +112,10 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): subtours = process_atwork_subtours(work_tours, alternatives) - tours = pipeline.extend_table("tours", subtours) + tours = whale.extend_table("tours", subtours) - tracing.register_traceable_table("tours", subtours) - pipeline.get_rn_generator().add_channel("tours", subtours) + tracing.register_traceable_table(whale, "tours", subtours) + whale.get_rn_generator().add_channel("tours", subtours) tracing.print_summary( "atwork_subtour_frequency", tours.atwork_subtour_frequency, value_counts=True diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index bc3f1c66c..32c40b20a 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -5,27 +5,17 @@ import numpy as np import pandas as pd -from activitysim.core import ( - config, - expressions, - inject, - los, - pipeline, - simulate, - tracing, -) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate +from activitysim.core import config, expressions, inject, los, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation -from .util.mode import run_tour_mode_choice_simulate - logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def atwork_subtour_mode_choice( - tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id ): """ At-work subtour mode choice simulate @@ -154,21 +144,17 @@ def atwork_subtour_mode_choice( # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip( ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] ): - path_type = path_types[direction] skim_cache = skim.cache[path_type] print(f"mode {mode} direction 
{direction} path_type {path_type}") for c in skim_cache: - dest_col = f"{direction}_{c}" if dest_col not in choices_df: @@ -195,17 +181,18 @@ def atwork_subtour_mode_choice( ) assign_in_place(tours, choices_df) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # - annotate tours table if model_settings.get("annotate_tours"): tours = inject.get_table("tours").to_frame() expressions.assign_columns( + whale, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index 041d899a2..c5c41f3dc 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -5,22 +5,29 @@ import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.vectorize_tour_scheduling import ( + vectorize_subtour_scheduling, +) +from activitysim.core import config, expressions, inject, simulate from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation -from .util.vectorize_tour_scheduling import vectorize_subtour_scheduling - logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def atwork_subtour_scheduling( - tours, persons_merged, tdd_alts, skim_dict, chunk_size, trace_hh_id + whale: workflow.Whale, + tours, + persons_merged, + tdd_alts, + skim_dict, + chunk_size, + trace_hh_id, ): """ This model predicts the departure time and duration of each activity for at work subtours tours @@ -43,7 +50,9 @@ def atwork_subtour_scheduling( model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip") coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) persons_merged = persons_merged.to_frame() @@ -96,7 +105,7 @@ def atwork_subtour_scheduling( ) assign_in_place(tours, tdd_choices) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 564d6f94b..234c327ac 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -2,15 +2,16 @@ # See full license in LICENSE.txt. 
import logging -from activitysim.core import config, inject, pipeline, simulate, tracing - -from .util import estimation +from activitysim.abm.models.util import estimation +from activitysim.core import config, simulate, tracing, workflow logger = logging.getLogger(__name__) -@inject.step() -def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_id): +@workflow.step +def auto_ownership_simulate( + whale: workflow.Whale, households, households_merged, chunk_size, trace_hh_id +): """ Auto ownership is a standard model which predicts how many cars a household with given characteristics owns @@ -23,7 +24,9 @@ def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_ model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -63,7 +66,7 @@ def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_ # no need to reindex as we used all households households["auto_ownership"] = choices - pipeline.replace_table("households", households) + whale.add_table("households", households) tracing.print_summary( "auto_ownership", households.auto_ownership, value_counts=True diff --git a/activitysim/abm/models/cdap.py b/activitysim/abm/models/cdap.py index b37cf8a9a..b8b811d7b 100644 --- a/activitysim/abm/models/cdap.py +++ b/activitysim/abm/models/cdap.py @@ -4,16 +4,17 @@ import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import cdap, estimation +from activitysim.core import config, expressions, inject, simulate, tracing, workflow from activitysim.core.util import reindex -from .util import cdap, estimation - logger = logging.getLogger(__name__) -@inject.step() -def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): +@workflow.step +def cdap_simulate( + whale: workflow.Whale, persons_merged, persons, households, chunk_size, trace_hh_id +): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of high-level activity pattern for each person, in a coordinated way with other @@ -38,7 +39,7 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): coefficients_df = simulate.read_model_coefficients(model_settings) cdap_indiv_spec = simulate.eval_coefficients( - cdap_indiv_spec, coefficients_df, estimator + whale, cdap_indiv_spec, coefficients_df, estimator ) # Rules and coefficients for generating interaction specs for different household sizes @@ -46,7 +47,8 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): "INTERACTION_COEFFICIENTS", "cdap_interaction_coefficients.csv" ) cdap_interaction_coefficients = pd.read_csv( - config.config_file_path(interaction_coefficients_file_name), comment="#" + whale.filesystem.get_config_file_path(interaction_coefficients_file_name), + comment="#", ) # replace cdap_interaction_coefficients coefficient labels with numeric values @@ -152,21 +154,23 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): persons["cdap_activity"] = choices expressions.assign_columns( + whale, df=persons, 
model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) # - annotate households table households = households.to_frame() expressions.assign_columns( + whale, df=households, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households) + whale.add_table("households", households) tracing.print_summary("cdap_activity", persons.cdap_activity, value_counts=True) logger.info( diff --git a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index fe79d3fcd..4ca3ce11b 100644 --- a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -10,13 +10,13 @@ from activitysim.abm.models import initialize, location_choice from activitysim.abm.models.util import estimation, tour_destination from activitysim.abm.tables import shadow_pricing -from activitysim.core import config, inject, los, pipeline, tracing, util +from activitysim.core import config, inject, los, tracing, util, workflow from activitysim.core.expressions import assign_columns logger = logging.getLogger(__name__) -def read_disaggregate_accessibility_yaml(file_name): +def read_disaggregate_accessibility_yaml(whale: workflow.Whale, file_name): """ Adds in default table suffixes 'proto_' if not defined in the settings file """ @@ -39,21 +39,22 @@ def read_disaggregate_accessibility_yaml(file_name): size = model_settings.get(sample, 0) if size > 0 and size < 1: model_settings[sample] = round( - size * len(pipeline.get_table("land_use").index) + size * len(whale.get_dataframe("land_use").index) ) return model_settings class ProtoPop: - def __init__(self, network_los, chunk_size): + def __init__(self, whale: workflow.Whale, network_los, chunk_size): + self.whale = whale # Run necessary inits for later - initialize.initialize_landuse() + initialize.initialize_landuse(whale) # Initialization self.proto_pop = {} self.zone_list = [] - self.land_use = pipeline.get_table("land_use") + self.land_use = whale.get_dataframe("land_use") self.network_los = network_los self.chunk_size = chunk_size self.model_settings = read_disaggregate_accessibility_yaml( @@ -77,7 +78,7 @@ def __init__(self, network_los, chunk_size): ) ) self.inject_tables() - self.annotate_tables() + self.annotate_tables(whale) self.merge_persons() # - initialize shadow_pricing size tables after annotating household and person tables @@ -87,7 +88,7 @@ def __init__(self, network_los, chunk_size): if add_size_tables: # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning) shadow_pricing._add_size_tables( - self.model_settings.get("suffixes"), scale=False + whale, self.model_settings.get("suffixes"), scale=False ) def zone_sampler(self): @@ -164,7 +165,7 @@ def zone_sampler(self): ), "K-Means only implemented for 2-zone systems for now" # Performs a simple k-means clustering using centroid XY coordinates - centroids_df = pipeline.get_table("maz_centroids") + centroids_df = self.whale.get_dataframe("maz_centroids") # Assert that land_use zone ids is subset of centroid zone ids assert set(self.land_use.index).issubset(set(centroids_df.index)) @@ -463,7 +464,7 @@ def create_proto_pop(self): if self.model_settings.get("FROM_TEMPLATES"): table_params = {k: 
self.params.get(k) for k in klist} tables = { - k: pd.read_csv(config.config_file_path(v.get("file"))) + k: pd.read_csv(whale.filesystem.get_config_file_path(v.get("file"))) for k, v in table_params.items() } households, persons, tours = self.expand_template_zones(tables) @@ -524,18 +525,18 @@ def inject_tables(self): ) for tablename, df in self.proto_pop.items(): inject.add_table(tablename, df) - pipeline.get_rn_generator().add_channel(tablename, df) + self.whale.get_rn_generator().add_channel(tablename, df) tracing.register_traceable_table(tablename, df) - # pipeline.get_rn_generator().drop_channel(tablename) - def annotate_tables(self): + def annotate_tables(self, whale: workflow.Whale): # Extract annotations for annotations in self.model_settings["annotate_proto_tables"]: tablename = annotations["tablename"] - df = pipeline.get_table(tablename) + df = self.whale.get_dataframe(tablename) assert df is not None assert annotations is not None assign_columns( + whale, df=df, model_settings={ **annotations["annotate"], @@ -543,11 +544,11 @@ def annotate_tables(self): }, trace_label=tracing.extend_trace_label("ProtoPop.annotate", tablename), ) - pipeline.replace_table(tablename, df) + self.whale.add_table(tablename, df) def merge_persons(self): - persons = pipeline.get_table("proto_persons") - households = pipeline.get_table("proto_households") + persons = self.whale.get_dataframe("proto_persons") + households = self.whale.get_dataframe("proto_households") # For dropping any extra columns created during merge cols_to_use = households.columns.difference(persons.columns) @@ -569,13 +570,15 @@ def merge_persons(self): inject.add_table("proto_persons_merged", persons_merged) -def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): +def get_disaggregate_logsums( + whale: workflow.Whale, network_los, chunk_size, trace_hh_id +): logsums = {} - persons_merged = pipeline.get_table("proto_persons_merged").sort_index( + persons_merged = whale.get_dataframe("proto_persons_merged").sort_index( inplace=False ) disagg_model_settings = read_disaggregate_accessibility_yaml( - "disaggregate_accessibility.yaml" + whale, "disaggregate_accessibility.yaml" ) for model_name in [ @@ -616,6 +619,7 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): # run location choice and return logsums _logsums, _ = location_choice.run_location_choice( + whale, choosers, network_los, shadow_price_calculator=spc, @@ -638,7 +642,7 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): ) else: - tours = pipeline.get_table("proto_tours") + tours = whale.get_dataframe("proto_tours") tours = tours[tours.tour_category == "non_mandatory"] _logsums, _ = tour_destination.run_tour_destination( @@ -670,15 +674,17 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): return logsums -@inject.step() -def initialize_proto_population(network_los, chunk_size): +@workflow.step +def initialize_proto_population(whale: workflow.Whale, network_los, chunk_size): # Synthesize the proto-population - ProtoPop(network_los, chunk_size) + ProtoPop(whale, network_los, chunk_size) return -@inject.step() -def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): +@workflow.step +def compute_disaggregate_accessibility( + whale: workflow.Whale, network_los, chunk_size, trace_hh_id +): """ Compute enhanced disaggregate accessibility for user specified population segments, as well as each zone in land use file using expressions from accessibility_spec. 
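The hunks above and below follow the same mechanical pattern applied throughout this refactoring patch: model steps stop relying on orca-style global injection and instead receive the pipeline state explicitly as a `whale` argument. A minimal sketch of that pattern, assuming only the `workflow.step`, `whale.get_dataframe`, and `whale.add_table` calls that appear in these diffs (the step name and the column logic here are illustrative stand-ins, not part of the patch):

    from activitysim.core import workflow

    @workflow.step
    def example_model(whale: workflow.Whale, chunk_size, trace_hh_id):
        # pull a table from the explicit pipeline state instead of an injected orca table
        persons = whale.get_dataframe("persons")

        # ... model logic would go here ...
        persons["example_flag"] = False

        # write the result back through the state object
        # (previously pipeline.replace_table("persons", persons))
        whale.add_table("persons", persons)

Under the old convention the same step would have been an `@inject.step()` function taking `persons` as an argument and calling `pipeline.replace_table("persons", persons)`.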
@@ -689,15 +695,15 @@ def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): for tablename in ["proto_households", "proto_persons", "proto_tours"]: df = inject.get_table(tablename).to_frame() traceables = inject.get_injectable("traceable_tables") - if tablename not in pipeline.get_rn_generator().channels: - pipeline.get_rn_generator().add_channel(tablename, df) + if tablename not in whale.get_rn_generator().channels: + whale.get_rn_generator().add_channel(tablename, df) if tablename not in traceables: inject.add_injectable("traceable_tables", traceables + [tablename]) tracing.register_traceable_table(tablename, df) del df # Run location choice - logsums = get_disaggregate_logsums(network_los, chunk_size, trace_hh_id) + logsums = get_disaggregate_logsums(whale, network_los, chunk_size, trace_hh_id) logsums = {k + "_accessibility": v for k, v in logsums.items()} # Combined accessibility table @@ -726,7 +732,7 @@ def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): # Merge in the proto pop data and inject it access_df = ( access_df.merge( - pipeline.get_table("proto_persons_merged").reset_index(), + whale.get_dataframe("proto_persons_merged").reset_index(), on="proto_household_id", ) .set_index("proto_person_id") @@ -740,24 +746,24 @@ def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): "school_destination_size", "workplace_destination_size", ]: - pipeline.drop_table(tablename) + whale.drop_table(tablename) - for ch in list(pipeline.get_rn_generator().channels.keys()): - pipeline.get_rn_generator().drop_channel(ch) + for ch in list(whale.get_rn_generator().channels.keys()): + whale.get_rn_generator().drop_channel(ch) # Drop any prematurely added traceables for trace in [ x for x in inject.get_injectable("traceable_tables") if "proto_" not in x ]: - tracing.deregister_traceable_table(trace) + tracing.deregister_traceable_table(whale, trace) - # need to clear any premature tables that were added during the previous run - orca._TABLES.clear() - for name, func in inject._DECORATED_TABLES.items(): - logger.debug("reinject decorated table %s" % name) - orca.add_table(name, func) + # # need to clear any premature tables that were added during the previous run + # orca._TABLES.clear() + # for name, func in inject._DECORATED_TABLES.items(): + # logger.debug("reinject decorated table %s" % name) + # orca.add_table(name, func) # Inject accessibility results into pipeline - [inject.add_table(k, df) for k, df in logsums.items()] + [whale.add_table(k, df) for k, df in logsums.items()] return diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index 086422ba2..c9f891186 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -2,15 +2,16 @@ # See full license in LICENSE.txt. 
import logging -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation +from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger(__name__) -@inject.step() -def free_parking(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def free_parking( + whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id +): """ """ trace_label = "free_parking" @@ -28,12 +29,12 @@ def free_parking(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -42,7 +43,9 @@ def free_parking(persons_merged, persons, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -79,7 +82,7 @@ def free_parking(persons_merged, persons, chunk_size, trace_hh_id): choices.reindex(persons.index).fillna(0).astype(bool) ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "free_parking", persons.free_parking_at_work, value_counts=True diff --git a/activitysim/abm/models/initialize.py b/activitysim/abm/models/initialize.py index 2c8c10655..465a95147 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -4,13 +4,13 @@ import os import warnings -import pandas as pd - -from ...core import chunk, config, expressions, inject, mem, pipeline, tracing -from ...core.pipeline import Whale -from ...core.steps.output import track_skim_usage, write_data_dictionary, write_tables -from ...core.workflow import workflow_step -from ..tables import disaggregate_accessibility, shadow_pricing +from activitysim.abm.tables import disaggregate_accessibility, shadow_pricing +from activitysim.core import chunk, config, expressions, inject, tracing, workflow +from activitysim.core.steps.output import ( + track_skim_usage, + write_data_dictionary, + write_tables, +) # We are using the naming conventions in the mtc_asim.h5 example # file for our default list. This provides backwards compatibility @@ -29,12 +29,12 @@ logger = logging.getLogger(__name__) -def annotate_tables(whale, model_settings, trace_label, chunk_sizer): +def annotate_tables(whale: workflow.Whale, model_settings, trace_label, chunk_sizer): """ Parameters ---------- - whale : Whale + whale : workflow.Whale model_settings : trace_label : str chunk_sizer : ChunkSizer @@ -100,8 +100,8 @@ def annotate_tables(whale, model_settings, trace_label, chunk_sizer): chunk_sizer.log_df(trace_label, tablename, None) -@workflow_step -def initialize_landuse(whale): +@workflow.step +def initialize_landuse(whale: workflow.Whale): """ Initialize the land use table. 
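Alongside the injection changes, chunk bookkeeping now runs through a context manager on the state object: a step opens a chunk log and records the tables it creates against it. A minimal sketch of that usage, based on the `whale.chunk_log(...)` and `chunk_sizer.log_df(...)` calls visible in the surrounding initialize hunks (the table built here is an illustrative stand-in):

    import pandas as pd

    from activitysim.core import workflow

    @workflow.step
    def example_initialize(whale: workflow.Whale):
        trace_label = "example_initialize"
        with whale.chunk_log(trace_label, base=True) as chunk_sizer:
            # build or load a table (stand-in data)
            land_use = pd.DataFrame({"zone_id": [1, 2, 3]}).set_index("zone_id")
            whale.add_table("land_use", land_use)
            # record the table against the chunk log so its memory footprint is tracked
            chunk_sizer.log_df(trace_label, "land_use", land_use)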
@@ -130,8 +130,8 @@ def initialize_landuse(whale): chunk_sizer.log_df(trace_label, "accessibility", accessibility) -@workflow_step -def initialize_households(whale): +@workflow.step +def initialize_households(whale: workflow.Whale): trace_label = "initialize_households" with whale.chunk_log(trace_label, base=True) as chunk_sizer: diff --git a/activitysim/abm/models/initialize_los.py b/activitysim/abm/models/initialize_los.py index 2649c09a3..6804453ef 100644 --- a/activitysim/abm/models/initialize_los.py +++ b/activitysim/abm/models/initialize_los.py @@ -8,19 +8,8 @@ import numba import numpy as np -import pandas as pd - -from activitysim.core import ( - assign, - chunk, - config, - inject, - los, - pathbuilder, - pipeline, - simulate, - tracing, -) + +from activitysim.core import chunk, inject, los, pathbuilder, tracing, workflow logger = logging.getLogger(__name__) @@ -55,21 +44,19 @@ def num_nans(data): def any_uninitialized(data, lock=None): - with lock_data(lock): result = any_nans(data) return result def num_uninitialized(data, lock=None): - with lock_data(lock): result = num_nans(data) return result -@inject.step() -def initialize_los(network_los): +@workflow.step +def initialize_los(whale: workflow.Whale, network_los): """ Currently, this step is only needed for THREE_ZONE systems in which the tap_tap_utilities are precomputed in the (presumably subsequent) initialize_tvpb step. @@ -84,13 +71,12 @@ def initialize_los(network_los): trace_label = "initialize_los" if network_los.zone_system == los.THREE_ZONE: - tap_cache = network_los.tvpb.tap_cache uid_calculator = network_los.tvpb.uid_calculator attribute_combinations_df = uid_calculator.scalar_attribute_combinations() # - write table to pipeline (so we can slice it, when multiprocessing) - pipeline.replace_table("attribute_combinations", attribute_combinations_df) + whale.add_table("attribute_combinations", attribute_combinations_df) # clean up any unwanted cache files from previous run if network_los.rebuild_tvpb_cache: @@ -118,9 +104,8 @@ def initialize_los(network_los): def compute_utilities_for_attribute_tuple( - network_los, scalar_attributes, data, chunk_size, trace_label + whale, network_los, scalar_attributes, data, chunk_size, trace_label ): - # scalar_attributes is a dict of attribute name/value pairs for this combination # (e.g. {'demographic_segment': 0, 'tod': 'AM', 'access_mode': 'walk'}) @@ -151,7 +136,7 @@ def compute_utilities_for_attribute_tuple( chunk_tag = "initialize_tvpb" # all attribute_combinations can use same cached data for row_size calc for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag + whale, choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag ): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities if chooser_chunk._is_view: @@ -194,8 +179,10 @@ def compute_utilities_for_attribute_tuple( logger.debug(f"{trace_label} updated utilities") -@inject.step() -def initialize_tvpb(network_los, attribute_combinations, chunk_size): +@workflow.step +def initialize_tvpb( + whale: workflow.Whale, network_los, attribute_combinations, chunk_size +): """ Initialize STATIC tap_tap_utility cache and write mmap to disk. 
@@ -258,7 +245,7 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): tuple_trace_label = tracing.extend_trace_label(trace_label, f"offset{offset}") compute_utilities_for_attribute_tuple( - network_los, scalar_attributes, data, chunk_size, tuple_trace_label + whale, network_los, scalar_attributes, data, chunk_size, tuple_trace_label ) # make sure we populated the entire offset @@ -271,7 +258,6 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): write_results = not multiprocess or inject.get_injectable("locutor", False) if write_results: - if multiprocess: # if multiprocessing, wait for all processes to fully populate share data before writing results # (the other processes don't have to wait, since we were sliced by attribute combination diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index 476a53feb..c8607fc27 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -7,7 +7,7 @@ import pandas as pd from activitysim.abm.models.util import tour_frequency as tf -from activitysim.core import config, expressions, inject, pipeline, tracing +from activitysim.core import config, expressions, inject, tracing, workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -74,12 +74,13 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): return patched_tours -@inject.step() -def initialize_tours(network_los, households, persons, trace_hh_id): - +@workflow.step +def initialize_tours( + whale: workflow.Whale, network_los, households, persons, trace_hh_id +): trace_label = "initialize_tours" - tours = read_input_table("tours") + tours = read_input_table(whale, "tours") # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after. # FIXME could just always slice... 
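When a step creates a new table, the refactored code pairs the table write with registering the table as a channel on the pipeline's random-number generator, so downstream models can draw reproducible random numbers for its rows. A short sketch of that pairing, using the `whale.get_rn_generator().add_channel(...)` call that appears in the surrounding tour hunks (the tours frame, the step name, and the use of `whale.add_table` for the write are stand-ins; newly created tour tables in this patch also go through `whale.extend_table` or `inject.add_table`):

    import pandas as pd

    from activitysim.core import workflow

    @workflow.step
    def example_make_tours(whale: workflow.Whale):
        # illustrative stand-in for a freshly created table
        tours = pd.DataFrame(
            {"person_id": [1, 1, 2], "tour_type": ["work", "othmaint", "work"]}
        )
        tours.index.name = "tour_id"

        whale.add_table("tours", tours)
        # register the new table as a channel so it gets its own reproducible random stream
        whale.get_rn_generator().add_channel("tours", tours)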
@@ -95,6 +96,7 @@ def initialize_tours(network_los, households, persons, trace_hh_id): # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above model_settings = config.read_model_settings("initialize_tours.yaml", mandatory=True) expressions.assign_columns( + whale, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), @@ -110,7 +112,7 @@ def initialize_tours(network_los, households, persons, trace_hh_id): # replace table function with dataframe inject.add_table("tours", tours) - pipeline.get_rn_generator().add_channel("tours", tours) + whale.get_rn_generator().add_channel("tours", tours) tracing.register_traceable_table("tours", tours) diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index a041fb9e8..681d9e217 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -4,22 +4,23 @@ import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.overlap import hh_time_window_overlap +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.overlap import hh_time_window_overlap +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger(__name__) -def add_null_results(trace_label, tours): +def add_null_results(whale, trace_label, tours): logger.info("Skipping %s: add_null_results" % trace_label) tours["composition"] = "" - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) -@inject.step() -def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_composition( + whale: workflow.Whale, tours, households, persons, chunk_size, trace_hh_id +): """ This model predicts the makeup of the travel party (adults, children, or mixed). 
""" @@ -31,7 +32,7 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # - if no joint tours if joint_tours.shape[0] == 0: - add_null_results(trace_label, tours) + add_null_results(whale, trace_label, tours) return model_settings = config.read_model_settings(model_settings_file_name) @@ -51,13 +52,13 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # - run preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "persons": persons, "hh_time_window_overlap": hh_time_window_overlap, } expressions.assign_columns( + whale, df=households, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -71,7 +72,9 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # - simple_simulate model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -107,7 +110,7 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # reindex since we ran model on a subset of households tours["composition"] = choices.reindex(tours.index).fillna("").astype(str) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) tracing.print_summary( "joint_tour_composition", joint_tours.composition, value_counts=True diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 02651d2a4..5c5c2927e 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -4,7 +4,7 @@ import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing +from activitysim.core import config, inject, tracing, workflow from activitysim.core.util import assign_in_place from .util import estimation, tour_destination @@ -12,11 +12,16 @@ logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def joint_tour_destination( - tours, persons_merged, households_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, + tours, + persons_merged, + households_merged, + network_los, + chunk_size, + trace_hh_id, ): - """ Given the tour generation from the above, each tour needs to have a destination, so in this case tours are the choosers (with the associated @@ -85,7 +90,7 @@ def joint_tour_destination( # add column as we want joint_tours table for tracing. 
joint_tours["destination"] = choices_df.choice assign_in_place(tours, joint_tours[["destination"]]) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) if want_logsums: joint_tours[logsum_column_name] = choices_df["logsum"] @@ -96,7 +101,7 @@ def joint_tour_destination( if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: tracing.trace_df(joint_tours, label="joint_tour_destination.joint_tours") diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index 103964665..f8cdd98a6 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -5,17 +5,18 @@ import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.overlap import hh_time_window_overlap -from .util.tour_frequency import process_joint_tours +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.overlap import hh_time_window_overlap +from activitysim.abm.models.util.tour_frequency import process_joint_tours +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger(__name__) -@inject.step() -def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_frequency( + whale: workflow.Whale, households, persons, chunk_size, trace_hh_id +): """ This model predicts the frequency of making fully joint trips (see the alternatives above). 
@@ -28,7 +29,7 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): model_settings = config.read_model_settings(model_settings_file_name) alternatives = simulate.read_model_alts( - "joint_tour_frequency_alternatives.csv", set_index="alt" + whale, "joint_tour_frequency_alternatives.csv", set_index="alt" ) # - only interested in households with more than one cdap travel_active person and @@ -49,22 +50,24 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "persons": persons, "hh_time_window_overlap": hh_time_window_overlap, } expressions.assign_columns( + whale, df=multi_person_households, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -110,10 +113,10 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): joint_tours = process_joint_tours(choices, alternatives, temp_point_persons) - tours = pipeline.extend_table("tours", joint_tours) + tours = whale.extend_table("tours", joint_tours) - tracing.register_traceable_table("tours", joint_tours) - pipeline.get_rn_generator().add_channel("tours", joint_tours) + tracing.register_traceable_table(whale, "tours", joint_tours) + whale.get_rn_generator().add_channel("tours", joint_tours) # - annotate households @@ -131,7 +134,7 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): .astype(np.int8) ) - pipeline.replace_table("households", households) + whale.add_table("households", households) tracing.print_summary( "joint_tour_frequency", households.joint_tour_frequency, value_counts=True diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 939d9fedd..d9806a076 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -4,27 +4,24 @@ import pandas as pd +from activitysim.abm.models.util import estimation from activitysim.abm.models.util.canonical_ids import MAX_PARTICIPANT_PNUM +from activitysim.abm.models.util.overlap import person_time_window_overlap from activitysim.core import ( - chunk, config, expressions, inject, logit, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util.overlap import person_time_window_overlap - logger = logging.getLogger(__name__) def joint_tour_participation_candidates(joint_tours, persons_merged): - # - only interested in persons from households with joint_tours persons_merged = persons_merged[persons_merged.num_hh_joint_tours > 0] @@ -73,11 +70,9 @@ def joint_tour_participation_candidates(joint_tours, persons_merged): def get_tour_satisfaction(candidates, participate): - tour_ids = candidates.tour_id.unique() if participate.any(): - candidates = candidates[participate] # if this happens, we would need to filter them out! 
@@ -183,7 +178,6 @@ def participants_chooser(probs, choosers, spec, trace_label): iter = 0 while candidates.shape[0] > 0: - iter += 1 if iter > MAX_ITERATIONS: @@ -201,7 +195,7 @@ def participants_chooser(probs, choosers, spec, trace_label): assert False choices, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + whale, probs, trace_label=trace_label, trace_choosers=choosers ) participate = choices == PARTICIPATE_CHOICE @@ -210,7 +204,6 @@ def participants_chooser(probs, choosers, spec, trace_label): num_tours_satisfied_this_iter = tour_satisfaction.sum() if num_tours_satisfied_this_iter > 0: - num_tours_remaining -= num_tours_satisfied_this_iter satisfied = reindex(tour_satisfaction, candidates.tour_id) @@ -246,18 +239,18 @@ def participants_chooser(probs, choosers, spec, trace_label): def annotate_jtp(model_settings, trace_label): - # - annotate persons persons = inject.get_table("persons").to_frame() expressions.assign_columns( + whale, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) -def add_null_results(model_settings, trace_label): +def add_null_results(whale, model_settings, trace_label): logger.info("Skipping %s: joint tours", trace_label) # participants table is used downstream in non-joint tour expressions @@ -265,14 +258,16 @@ def add_null_results(model_settings, trace_label): participants = pd.DataFrame(columns=PARTICIPANT_COLS) participants.index.name = "participant_id" - pipeline.replace_table("joint_tour_participants", participants) + whale.add_table("joint_tour_participants", participants) # - run annotations annotate_jtp(model_settings, trace_label) -@inject.step() -def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_participation( + whale: workflow.Whale, tours, persons_merged, chunk_size, trace_hh_id +): """ Predicts for each eligible person to participate or not participate in each joint tour. 
""" @@ -285,7 +280,7 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): # - if no joint tours if joint_tours.shape[0] == 0: - add_null_results(model_settings, trace_label) + add_null_results(whale, model_settings, trace_label) return persons_merged = persons_merged.to_frame() @@ -293,7 +288,7 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): # - create joint_tour_participation_candidates table candidates = joint_tour_participation_candidates(joint_tours, persons_merged) tracing.register_traceable_table("joint_tour_participants", candidates) - pipeline.get_rn_generator().add_channel("joint_tour_participants", candidates) + whale.get_rn_generator().add_channel("joint_tour_participants", candidates) logger.info( "Running joint_tours_participation with %d potential participants (candidates)" @@ -303,13 +298,13 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "person_time_window_overlap": person_time_window_overlap, "persons": persons_merged, } expressions.assign_columns( + whale, df=candidates, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -322,7 +317,9 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -400,10 +397,10 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): + 1 ) - pipeline.replace_table("joint_tour_participants", participants) + whale.add_table("joint_tour_participants", participants) # drop channel as we aren't using any more (and it has candidates that weren't chosen) - pipeline.get_rn_generator().drop_channel("joint_tour_participants") + whale.get_rn_generator().drop_channel("joint_tour_participants") # - assign joint tour 'point person' (participant_num == 1) point_persons = participants[participants.participant_num == 1] @@ -414,7 +411,7 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): assign_in_place(tours, joint_tours[["person_id", "number_of_participants"]]) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # - run annotations annotate_jtp(model_settings, trace_label) diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index ca56327d0..f2fb41796 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -4,17 +4,20 @@ import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.vectorize_tour_scheduling import ( + vectorize_joint_tour_scheduling, +) +from activitysim.core import config, expressions, inject, simulate, tracing, workflow from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util.vectorize_tour_scheduling import vectorize_joint_tour_scheduling - logger = logging.getLogger(__name__) -@inject.step() 
-def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_scheduling( + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id +): """ This model predicts the departure time and duration of each joint tour """ @@ -54,12 +57,12 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=joint_tours, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -73,7 +76,9 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip", False) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) if estimator: estimator.write_model_settings(model_settings, model_settings_file_name) @@ -82,6 +87,7 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ timetable.begin_transaction(estimator) choices = vectorize_joint_tour_scheduling( + whale, joint_tours, joint_tour_participants, persons_merged, @@ -126,7 +132,7 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # updated df for tracing joint_tours = tours[tours.tour_category == "joint"] diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index 1a7564942..fa28b1e7f 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -5,23 +5,14 @@ import numpy as np import pandas as pd +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util import logsums as logsum +from activitysim.abm.models.util import tour_destination from activitysim.abm.tables import shadow_pricing -from activitysim.core import ( - config, - expressions, - inject, - los, - pipeline, - simulate, - tracing, -) +from activitysim.core import expressions, inject, los, simulate, tracing, workflow from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate -from .util import estimation -from .util import logsums as logsum -from .util import tour_destination - # import multiprocessing @@ -80,7 +71,7 @@ ALT_LOGSUM = "mode_choice_logsum" -def write_estimation_specs(estimator, model_settings, settings_file): +def write_estimation_specs(whale, estimator, model_settings, settings_file): """ write sample_spec, spec, and coefficients to estimation data bundle @@ -96,14 +87,13 @@ def write_estimation_specs(estimator, model_settings, settings_file): estimator.write_coefficients(model_settings=model_settings) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) def 
_location_sample( + whale, segment_name, choosers, alternatives, @@ -138,7 +128,7 @@ def _location_sample( logger.info("Running %s with %d persons" % (trace_label, len(choosers.index))) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if whale.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -155,10 +145,11 @@ def _location_sample( "dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", } - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) locals_d.update(constants) spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SAMPLE_SPEC", segment_name=segment_name, @@ -166,9 +157,10 @@ def _location_sample( ) # here since presumably we want this when called for either sample or presample - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample( + whale, choosers, alternatives, spec=spec, @@ -187,6 +179,7 @@ def _location_sample( def location_sample( + whale, segment_name, persons_merged, network_los, @@ -197,7 +190,6 @@ def location_sample( chunk_tag, trace_label, ): - # FIXME - MEMORY HACK - only include columns actually used in spec chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] choosers = persons_merged[chooser_columns] @@ -212,6 +204,7 @@ def location_sample( alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _location_sample( + whale, segment_name, choosers, dest_size_terms, @@ -316,6 +309,7 @@ def aggregate_size_terms(dest_size_terms, network_los, model_settings): def location_presample( + whale, segment_name, persons_merged, network_los, @@ -326,7 +320,6 @@ def location_presample( chunk_tag, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") logger.info(f"{trace_label} location_presample") @@ -359,6 +352,7 @@ def location_presample( skims = skim_dict.wrap(HOME_TAZ, DEST_TAZ) taz_sample = _location_sample( + whale, segment_name, choosers, TAZ_size_terms, @@ -382,7 +376,7 @@ def location_presample( # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total maz_choices = tour_destination.choose_MAZ_for_TAZ( - taz_sample, MAZ_size_terms, trace_label + whale, taz_sample, MAZ_size_terms, trace_label ) assert DEST_MAZ in maz_choices @@ -392,6 +386,7 @@ def location_presample( def run_location_sample( + whale, segment_name, persons_merged, network_los, @@ -428,7 +423,7 @@ def run_location_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not whale.setting.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -436,13 +431,13 @@ def run_location_sample( ) if pre_sample_taz: - logger.info( "Running %s location_presample with %d persons" % (trace_label, len(persons_merged)) ) choices = location_presample( + whale, segment_name, persons_merged, network_los, @@ -455,8 +450,8 @@ def run_location_sample( ) else: - choices = location_sample( + whale, segment_name, persons_merged, network_los, @@ -472,6 +467,7 @@ def 
run_location_sample( def run_location_logsums( + whale, segment_name, persons_merged_df, network_los, @@ -504,7 +500,9 @@ def run_location_logsums( assert not location_sample_df.empty - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) # FIXME - MEMORY HACK - only include columns actually used in spec persons_merged_df = logsum.filter_chooser_columns( @@ -522,6 +520,7 @@ def run_location_logsums( tour_purpose = tour_purpose[segment_name] logsums = logsum.compute_logsums( + whale, choosers, tour_purpose, logsum_settings, @@ -542,6 +541,7 @@ def run_location_logsums( def run_location_simulate( + whale, segment_name, persons_merged, location_sample_df, @@ -600,7 +600,7 @@ def run_location_simulate( "dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", } - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) if constants is not None: locals_d.update(constants) @@ -614,7 +614,7 @@ def run_location_simulate( model_settings, spec_id="SPEC", segment_name=segment_name, estimator=estimator ) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample_simulate( choosers, @@ -644,6 +644,7 @@ def run_location_simulate( def run_location_choice( + whale, persons_merged_df, network_los, shadow_price_calculator, @@ -694,7 +695,6 @@ def run_location_choice( choices_list = [] sample_list = [] for segment_name, segment_id in segment_ids.items(): - choosers = persons_merged_df[ persons_merged_df[chooser_segment_column] == segment_id ] @@ -712,6 +712,7 @@ def run_location_choice( # - location_sample location_sample_df = run_location_sample( + whale, segment_name, choosers, network_los, @@ -727,6 +728,7 @@ def run_location_choice( # - location_logsums location_sample_df = run_location_logsums( + whale, segment_name, choosers, network_los, @@ -741,6 +743,7 @@ def run_location_choice( # - location_simulate choices_df = run_location_simulate( + whale, segment_name, choosers, location_sample_df, @@ -835,6 +838,7 @@ def run_location_choice( def iterate_location_choice( + whale, model_settings, persons_merged, persons, @@ -881,11 +885,10 @@ def iterate_location_choice( sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - persons_merged_df = persons_merged.to_frame() + persons_merged_df = persons_merged persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]] @@ -901,7 +904,7 @@ def iterate_location_choice( chooser_segment_column in persons_merged_df ), f"CHOOSER_SEGMENT_COLUMN '{chooser_segment_column}' not in persons_merged table." 
- spc = shadow_pricing.load_shadow_price_calculator(model_settings) + spc = shadow_pricing.load_shadow_price_calculator(whale, model_settings) max_iterations = spc.max_iterations assert not (spc.use_shadow_pricing and estimator) @@ -910,11 +913,10 @@ def iterate_location_choice( choices_df = None # initialize to None, will be populated in first iteration for iteration in range(1, max_iterations + 1): - persons_merged_df_ = persons_merged_df.copy() if spc.use_shadow_pricing and iteration > 1: - spc.update_shadow_prices() + spc.update_shadow_prices(whale) if spc.shadow_settings["SHADOW_PRICE_METHOD"] == "simulation": # filter from the sampled persons @@ -924,6 +926,7 @@ def iterate_location_choice( persons_merged_df_ = persons_merged_df_.sort_index() choices_df_, save_sample_df = run_location_choice( + whale, persons_merged_df_, network_los, shadow_price_calculator=spc, @@ -1012,35 +1015,36 @@ def iterate_location_choice( # might be None for tiny samples even if sample_table_name was specified assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # lest they try to put school and workplace samples into the same table - if pipeline.is_table(sample_table_name): + if whale.is_table(sample_table_name): raise RuntimeError( "dest choice sample table %s already exists" % sample_table_name ) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) # - annotate persons table if "annotate_persons" in model_settings: expressions.assign_columns( + whale, df=persons_df, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons_df) + whale.add_table("persons", persons_df) if trace_hh_id: tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if "annotate_households" in model_settings: - households_df = households.to_frame() expressions.assign_columns( + whale, df=households_df, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households_df) + whale.add_table("households", households_df) if trace_hh_id: tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) @@ -1053,9 +1057,16 @@ def iterate_location_choice( return persons_df -@inject.step() +@workflow.step def workplace_location( - persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor + whale: workflow.Whale, + persons_merged, + persons, + households, + network_los, + chunk_size, + trace_hh_id, + locutor, ): """ workplace location choice model @@ -1064,7 +1075,7 @@ def workplace_location( """ trace_label = "workplace_location" - model_settings = config.read_model_settings("workplace_location.yaml") + model_settings = whale.filesystem.read_model_settings("workplace_location.yaml") estimator = estimation.manager.begin_estimation("workplace_location") if estimator: @@ -1076,7 +1087,7 @@ def workplace_location( # raise RuntimeError(f"fake fail {process_name}") # disable locutor for benchmarking - if config.setting("benchmarking", False): + if whale.settings.benchmarking: locutor = False iterate_location_choice( @@ -1096,9 +1107,15 @@ def workplace_location( estimator.end_estimation() -@inject.step() +@workflow.step def school_location( - persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor + whale: workflow.Whale, + 
persons_merged, + persons, + households, + network_los, + chunk_size, + locutor, ): """ School location choice model @@ -1107,17 +1124,18 @@ def school_location( """ trace_label = "school_location" - model_settings = config.read_model_settings("school_location.yaml") + model_settings = whale.filesystem.read_model_settings("school_location.yaml") - estimator = estimation.manager.begin_estimation("school_location") + estimator = estimation.manager.begin_estimation(whale, "school_location") if estimator: write_estimation_specs(estimator, model_settings, "school_location.yaml") # disable locutor for benchmarking - if config.setting("benchmarking", False): + if whale.settings.benchmarking: locutor = False iterate_location_choice( + whale, model_settings, persons_merged, persons, @@ -1125,7 +1143,7 @@ def school_location( network_los, estimator, chunk_size, - trace_hh_id, + whale.settings.trace_hh_id, locutor, trace_label, ) diff --git a/activitysim/abm/models/mandatory_scheduling.py b/activitysim/abm/models/mandatory_scheduling.py index 6a9618874..fbed6b27d 100644 --- a/activitysim/abm/models/mandatory_scheduling.py +++ b/activitysim/abm/models/mandatory_scheduling.py @@ -2,24 +2,20 @@ # See full license in LICENSE.txt. import logging -import pandas as pd - -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util.tour_scheduling import run_tour_scheduling from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util import vectorize_tour_scheduling as vts -from .util.tour_scheduling import run_tour_scheduling - logger = logging.getLogger(__name__) DUMP = False -@inject.step() -def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): +@workflow.step +def mandatory_tour_scheduling( + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id +): """ This model predicts the departure time and duration of each activity for mandatory tours """ @@ -64,7 +60,7 @@ def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # updated df for tracing mandatory_tours = tours[tours.tour_category == "mandatory"] diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index 727a591f0..3b072f369 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -4,15 +4,14 @@ import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.tour_frequency import process_mandatory_tours +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.tour_frequency import process_mandatory_tours +from activitysim.core import config, expressions, inject, simulate, tracing, workflow logger = logging.getLogger(__name__) -def add_null_results(trace_label, mandatory_tour_frequency_settings): +def add_null_results(whale, trace_label, mandatory_tour_frequency_settings): logger.info("Skipping %s: add_null_results", trace_label) persons = inject.get_table("persons").to_frame() @@ -23,19 +22,22 @@ def add_null_results(trace_label, mandatory_tour_frequency_settings): tours["tour_type"] = None 
tours["person_id"] = None tours.index.name = "tour_id" - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) expressions.assign_columns( + whale, df=persons, model_settings=mandatory_tour_frequency_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) -@inject.step() -def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): +@workflow.step +def mandatory_tour_frequency( + whale: workflow.Whale, persons_merged, chunk_size, trace_hh_id +): """ This model predicts the frequency of making mandatory trips (see the alternatives above) - these trips include work and school in some combination. @@ -52,16 +54,16 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): # - if no mandatory tours if choosers.shape[0] == 0: - add_null_results(trace_label, model_settings) + add_null_results(whale, trace_label, model_settings) return # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = {} expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -72,7 +74,9 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -112,7 +116,7 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): the same as got non_mandatory_tours except trip types are "work" and "school" """ alternatives = simulate.read_model_alts( - "mandatory_tour_frequency_alternatives.csv", set_index="alt" + whale, "mandatory_tour_frequency_alternatives.csv", set_index="alt" ) choosers["mandatory_tour_frequency"] = choices.reindex(choosers.index) @@ -120,9 +124,9 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): persons=choosers, mandatory_tour_frequency_alts=alternatives ) - tours = pipeline.extend_table("tours", mandatory_tours) + tours = whale.extend_table("tours", mandatory_tours) tracing.register_traceable_table("tours", mandatory_tours) - pipeline.get_rn_generator().add_channel("tours", mandatory_tours) + whale.get_rn_generator().add_channel("tours", mandatory_tours) # - annotate persons persons = inject.get_table("persons").to_frame() @@ -133,12 +137,13 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): ) expressions.assign_columns( + whale, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "mandatory_tour_frequency", persons.mandatory_tour_frequency, value_counts=True diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index adfd4a098..7f400e63e 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -4,20 +4,18 @@ import pandas as pd -from activitysim.core import config, inject, pipeline, 
simulate, tracing +from activitysim.core import config, inject, simulate, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation, tour_destination, annotate - +from .util import annotate, estimation, tour_destination logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def non_mandatory_tour_destination( - tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id ): - """ Given the tour generation from the above, each tour needs to have a destination, so in this case tours are the choosers (with the associated @@ -46,7 +44,7 @@ def non_mandatory_tour_destination( # separating out pure escort school tours # they already have their destination set - if pipeline.is_table("school_escort_tours"): + if whale.is_table("school_escort_tours"): nm_tour_index = non_mandatory_tours.index pure_school_escort_tours = non_mandatory_tours[ (non_mandatory_tours["school_esc_outbound"] == "pure_escort") @@ -98,7 +96,7 @@ def non_mandatory_tour_destination( non_mandatory_tours["destination"] = choices_df.choice # merging back in school escort tours and preserving index - if pipeline.is_table("school_escort_tours"): + if whale.is_table("school_escort_tours"): non_mandatory_tours = pd.concat( [pure_school_escort_tours, non_mandatory_tours] ).set_index(nm_tour_index) @@ -113,7 +111,7 @@ def non_mandatory_tour_destination( ~tours["destination"].isna() ), f"Tours are missing destination: {tours[tours['destination'].isna()]}" - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) if model_settings.get("annotate_tours"): annotate.annotate_tours(model_settings, trace_label) @@ -121,7 +119,7 @@ def non_mandatory_tour_destination( if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/non_mandatory_scheduling.py b/activitysim/abm/models/non_mandatory_scheduling.py index 5b32c9550..7ad7f547e 100644 --- a/activitysim/abm/models/non_mandatory_scheduling.py +++ b/activitysim/abm/models/non_mandatory_scheduling.py @@ -2,21 +2,18 @@ # See full license in LICENSE.txt. 
import logging -import pandas as pd - -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util.tour_scheduling import run_tour_scheduling from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place -from .util.tour_scheduling import run_tour_scheduling logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def non_mandatory_tour_scheduling( - tours, persons_merged, tdd_alts, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id ): """ This model predicts the departure time and duration of each activity for non-mandatory tours @@ -48,7 +45,7 @@ def non_mandatory_tour_scheduling( ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # updated df for tracing non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 93b36b2f4..0b83563b7 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -5,29 +5,22 @@ import numpy as np import pandas as pd -from activitysim.core import ( - config, - expressions, - inject, - logit, - pipeline, - simulate, - tracing, +from activitysim.abm.models.util import annotate, estimation +from activitysim.abm.models.util.overlap import person_max_window +from activitysim.abm.models.util.school_escort_tours_trips import ( + recompute_tour_count_statistics, ) +from activitysim.abm.models.util.tour_frequency import process_non_mandatory_tours +from activitysim.core import config, expressions, logit, simulate, tracing, workflow from activitysim.core.interaction_simulate import interaction_simulate -from .util import estimation -from .util import annotate -from .util.school_escort_tours_trips import recompute_tour_count_statistics - -from .util.overlap import person_max_window -from .util.tour_frequency import process_non_mandatory_tours - logger = logging.getLogger(__name__) -def extension_probs(): - f = config.config_file_path("non_mandatory_tour_frequency_extension_probs.csv") +def extension_probs(whale: workflow.Whale): + f = whale.filesystem.get_config_file_path( + "non_mandatory_tour_frequency_extension_probs.csv" + ) df = pd.read_csv(f, comment="#") # convert cum probs to individual probs @@ -37,7 +30,9 @@ def extension_probs(): return df -def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label): +def extend_tour_counts( + whale: workflow.Whale, persons, tour_counts, alternatives, trace_hh_id, trace_label +): """ extend tour counts based on a probability table @@ -86,10 +81,11 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la logger.info("extend_tour_counts - no persons eligible for tour_count extension") return tour_counts - have_trace_targets = trace_hh_id and tracing.has_trace_targets(extend_tour_counts) + have_trace_targets = trace_hh_id and tracing.has_trace_targets( + whale, extend_tour_counts + ) for i, tour_type in enumerate(alternatives.columns): - i_tour_type = i + 1 # (probs_spec nonmandatory_tour_type column is 1-based) tour_type_trace_label = tracing.extend_trace_label(trace_label, tour_type) @@ -113,6 +109,7 @@ def extend_tour_counts(persons, tour_counts, alternatives, 
trace_hh_id, trace_la # - random choice of extension magnitude based on relative probs choices, rands = logit.make_choices( + whale, choosers[PROBABILITY_COLUMNS], trace_label=tour_type_trace_label, trace_choosers=choosers, @@ -137,8 +134,10 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la return tour_counts -@inject.step() -def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def non_mandatory_tour_frequency( + whale: workflow.Whale, persons, persons_merged, chunk_size, trace_hh_id +): """ This model predicts the frequency of making non-mandatory trips (alternatives for this model come from a separate csv file which is @@ -154,7 +153,7 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i # FIXME kind of tacky both that we know to add this here and del it below # 'tot_tours' is used in model_spec expressions alternatives = simulate.read_model_alts( - "non_mandatory_tour_frequency_alternatives.csv", set_index=None + whale, "non_mandatory_tour_frequency_alternatives.csv", set_index=None ) alternatives["tot_tours"] = alternatives.sum(axis=1) @@ -165,10 +164,10 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = {"person_max_window": person_max_window} expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -185,7 +184,6 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i # segment by person type and pick the right spec for each person type choices_list = [] for segment_settings in spec_segments: - segment_name = segment_settings["NAME"] ptype = segment_settings["PTYPE"] @@ -208,7 +206,7 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i coefficients_df = simulate.read_model_coefficients(segment_settings) segment_spec = simulate.eval_coefficients( - segment_spec, coefficients_df, estimator + whale, segment_spec, coefficients_df, estimator ) if estimator: @@ -238,6 +236,7 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_simulate( + whale, chooser_segment, alternatives, spec=segment_spec, @@ -342,7 +341,6 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i assert len(non_mandatory_tours) == extended_tour_counts.sum().sum() if estimator: - # make sure they created the right tours survey_tours = estimation.manager.get_survey_table("tours").sort_index() non_mandatory_survey_tours = survey_tours[ @@ -374,12 +372,12 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i assert not tours_differ.any() - pipeline.extend_table("tours", non_mandatory_tours) + whale.extend_table("tours", non_mandatory_tours) tracing.register_traceable_table("tours", non_mandatory_tours) - pipeline.get_rn_generator().add_channel("tours", non_mandatory_tours) + whale.get_rn_generator().add_channel("tours", non_mandatory_tours) - if pipeline.is_table("school_escort_tours"): + if whale.is_table("school_escort_tours"): # need to re-compute tour frequency statistics to account for school escort tours recompute_tour_count_statistics() @@ -387,12 +385,13 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i 
annotate.annotate_tours(model_settings, trace_label) expressions.assign_columns( + whale, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=trace_label, ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "non_mandatory_tour_frequency", diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 3b10f32b4..79b4ba8e3 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -10,16 +10,14 @@ expressions, inject, logit, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.tracing import print_elapsed_time from activitysim.core.util import assign_in_place -from .util import estimation - logger = logging.getLogger(__name__) NO_DESTINATION = -1 @@ -81,7 +79,6 @@ def wrap_skims(model_settings): def get_spec_for_segment(model_settings, spec_name, segment): - omnibus_spec = simulate.read_model_spec(file_name=model_settings[spec_name]) spec = omnibus_spec[[segment]] @@ -94,6 +91,7 @@ def get_spec_for_segment(model_settings, spec_name, segment): def parking_destination_simulate( + whale: workflow.Whale, segment_name, trips, destination_sample, @@ -117,7 +115,7 @@ def parking_destination_simulate( spec = get_spec_for_segment(model_settings, "SPECIFICATION", segment_name) coefficients_df = simulate.read_model_coefficients(model_settings) - spec = simulate.eval_coefficients(spec, coefficients_df, None) + spec = simulate.eval_coefficients(whale, spec, coefficients_df, None) alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] @@ -165,7 +163,6 @@ def choose_parking_location( trace_hh_id, trace_label, ): - logger.info("choose_parking_location %s with %d trips", trace_label, trips.shape[0]) t0 = print_elapsed_time() @@ -211,7 +208,6 @@ def run_parking_destination( trace_label, fail_some_trips_for_testing=False, ): - chooser_filter_column = model_settings.get("CHOOSER_FILTER_COLUMN_NAME") chooser_segment_column = model_settings.get("CHOOSER_SEGMENT_COLUMN_NAME") @@ -279,9 +275,15 @@ def run_parking_destination( return trips[parking_location_column_name], save_sample_df -@inject.step() +@workflow.step def parking_location( - trips, trips_merged, land_use, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, + trips, + trips_merged, + land_use, + network_los, + chunk_size, + trace_hh_id, ): """ Given a set of trips, each trip needs to have a parking location if @@ -322,6 +324,7 @@ def parking_location( if preprocessor_settings: expressions.assign_columns( + whale, df=trips_merged_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -339,7 +342,7 @@ def parking_location( assign_in_place(trips_df, parking_locations.to_frame(alt_destination_col_name)) - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) if trace_hh_id: tracing.trace_df( @@ -363,6 +366,6 @@ def parking_location( ) # lest they try to put tour samples into the same table - if pipeline.is_table(sample_table_name): + if whale.is_table(sample_table_name): raise RuntimeError("sample table %s already exists" % sample_table_name) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) diff --git a/activitysim/abm/models/school_escorting.py b/activitysim/abm/models/school_escorting.py index 1b5a97fc9..aef7d84ac 100644 --- 
a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -5,12 +5,11 @@ import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import estimation, school_escort_tours_trips +from activitysim.core import config, expressions, inject, simulate, tracing, workflow from activitysim.core.interaction_simulate import interaction_simulate from activitysim.core.util import reindex -from .util import estimation, school_escort_tours_trips - logger = logging.getLogger(__name__) # setting global defaults for max number of escortees and escortees in model @@ -326,9 +325,15 @@ def create_school_escorting_bundles_table(choosers, tours, stage): return bundles -@inject.step() +@workflow.step def school_escorting( - households, households_merged, persons, tours, chunk_size, trace_hh_id + whale: workflow.Whale, + households, + households_merged, + persons, + tours, + chunk_size, + trace_hh_id, ): """ school escorting model @@ -362,7 +367,7 @@ def school_escorting( households_merged = households_merged.to_frame() tours = tours.to_frame() - alts = simulate.read_model_alts(model_settings["ALTS"], set_index="Alt") + alts = simulate.read_model_alts(whale, model_settings["ALTS"], set_index="Alt") households_merged, participant_columns = determine_escorting_participants( households_merged, persons, model_settings @@ -388,7 +393,7 @@ def school_escorting( file_name=model_settings[stage.upper() + "_COEFFICIENTS"] ) model_spec = simulate.eval_coefficients( - model_spec_raw, coefficients_df, estimator + whale, model_spec_raw, coefficients_df, estimator ) # allow for skipping sharrow entirely in this model with `sharrow_skip: true` @@ -426,6 +431,7 @@ def school_escorting( preprocessor_settings = model_settings.get("preprocessor_" + stage, None) if preprocessor_settings: expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -441,6 +447,7 @@ def school_escorting( log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_simulate( + whale, choosers=choosers, alternatives=alts, spec=model_spec, @@ -514,14 +521,14 @@ def school_escorting( ) # update pipeline - pipeline.replace_table("households", households) - pipeline.replace_table("tours", tours) - pipeline.get_rn_generator().drop_channel("tours") - pipeline.get_rn_generator().add_channel("tours", tours) - pipeline.replace_table("escort_bundles", escort_bundles) + whale.add_table("households", households) + whale.add_table("tours", tours) + whale.get_rn_generator().drop_channel("tours") + whale.get_rn_generator().add_channel("tours", tours) + whale.add_table("escort_bundles", escort_bundles) # save school escorting tours and trips in pipeline so we can overwrite results from downstream models - pipeline.replace_table("school_escort_tours", school_escort_tours) - pipeline.replace_table("school_escort_trips", school_escort_trips) + whale.add_table("school_escort_tours", school_escort_tours) + whale.add_table("school_escort_trips", school_escort_trips) # updating timetable object with pure escort tours so joint tours do not schedule ontop timetable = inject.get_injectable("timetable") diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 94a208075..0d89c7677 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -2,21 +2,24 @@ # See full license in 
LICENSE.txt. import logging -import numpy as np import pandas as pd -from activitysim.abm.models.util import school_escort_tours_trips -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing -from activitysim.core.util import assign_in_place, reindex - -from .util import estimation, trip +from activitysim.abm.models.util import estimation, school_escort_tours_trips, trip +from activitysim.core import config, expressions, simulate, tracing, workflow +from activitysim.core.util import assign_in_place logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def stop_frequency( - tours, tours_merged, stop_frequency_alts, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, + tours, + tours_merged, + stop_frequency_alts, + network_los, + chunk_size, + trace_hh_id, ): """ stop frequency model @@ -61,7 +64,6 @@ def stop_frequency( # - run preprocessor to annotate tours_merged preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - # hack: preprocessor adds origin column in place if it does not exist already assert "origin" in tours_merged assert "destination" in tours_merged @@ -102,7 +104,6 @@ def stop_frequency( choices_list = [] for segment_settings in spec_segments: - segment_name = segment_settings[segment_col] segment_value = segment_settings[segment_col] @@ -130,7 +131,7 @@ def stop_frequency( file_name=coefficients_file_name ) segment_spec = simulate.eval_coefficients( - segment_spec, coefficients_df, estimator + whale, segment_spec, coefficients_df, estimator ) if estimator: @@ -180,13 +181,13 @@ def stop_frequency( # if not already there, then it will have been added by stop_freq_annotate_tours_preprocessor assign_in_place(tours, tours_merged[["primary_purpose"]]) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) # create trips table trips = trip.initialize_from_tours(tours, stop_frequency_alts) - pipeline.replace_table("trips", trips) - tracing.register_traceable_table("trips", trips) - pipeline.get_rn_generator().add_channel("trips", trips) + whale.add_table("trips", trips) + tracing.register_traceable_table(whale, "trips", trips) + whale.get_rn_generator().add_channel("trips", trips) if estimator: # make sure they created trips with the expected tour_ids @@ -236,5 +237,5 @@ def stop_frequency( columns=None, ) - if pipeline.is_table("school_escort_trips"): - school_escort_tours_trips.merge_school_escort_trips_into_pipeline() + if whale.is_table("school_escort_trips"): + school_escort_tours_trips.merge_school_escort_trips_into_pipeline(whale) diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index d4479cfb4..56a790c3a 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -6,10 +6,8 @@ import numpy as np import pandas as pd -from activitysim.abm.models.trip_matrices import annotate_trips -from activitysim.core import config, expressions, inject, pipeline - -from ...core.los import Network_LOS +from activitysim.core import config, expressions, workflow +from activitysim.core.los import Network_LOS logger = logging.getLogger(__name__) @@ -200,8 +198,9 @@ def manual_breaks( return bins -@inject.step() +@workflow.step def summarize( + whale: workflow.Whale, network_los: Network_LOS, persons: pd.DataFrame, persons_merged: pd.DataFrame, @@ -234,7 +233,8 @@ def summarize( os.makedirs(config.output_file_path(output_location), exist_ok=True) spec = pd.read_csv( - 
config.config_file_path(model_settings["SPECIFICATION"]), comment="#" + whale.filesystem.get_config_file_path(model_settings["SPECIFICATION"]), + comment="#", ) # Load dataframes from pipeline @@ -279,7 +279,6 @@ def summarize( for table_name, df in locals_d.items(): if table_name in model_settings: - meta = model_settings[table_name] df = eval(table_name) @@ -337,13 +336,11 @@ def summarize( ) for i, row in spec.iterrows(): - out_file = row["Output"] expr = row["Expression"] # Save temporary variables starting with underscores in locals_d if out_file.startswith("_"): - logger.debug(f"Temp Variable: {expr} -> {out_file}") locals_d[out_file] = eval(expr, globals(), locals_d) diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index cc1eec489..5e52c4b37 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -5,13 +5,15 @@ import pandas as pd from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger("activitysim") -@inject.step() -def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def telecommute_frequency( + whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id +): """ This model predicts the frequency of telecommute for a person (worker) who does not works from home. The alternatives of this model are 'No Telecommute', @@ -36,12 +38,12 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -50,7 +52,9 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -86,7 +90,7 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): choices.reindex(persons.index).fillna("").astype(str) ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "telecommute_frequency", persons.telecommute_frequency, value_counts=True diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index 1e826ae91..b3b91fc79 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -6,22 +6,16 @@ import pandas as pd from orca import orca -from activitysim.core import ( - config, - expressions, - inject, - logit, - los, - pipeline, - simulate, - tracing, +from activitysim.abm.models.util import ( + annotate, + estimation, + school_escort_tours_trips, + trip, ) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate +from activitysim.core import config, inject, logit, los, simulate, tracing, workflow from 
activitysim.core.util import assign_in_place, reindex -from .util import estimation, trip, annotate, school_escort_tours_trips -from .util.mode import run_tour_mode_choice_simulate - logger = logging.getLogger(__name__) """ @@ -129,8 +123,9 @@ def append_tour_leg_trip_mode_choice_logsums(tours): return tours +@workflow.func def get_trip_mc_logsums_for_all_modes( - tours, segment_column_name, model_settings, trace_label + whale: workflow.Whale, tours, segment_column_name, model_settings, trace_label ): """Creates pseudo-trips from tours and runs trip mode choice to get logsums @@ -154,9 +149,9 @@ def get_trip_mc_logsums_for_all_modes( ) # temporarily register trips in the pipeline - pipeline.replace_table("trips", logsum_trips) + whale.add_table("trips", logsum_trips) tracing.register_traceable_table("trips", logsum_trips) - pipeline.get_rn_generator().add_channel("trips", logsum_trips) + whale.get_rn_generator().add_channel("trips", logsum_trips) # run trip mode choice on pseudo-trips. use orca instead of pipeline to # execute the step because pipeline can only handle one open step at a time @@ -166,15 +161,15 @@ def get_trip_mc_logsums_for_all_modes( tours = append_tour_leg_trip_mode_choice_logsums(tours) # de-register logsum trips table - pipeline.get_rn_generator().drop_channel("trips") + whale.get_rn_generator().drop_channel("trips") tracing.deregister_traceable_table("trips") return tours -@inject.step() +@workflow.step def tour_mode_choice_simulate( - tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id ): """ Tour mode choice simulate @@ -278,7 +273,7 @@ def tour_mode_choice_simulate( # don't create estimation data bundle if trip mode choice is being called # from another model step (i.e. 
tour mode choice logsum creation) - if pipeline.get_rn_generator().step_name != "tour_mode_choice_simulate": + if whale.get_rn_generator().step_name != "tour_mode_choice_simulate": estimator = None else: estimator = estimation.manager.begin_estimation("tour_mode_choice") @@ -311,7 +306,6 @@ def tour_mode_choice_simulate( for tour_purpose, tours_segment in primary_tours_merged.groupby( segment_column_name ): - logger.info( "tour_mode_choice_simulate tour_type '%s' (%s tours)" % ( @@ -354,22 +348,18 @@ def tour_mode_choice_simulate( # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") if tvpb_mode_path_types is not None: for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip( ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] ): - path_type = path_types[direction] skim_cache = skim.cache[path_type] print(f"mode {mode} direction {direction} path_type {path_type}") for c in skim_cache: - dest_col = f"{direction}_{c}" if dest_col not in choices_df: @@ -403,7 +393,7 @@ def tour_mode_choice_simulate( all_tours = tours.to_frame() assign_in_place(all_tours, choices_df) - if pipeline.is_table("school_escort_tours") & model_settings.get( + if whale.is_table("school_escort_tours") & model_settings.get( "FORCE_ESCORTEE_CHAUFFEUR_MODE_MATCH", True ): all_tours = ( @@ -412,7 +402,7 @@ def tour_mode_choice_simulate( ) ) - pipeline.replace_table("tours", all_tours) + whale.add_table("tours", all_tours) # - annotate tours table if model_settings.get("annotate_tours"): diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index 0825a21ea..6dd9fb9a4 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -4,19 +4,23 @@ import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing -from activitysim.core.util import assign_in_place - -from .util import estimation, tour_od +from activitysim.abm.models.util import estimation, tour_od +from activitysim.core import config, inject, tracing, workflow logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def tour_od_choice( - tours, persons, households, land_use, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, + tours, + persons, + households, + land_use, + network_los, + chunk_size, + trace_hh_id, ): - """Simulates joint origin/destination choice for all tours. 
Given a set of previously generated tours, each tour needs to have an @@ -82,6 +86,7 @@ def tour_od_choice( estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_od.run_tour_od( + whale, tours, persons, want_logsums, @@ -134,13 +139,13 @@ def tour_od_choice( households["home_zone_id"] = households[origin_col_name] persons["home_zone_id"] = persons[origin_col_name] - pipeline.replace_table("tours", tours) - pipeline.replace_table("persons", persons) - pipeline.replace_table("households", households) + whale.add_table("tours", tours) + whale.add_table("persons", persons) + whale.add_table("households", households) if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index 89fb41676..3de0357fe 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -3,19 +3,17 @@ import logging -import numpy as np import pandas as pd from activitysim.abm.models.util import estimation -from activitysim.core import chunk, config, inject, logit, pipeline, tracing -from activitysim.core.util import reindex - -from .util import probabilistic_scheduling as ps +from activitysim.abm.models.util import probabilistic_scheduling as ps +from activitysim.core import chunk, config, workflow logger = logging.getLogger(__name__) def run_tour_scheduling_probabilistic( + whale: workflow.Whale, tours_df, scheduling_probs, probs_join_cols, @@ -51,9 +49,10 @@ def run_tour_scheduling_probabilistic( """ result_list = [] for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - tours_df, chunk_size, trace_label, trace_label + whale, tours_df, chunk_size, trace_label, trace_label ): choices = ps.make_scheduling_choices( + whale, chooser_chunk, "departure", scheduling_probs, @@ -72,8 +71,10 @@ def run_tour_scheduling_probabilistic( return choices -@inject.step() -def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): +@workflow.step +def tour_scheduling_probabilistic( + whale: workflow.Whale, tours, chunk_size, trace_hh_id +): """Makes tour departure and arrival choices by sampling from a probability lookup table This model samples tour scheduling choices from an exogenously defined probability @@ -96,7 +97,9 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): model_settings_file_name = "tour_scheduling_probabilistic.yaml" model_settings = config.read_model_settings(model_settings_file_name) depart_alt_base = model_settings.get("depart_alt_base", 0) - scheduling_probs_filepath = config.config_file_path(model_settings["PROBS_SPEC"]) + scheduling_probs_filepath = whale.filesystem.get_config_file_path( + model_settings["PROBS_SPEC"] + ) scheduling_probs = pd.read_csv(scheduling_probs_filepath) probs_join_cols = model_settings["PROBS_JOIN_COLS"] tours_df = tours.to_frame() @@ -111,6 +114,7 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): estimator.write_choosers(tours_df[chooser_cols_for_estimation]) choices = run_tour_scheduling_probabilistic( + whale, tours_df, scheduling_probs, probs_join_cols, @@ -150,4 +154,4 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): assert not 
tours_df["end"].isnull().any() assert not tours_df["duration"].isnull().any() - pipeline.replace_table("tours", tours_df) + whale.add_table("tours", tours_df) diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index 6507ab825..c9728b8eb 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -5,13 +5,15 @@ import numpy as np from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger("activitysim") -@inject.step() -def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def transit_pass_ownership( + whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id +): """ Transit pass ownership model. """ @@ -30,12 +32,12 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -44,7 +46,9 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -76,7 +80,7 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): persons = persons.to_frame() persons["transit_pass_ownership"] = choices.reindex(persons.index) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "transit_pass_ownership", persons.transit_pass_ownership, value_counts=True diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index 4e513a661..4a07a803a 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -5,13 +5,15 @@ import numpy as np from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger("activitysim") -@inject.step() -def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def transit_pass_subsidy( + whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id +): """ Transit pass subsidy model. 
""" @@ -30,12 +32,12 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -44,7 +46,9 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -76,7 +80,7 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): persons = persons.to_frame() persons["transit_pass_subsidy"] = choices.reindex(persons.index) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary( "transit_pass_subsidy", persons.transit_pass_subsidy, value_counts=True diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 2dad8d37c..38cb58e78 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -8,12 +8,11 @@ chunk, config, expressions, - inject, interaction_simulate, logit, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.simulate import set_skim_wrapper_targets from activitysim.core.util import reindex @@ -164,7 +163,6 @@ def build_patterns(trips, time_windows): def get_spec_for_segment(omnibus_spec, segment): - spec = omnibus_spec[[segment]] # might as well ignore any spec rows with 0 utility @@ -174,9 +172,11 @@ def get_spec_for_segment(omnibus_spec, segment): return spec -def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_label"): +def choose_tour_leg_pattern( + whale, trip_segment, patterns, spec, trace_label="trace_label" +): alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index() - have_trace_targets = tracing.has_trace_targets(trip_segment) + have_trace_targets = tracing.has_trace_targets(whale, trip_segment) if have_trace_targets: tracing.trace_df( @@ -220,7 +220,7 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( - spec, interaction_df, None, trace_label, trace_rows, estimator=None + whale, spec, interaction_df, None, trace_label, trace_rows, estimator=None ) interaction_utilities = pd.concat( @@ -335,7 +335,7 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=trip_segment + whale, probs, trace_label=trace_label, trace_choosers=trip_segment ) chunk.log_df(trace_label, "positions", positions) @@ -371,8 +371,7 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab return choices -def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): - +def apply_stage_two_model(whale, omnibus_spec, trips, 
chunk_size, trace_label): if not trips.index.is_monotonic: trips = trips.sort_index() @@ -429,7 +428,6 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): ) in chunk.adaptive_chunked_choosers_by_chunk_id( side_trips, chunk_size, trace_label ): - for is_outbound, trip_segment in chooser_chunk.groupby(OUTBOUND): direction = OUTBOUND if is_outbound else "inbound" spec = get_spec_for_segment(omnibus_spec, direction) @@ -438,7 +436,7 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): patterns = build_patterns(trip_segment, time_windows) choices = choose_tour_leg_pattern( - trip_segment, patterns, spec, trace_label=segment_trace_label + whale, trip_segment, patterns, spec, trace_label=segment_trace_label ) choices = pd.merge( @@ -466,9 +464,10 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): return trips["depart"].astype(int) -@inject.step() -def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_id): - +@workflow.step +def trip_departure_choice( + whale: workflow.Whale, trips, trips_merged, skim_dict, chunk_size, trace_hh_id +): trace_label = "trip_departure_choice" model_settings = config.read_model_settings("trip_departure_choice.yaml") @@ -490,7 +489,7 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i preprocessor_settings = model_settings.get("PREPROCESSOR", None) tour_legs = get_tour_legs(trips_merged_df) - pipeline.get_rn_generator().add_channel("tour_legs", tour_legs) + whale.get_rn_generator().add_channel("tour_legs", tour_legs) if preprocessor_settings: od_skim = skim_dict.wrap("origin", "destination") @@ -508,13 +507,16 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i ) expressions.assign_columns( + whale, df=trips_merged_df, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label) + choices = apply_stage_two_model( + whale, spec, trips_merged_df, chunk_size, trace_label + ) trips_df = trips.to_frame() trip_length = len(trips_df) @@ -522,4 +524,4 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i assert len(trips_df) == trip_length assert trips_df[trips_df["depart"].isnull()].empty - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 545cfee29..6a69eb025 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -7,6 +7,10 @@ import numpy as np import pandas as pd +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.abm.models.util.trip import ( cleanup_failed_trips, flag_failed_trip_leg_mates, @@ -18,20 +22,17 @@ expressions, inject, los, - pipeline, simulate, tracing, + workflow, ) +from activitysim.core.configuration.base import Any, PydanticBase from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.skim_dictionary import DataFrameMatrix from activitysim.core.tracing import print_elapsed_time from activitysim.core.util import assign_in_place, reindex -from ...core.configuration.base import Any, PydanticBase -from .util.school_escort_tours_trips import 
split_out_school_escorting_trips -from .util import estimation - logger = logging.getLogger(__name__) NO_DESTINATION = -1 @@ -74,7 +75,9 @@ class TripDestinationSettings(PydanticBase): """This setting is used by testing code to force failed trip_destination.""" +@workflow.func def _destination_sample( + whale: workflow.Whale, primary_purpose, trips, alternatives, @@ -105,6 +108,7 @@ def _destination_sample( """ spec = simulate.spec_for_segment( + whale, model_settings, spec_id="DESTINATION_SAMPLE_SPEC", segment_name=primary_purpose, @@ -112,7 +116,7 @@ def _destination_sample( ) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if whale.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -138,9 +142,10 @@ def _destination_sample( ) locals_dict.update(skims) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample( + whale, choosers=trips, alternatives=alternatives, sample_size=sample_size, @@ -159,7 +164,9 @@ def _destination_sample( return choices +@workflow.func def destination_sample( + whale: workflow.Whale, primary_purpose, trips, alternatives, @@ -170,13 +177,13 @@ def destination_sample( chunk_size, trace_label, ): - chunk_tag = "trip_destination.sample" skims = skim_hotel.sample_skims(presample=False) alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( + whale, primary_purpose, trips, alternatives, @@ -194,7 +201,6 @@ def destination_sample( def aggregate_size_term_matrix(maz_size_term_matrix, network_los): - df = maz_size_term_matrix.df assert ALT_DEST_TAZ not in df @@ -207,7 +213,13 @@ def aggregate_size_term_matrix(maz_size_term_matrix, network_los): def choose_MAZ_for_TAZ( - taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label + whale, + taz_sample, + MAZ_size_terms, + trips, + network_los, + alt_dest_col_name, + trace_label, ): """ Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ @@ -236,12 +248,12 @@ def choose_MAZ_for_TAZ( taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True) trace_hh_id = inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") # write taz choices, pick_counts, probs - trace_targets = tracing.trace_targets(taz_sample) + trace_targets = tracing.trace_targets(whale, taz_sample) tracing.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), @@ -298,7 +310,8 @@ def choose_MAZ_for_TAZ( # (preserve index, which will have duplicates as result of join) maz_taz = ( - network_los.get_maz_to_taz_series.rename(DEST_TAZ) + network_los.get_maz_to_taz_series(whale) + .rename(DEST_TAZ) .rename_axis(index=DEST_MAZ) .to_frame() .reset_index() @@ -358,7 +371,7 @@ def choose_MAZ_for_TAZ( assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) rands = ( - pipeline.get_rn_generator() + whale.get_rn_generator() .random_for_df(chooser_df, n=taz_sample_size) .reshape(-1, 1) ) @@ -378,7 +391,6 @@ def choose_MAZ_for_TAZ( taz_choices["prob"] = taz_choices["TAZ_prob"] * 
taz_choices["MAZ_prob"] if have_trace_targets: - taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer="trip_id") trace_taz_choices_df = taz_choices[taz_choices_trace_targets] tracing.trace_df( @@ -453,7 +465,9 @@ def choose_MAZ_for_TAZ( return taz_choices +@workflow.func def destination_presample( + whale: workflow.Whale, primary_purpose, trips, alternatives, @@ -466,7 +480,6 @@ def destination_presample( trace_hh_id, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") chunk_tag = "trip_destination.presample" # distinguish from trip_destination.sample @@ -494,6 +507,7 @@ def destination_presample( skims = skim_hotel.sample_skims(presample=True) taz_sample = _destination_sample( + whale, primary_purpose, trips_taz, alternatives, @@ -510,7 +524,13 @@ def destination_presample( # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total maz_sample = choose_MAZ_for_TAZ( - taz_sample, size_term_matrix, trips, network_los, alt_dest_col_name, trace_label + whale, + taz_sample, + size_term_matrix, + trips, + network_los, + alt_dest_col_name, + trace_label, ) assert alt_dest_col_name in maz_sample @@ -519,6 +539,7 @@ def destination_presample( def trip_destination_sample( + whale: workflow.Whale, primary_purpose, trips, alternatives, @@ -554,7 +575,7 @@ def trip_destination_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file network_los = inject.get_injectable("network_los") pre_sample_taz = network_los.zone_system != los.ONE_ZONE - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not whale.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -562,7 +583,6 @@ def trip_destination_sample( ) if pre_sample_taz: - logger.info( "Running %s trip_destination_presample with %d trips" % (trace_label, len(trips)) @@ -598,7 +618,9 @@ def trip_destination_sample( return choices +@workflow.func def compute_ood_logsums( + whale: workflow.Whale, choosers, logsum_settings, nest_spec, @@ -623,13 +645,16 @@ def compute_ood_logsums( # in `chunk.chunk_log()` at chunk.py L927. 
To avoid failing this assertion, # the preprocessor must be called from within a "null chunker" as follows: with chunk.chunk_log( - tracing.extend_trace_label(trace_label, "annotate_preprocessor"), base=True + tracing.extend_trace_label(trace_label, "annotate_preprocessor"), + base=True, + settings=whale.settings, ): expressions.annotate_preprocessors( choosers, locals_dict, od_skims, logsum_settings, trace_label ) logsums = simulate.simple_simulate_logsums( + whale, choosers, logsum_spec, nest_spec, @@ -649,6 +674,7 @@ def compute_ood_logsums( def compute_logsums( + whale, primary_purpose, trips, destination_sample, @@ -700,8 +726,10 @@ def compute_logsums( nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) - logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) + logsum_spec = simulate.read_model_spec(whale, file_name=logsum_settings["SPEC"]) + logsum_spec = simulate.eval_coefficients( + whale, logsum_spec, coefficients, estimator=None + ) locals_dict = {} locals_dict.update(config.get_model_constants(logsum_settings)) @@ -777,6 +805,7 @@ def compute_logsums( def trip_destination_simulate( + whale: workflow.Whale, primary_purpose, trips, destination_sample, @@ -833,7 +862,7 @@ def trip_destination_simulate( ) locals_dict.update(skims) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers destinations = interaction_sample_simulate( choosers=trips, alternatives=destination_sample, @@ -874,7 +903,9 @@ def trip_destination_simulate( return destinations +@workflow.func def choose_trip_destination( + whale: workflow.Whale, primary_purpose, trips, alternatives, @@ -889,13 +920,13 @@ def choose_trip_destination( trace_hh_id, trace_label, ): - logger.info("choose_trip_destination %s with %d trips", trace_label, trips.shape[0]) t0 = print_elapsed_time() # - trip_destination_sample destination_sample = trip_destination_sample( + whale, primary_purpose=primary_purpose, trips=trips, alternatives=alternatives, @@ -924,6 +955,7 @@ def choose_trip_destination( # - compute logsums destination_sample = compute_logsums( + whale, primary_purpose=primary_purpose, trips=trips, destination_sample=destination_sample, @@ -938,6 +970,7 @@ def choose_trip_destination( # - trip_destination_simulate destinations = trip_destination_simulate( + whale, primary_purpose=primary_purpose, trips=trips, destination_sample=destination_sample, @@ -974,14 +1007,12 @@ def choose_trip_destination( class SkimHotel(object): def __init__(self, model_settings, network_los, trace_label): - self.model_settings = model_settings self.trace_label = tracing.extend_trace_label(trace_label, "skim_hotel") self.network_los = network_los self.zone_system = network_los.zone_system def sample_skims(self, presample): - o = self.model_settings["TRIP_ORIGIN"] d = self.model_settings["ALT_DEST_COL_NAME"] n = self.model_settings.get("PRIMARY_ORIGIN", "origin") @@ -1027,7 +1058,6 @@ def sample_skims(self, presample): return skims def logsum_skims(self): - o = self.model_settings["TRIP_ORIGIN"] d = self.model_settings["ALT_DEST_COL_NAME"] p = self.model_settings["PRIMARY_DEST"] @@ -1099,7 +1129,9 @@ def logsum_skims(self): return skims +@workflow.func def run_trip_destination( + whale: workflow.Whale, trips, tours_merged, estimator, @@ -1141,8 +1173,7 @@ def run_trip_destination( sample_table_name = 
model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) land_use = inject.get_table("land_use") @@ -1162,7 +1193,7 @@ def run_trip_destination( # stop_frequency step calls trip.initialize_from_tours. But if this module is being # called from trip_destination_and_purpose, these columns will have been deleted # so they must be re-created - if pipeline.get_rn_generator().step_name == "trip_purpose_and_destination": + if whale.get_rn_generator().step_name == "trip_purpose_and_destination": trips["destination"] = np.where(trips.outbound, tour_destination, tour_origin) trips["origin"] = np.where(trips.outbound, tour_origin, tour_destination) trips["failed"] = False @@ -1231,13 +1262,11 @@ def run_trip_destination( # - process intermediate trips in ascending trip_num order intermediate = trips.trip_num < trips.trip_count if intermediate.any(): - first_trip_num = trips[intermediate].trip_num.min() last_trip_num = trips[intermediate].trip_num.max() # iterate over trips in ascending trip_num order for trip_num in range(first_trip_num, last_trip_num + 1): - nth_trips = trips[intermediate & (trips.trip_num == trip_num)] nth_trace_label = tracing.extend_trace_label( trace_label, "trip_num_%s" % trip_num @@ -1252,6 +1281,7 @@ def run_trip_destination( # - annotate nth_trips if preprocessor_settings: expressions.assign_columns( + whale, df=nth_trips, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -1273,6 +1303,7 @@ def run_trip_destination( choices_list = [] for primary_purpose, trips_segment in nth_trips.groupby("primary_purpose"): choices, destination_sample = choose_trip_destination( + whale, primary_purpose, trips_segment, alternatives, @@ -1348,8 +1379,10 @@ def run_trip_destination( return trips, save_sample_df -@inject.step() -def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): +@workflow.step +def trip_destination( + whale: workflow.Whale, trips, tours_merged, chunk_size, trace_hh_id +): """ Choose a destination for all intermediate trips based on trip purpose. 
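The hunks in this file replace string-keyed lookups such as config.setting("want_dest_choice_sample_tables") with attribute access on whale.settings, and replace module-level pipeline calls with methods on the whale object that is threaded through every helper. The toy below sketches that explicit state-passing pattern in plain Python; the ToySettings and ToyState names and fields are invented for illustration and are not the ActivitySim workflow API.

from dataclasses import dataclass, field

import pandas as pd


@dataclass
class ToySettings:
    # typed fields instead of config.setting("...") string lookups
    want_dest_choice_sample_tables: bool = False
    log_alt_losers: bool = False


@dataclass
class ToyState:
    settings: ToySettings = field(default_factory=ToySettings)
    _tables: dict = field(default_factory=dict)

    def add_table(self, name: str, df: pd.DataFrame) -> None:
        # stores/replaces the named table, like pipeline.replace_table did globally
        self._tables[name] = df

    def get_dataframe(self, name: str) -> pd.DataFrame:
        return self._tables[name]

    def is_table(self, name: str) -> bool:
        return name in self._tables


def toy_step(state: ToyState) -> None:
    trips = pd.DataFrame({"destination": [3, 7]})
    if state.settings.want_dest_choice_sample_tables:
        print("would also save a sample table")
    state.add_table("trips", trips)


state = ToyState(ToySettings(want_dest_choice_sample_tables=True))
toy_step(state)
print(state.get_dataframe("trips"))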
@@ -1388,8 +1421,8 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df = trips.to_frame() tours_merged_df = tours_merged.to_frame() - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if whale.is_table("school_escort_trips"): + school_escort_trips = whale.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -1413,6 +1446,7 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) trips_df, save_sample_df = run_trip_destination( + whale, trips_df, tours_merged_df, estimator=estimator, @@ -1454,7 +1488,6 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): assert not trips_df.failed.any() if CLEANUP: - if trips_df.failed.any(): flag_failed_trip_leg_mates(trips_df, "failed") @@ -1467,7 +1500,7 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df.drop(columns="failed", inplace=True, errors="ignore") - if pipeline.is_table("school_escort_trips"): + if whale.is_table("school_escort_trips"): # setting destination for school escort trips se_trips_df["destination"] = reindex( school_escort_trips.destination, se_trips_df.index @@ -1484,7 +1517,7 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df.groupby("tour_id")["destination"].shift(), ).astype(int) - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) if trace_hh_id: tracing.trace_df( @@ -1511,6 +1544,6 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): ) # lest they try to put tour samples into the same table - if pipeline.is_table(sample_table_name): + if whale.is_table(sample_table_name): raise RuntimeError("sample table %s already exists" % sample_table_name) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) diff --git a/activitysim/abm/models/trip_matrices.py b/activitysim/abm/models/trip_matrices.py index 0c9e1f447..683c3e108 100644 --- a/activitysim/abm/models/trip_matrices.py +++ b/activitysim/abm/models/trip_matrices.py @@ -7,13 +7,13 @@ import openmatrix as omx import pandas as pd -from activitysim.core import config, expressions, inject, los, pipeline +from activitysim.core import config, expressions, los, workflow logger = logging.getLogger(__name__) -@inject.step() -def write_trip_matrices(network_los): +@workflow.step +def write_trip_matrices(whale: workflow.Whale, network_los): """ Write trip matrices step. 
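write_trip_matrices, shown here, aggregates the trips table by origin and destination zone (mapping MAZs up to TAZs through the land_use table in the two- and three-zone cases) and writes the summed values out against the full zone index taken from land_use. The snippet below is a minimal pandas sketch of that groupby-and-reindex aggregation; the zone numbers, weight column, and values are invented for the example, and this is not the OMX-writing code itself.

import pandas as pd

trips = pd.DataFrame(
    {
        "origin": [1, 1, 2, 3],
        "destination": [2, 2, 3, 1],
        "sample_weight": [2.0, 2.0, 2.0, 2.0],  # hypothetical expansion weight
    }
)

zones = pd.Index([1, 2, 3], name="zone")  # stands in for the land_use index

# sum weights by O-D pair (mirrors trips_df.groupby([...]).sum())
aggregate_trips = trips.groupby(["origin", "destination"], sort=False)[
    "sample_weight"
].sum()

# unstack to a dense square matrix and reindex against the full zone system,
# so zone pairs with no trips still appear with zero values
matrix = aggregate_trips.unstack(fill_value=0.0).reindex(
    index=zones, columns=zones, fill_value=0.0
)
print(matrix)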
@@ -32,7 +32,7 @@ def write_trip_matrices(network_los): """ - trips = inject.get_table("trips", None) + trips = whale.get("trips", None) if trips is None: # this step is a NOP if there is no trips table # this might legitimately happen if they comment out some steps to debug but still want write_tables @@ -43,12 +43,12 @@ def write_trip_matrices(network_los): return model_settings = config.read_model_settings("write_trip_matrices.yaml") - trips_df = annotate_trips(trips, network_los, model_settings) + trips_df = annotate_trips(whale, trips, network_los, model_settings) if bool(model_settings.get("SAVE_TRIPS_TABLE")): - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) - if "parking_location" in config.setting("models"): + if "parking_location" in whale.settings.models: parking_settings = config.read_model_settings("parking_location_choice.yaml") parking_taz_col_name = parking_settings["ALT_DEST_COL_NAME"] if parking_taz_col_name in trips_df: @@ -78,7 +78,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("destination") # use the land use table for the set of possible tazs - land_use = pipeline.get_table("land_use") + land_use = whale.get_dataframe("land_use") zone_index = land_use.index assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) @@ -98,10 +98,12 @@ def write_trip_matrices(network_los): elif network_los.zone_system == los.TWO_ZONE: # maz trips written to taz matrices logger.info("aggregating trips two zone...") trips_df["otaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + whale.get_dataframe("land_use").reindex(trips_df["origin"]).TAZ.tolist() ) trips_df["dtaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + whale.get_dataframe("land_use") + .reindex(trips_df["destination"]) + .TAZ.tolist() ) aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum( numeric_only=True @@ -120,7 +122,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("dtaz") try: - land_use_taz = pipeline.get_table("land_use_taz") + land_use_taz = whale.get_dataframe("land_use_taz") except (KeyError, RuntimeError): pass # table missing, ignore else: @@ -142,13 +144,14 @@ def write_trip_matrices(network_los): elif ( network_los.zone_system == los.THREE_ZONE ): # maz trips written to taz and tap matrices - logger.info("aggregating trips three zone taz...") trips_df["otaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + whale.get_dataframe("land_use").reindex(trips_df["origin"]).TAZ.tolist() ) trips_df["dtaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + whale.get_dataframe("land_use") + .reindex(trips_df["destination"]) + .TAZ.tolist() ) aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum( numeric_only=True @@ -167,7 +170,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("dtaz") try: - land_use_taz = pipeline.get_table("land_use_taz") + land_use_taz = whale.get_dataframe("land_use_taz") except (KeyError, RuntimeError): pass # table missing, ignore else: @@ -215,7 +218,10 @@ def write_trip_matrices(network_los): ) -def annotate_trips(trips, network_los, model_settings): +@workflow.func +def annotate_trips( + whale: workflow.Whale, trips: pd.DataFrame, network_los, model_settings +): """ Add columns to local trips table. 
The annotator has access to the origin/destination skims and everything @@ -225,7 +231,7 @@ def annotate_trips(trips, network_los, model_settings): TABLES in the preprocessor settings. """ - trips_df = trips.to_frame() + trips_df = trips trace_label = "trip_matrices" @@ -263,7 +269,7 @@ def annotate_trips(trips, network_los, model_settings): if hh_weight_col and hh_weight_col not in trips_df: logger.info("adding '%s' from households to trips table" % hh_weight_col) - household_weights = pipeline.get_table("households")[hh_weight_col] + household_weights = whale.get_dataframe("households")[hh_weight_col] trips_df[hh_weight_col] = trips_df.household_id.map(household_weights) return trips_df diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index 54586a448..edcaf4695 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -6,28 +6,27 @@ import numpy as np import pandas as pd +from activitysim.abm.models.util import annotate, estimation, school_escort_tours_trips +from activitysim.abm.models.util.mode import mode_choice_simulate from activitysim.core import ( - assign, chunk, config, expressions, inject, los, - pipeline, simulate, tracing, + workflow, ) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder from activitysim.core.util import assign_in_place -from .util import estimation, annotate, school_escort_tours_trips -from .util.mode import mode_choice_simulate - logger = logging.getLogger(__name__) -@inject.step() -def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): +@workflow.step +def trip_mode_choice( + whale: workflow.Whale, trips, network_los, chunk_size, trace_hh_id +): """ Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip. @@ -152,7 +151,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # don't create estimation data bundle if trip mode choice is being called # from another model step (e.g. tour mode choice logsum creation) - if pipeline._PIPELINE.rng().step_name != "trip_mode_choice": + if whale.current_model_name != "trip_mode_choice": estimator = None else: estimator = estimation.manager.begin_estimation("trip_mode_choice") @@ -162,12 +161,11 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): estimator.write_spec(model_settings) estimator.write_model_settings(model_settings, model_settings_file_name) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) nest_spec = config.get_logit_model_settings(model_settings) choices_list = [] for primary_purpose, trips_segment in trips_merged.groupby("primary_purpose"): - segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose) logger.info( @@ -200,7 +198,9 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # have to initialize chunker for preprocessing in order to access # tvpb logsum terms in preprocessor expressions. 
with chunk.chunk_log( - tracing.extend_trace_label(trace_label, "preprocessing"), base=True + tracing.extend_trace_label(trace_label, "preprocessing"), + base=True, + settings=whale.settings, ): expressions.annotate_preprocessors( trips_segment, locals_dict, skims, model_settings, segment_trace_label @@ -215,7 +215,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): choices = mode_choice_simulate( choosers=trips_segment, - spec=simulate.eval_coefficients(model_spec, coefficients, estimator), + spec=simulate.eval_coefficients(whale, model_spec, coefficients, estimator), nest_spec=simulate.eval_nest_coefficients( nest_spec, coefficients, segment_trace_label ), @@ -255,10 +255,8 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_type in tvpb_mode_path_types.items(): - skim_cache = tvpb_logsum_odt.cache[path_type] for c in skim_cache: @@ -281,7 +279,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): trips_df = trips.to_frame() assign_in_place(trips_df, choices_df) - if pipeline.is_table("school_escort_tours") & model_settings.get( + if whale.is_table("school_escort_tours") & model_settings.get( "FORCE_ESCORTEE_CHAUFFEUR_MODE_MATCH", True ): trips_df = ( @@ -298,10 +296,10 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): assert not trips_df[mode_column_name].isnull().any() - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) if model_settings.get("annotate_trips"): - annotate.annotate_trips(model_settings, trace_label) + annotate.annotate_trips(whale, model_settings, trace_label) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 1e48444e7..0d8dbde23 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -1,23 +1,25 @@ # ActivitySim # See full license in LICENSE.txt. 
import logging + import numpy as np import pandas as pd +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.core import ( chunk, config, expressions, inject, logit, - pipeline, simulate, tracing, + workflow, ) - -from .util import estimation from activitysim.core.util import reindex -from .util.school_escort_tours_trips import split_out_school_escorting_trips logger = logging.getLogger(__name__) @@ -45,6 +47,7 @@ def map_coefficients(spec, coefficients): def choose_intermediate_trip_purpose( + whale: workflow.Whale, trips, probs_spec, estimator, @@ -68,7 +71,7 @@ def choose_intermediate_trip_purpose( purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols] num_trips = len(trips.index) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips) + have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, trips) # probs should sum to 1 across rows sum_probs = probs_spec[purpose_cols].sum(axis=1) @@ -81,7 +84,6 @@ def choose_intermediate_trip_purpose( chunk.log_df(trace_label, "choosers", choosers) if use_depart_time: - # select the matching depart range (this should result on in exactly one chooser row per trip) chooser_probs = (choosers.start >= choosers["depart_range_start"]) & ( choosers.start <= choosers["depart_range_end"] @@ -89,7 +91,6 @@ def choose_intermediate_trip_purpose( # if we failed to match a row in probs_spec if chooser_probs.sum() < num_trips: - # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols missing_trip_ids = trips.index[ ~trips.index.isin(choosers.index[chooser_probs]) @@ -147,7 +148,7 @@ def choose_intermediate_trip_purpose( estimator.write_table(choosers[probs_cols], "probs", append=True) choices, rands = logit.make_choices( - choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers + whale, choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers ) if have_trace_targets: @@ -160,7 +161,9 @@ def choose_intermediate_trip_purpose( return choices -def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): +def run_trip_purpose( + whale: workflow.Whale, trips_df, estimator, chunk_size, trace_hh_id, trace_label +): """ trip purpose - main functionality separated from model step so it can be called iteratively @@ -186,7 +189,9 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): probs_join_cols = model_settings.get("probs_join_cols", PROBS_JOIN_COLUMNS) spec_file_name = model_settings.get("PROBS_SPEC", "trip_purpose_probs.csv") - probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment="#") + probs_spec = pd.read_csv( + whale.filesystem.get_config_file_path(spec_file_name), comment="#" + ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation # coefficients_df = simulate.read_model_coefficients(model_settings) @@ -221,6 +226,7 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): if preprocessor_settings: locals_dict = config.get_model_constants(model_settings) expressions.assign_columns( + whale, df=trips_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -230,9 +236,10 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): use_depart_time = 
model_settings.get("use_depart_time", True) for i, trips_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - trips_df, chunk_size, chunk_tag, trace_label + whale, trips_df, chunk_size, chunk_tag, trace_label ): choices = choose_intermediate_trip_purpose( + whale, trips_chunk, probs_spec, estimator, @@ -252,9 +259,8 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): return choices -@inject.step() -def trip_purpose(trips, chunk_size, trace_hh_id): - +@workflow.step +def trip_purpose(whale: workflow.Whale, trips, chunk_size, trace_hh_id): """ trip purpose model step - calls run_trip_purpose to run the actual model @@ -264,8 +270,8 @@ def trip_purpose(trips, chunk_size, trace_hh_id): trips_df = trips.to_frame() - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if whale.is_table("school_escort_trips"): + school_escort_trips = whale.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -282,6 +288,7 @@ def trip_purpose(trips, chunk_size, trace_hh_id): estimator.write_choosers(trips_df[chooser_cols_for_estimation]) choices = run_trip_purpose( + whale, trips_df, estimator, chunk_size=chunk_size, @@ -299,7 +306,7 @@ def trip_purpose(trips, chunk_size, trace_hh_id): trips_df["purpose"] = choices - if pipeline.is_table("school_escort_trips"): + if whale.is_table("school_escort_trips"): # setting purpose for school escort trips se_trips_df["purpose"] = reindex(school_escort_trips.purpose, se_trips_df.index) # merge trips back together preserving index order @@ -309,7 +316,7 @@ def trip_purpose(trips, chunk_size, trace_hh_id): # we should have assigned a purpose to all trips assert not trips_df.purpose.isnull().any() - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) if trace_hh_id: tracing.trace_df( diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index 31bca977e..5e6130d88 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -6,25 +6,30 @@ from activitysim.abm.models.trip_destination import run_trip_destination from activitysim.abm.models.trip_purpose import run_trip_purpose +from activitysim.abm.models.util import estimation from activitysim.abm.models.util.trip import ( cleanup_failed_trips, flag_failed_trip_leg_mates, ) -from activitysim.core import config, inject, pipeline, tracing +from activitysim.core import config, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation - logger = logging.getLogger(__name__) +@workflow.func def run_trip_purpose_and_destination( - trips_df, tours_merged_df, chunk_size, trace_hh_id, trace_label + whale: workflow.Whale, + trips_df, + tours_merged_df, + chunk_size, + trace_hh_id, + trace_label, ): - assert not trips_df.empty choices = run_trip_purpose( + whale, trips_df, estimator=None, chunk_size=chunk_size, @@ -35,6 +40,7 @@ def run_trip_purpose_and_destination( trips_df["purpose"] = choices trips_df, save_sample_df = run_trip_destination( + whale, trips_df, tours_merged_df, estimator=None, @@ -46,9 +52,10 @@ def run_trip_purpose_and_destination( return trips_df, save_sample_df -@inject.step() -def trip_purpose_and_destination(trips, tours_merged, 
chunk_size, trace_hh_id): - +@workflow.step +def trip_purpose_and_destination( + whale: workflow.Whale, trips, tours_merged, chunk_size, trace_hh_id +): trace_label = "trip_purpose_and_destination" model_settings = config.read_model_settings("trip_purpose_and_destination.yaml") @@ -60,8 +67,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): "DEST_CHOICE_SAMPLE_TABLE_NAME" ) want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) MAX_ITERATIONS = model_settings.get("MAX_ITERATIONS", 5) @@ -79,7 +85,6 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry if "destination" in trips_df: - if "failed" not in trips_df.columns: # trip_destination model cleaned up any failed trips logger.info("%s - no failed column from prior model run." % trace_label) @@ -89,7 +94,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): # 'failed' column but no failed trips from prior run of trip_destination logger.info("%s - no failed trips from prior model run." % trace_label) trips_df.drop(columns="failed", inplace=True) - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) return else: @@ -102,11 +107,11 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): logger.info("Rerunning %s failed trips and leg-mates" % trips_df.shape[0]) # drop any previously saved samples of failed trips - if want_sample_table and pipeline.is_table(sample_table_name): + if want_sample_table and whale.is_table(sample_table_name): logger.info("Dropping any previously saved samples of failed trips") - save_sample_df = pipeline.get_table(sample_table_name) + save_sample_df = whale.get_dataframe(sample_table_name) save_sample_df.drop(trips_df.index, level="trip_id", inplace=True) - pipeline.replace_table(sample_table_name, save_sample_df) + whale.add_table(sample_table_name, save_sample_df) del save_sample_df # if we estimated trip_destination, there should have been no failed trips @@ -122,7 +127,6 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): i = 0 TRIP_RESULT_COLUMNS = ["purpose", "destination", "origin", "failed"] while True: - i += 1 for c in TRIP_RESULT_COLUMNS: @@ -130,6 +134,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): del trips_df[c] trips_df, save_sample_df = run_trip_purpose_and_destination( + whale, trips_df, tours_merged_df, chunk_size=chunk_size, @@ -139,7 +144,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): # # if testing, make sure at least one trip fails if ( - config.setting("testing_fail_trip_destination", False) + whale.settings.testing_fail_trip_destination and (i == 1) and not trips_df.failed.any() ): @@ -202,7 +207,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): logger.info( "adding %s samples to %s" % (len(save_sample_df), sample_table_name) ) - pipeline.extend_table(sample_table_name, save_sample_df) + whale.extend_table(sample_table_name, save_sample_df) logger.info( "%s %s failed trips after %s iterations" @@ -214,14 +219,14 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df = cleanup_failed_trips(trips_df) - pipeline.replace_table("trips", trips_df) + 
whale.add_table("trips", trips_df) # check to make sure we wrote sample file if requestsd if want_sample_table and len(trips_df) > 0: - assert pipeline.is_table(sample_table_name) + assert whale.is_table(sample_table_name) # since we have saved samples for all successful trips # once we discard failed trips, we should samples for all trips - save_sample_df = pipeline.get_table(sample_table_name) + save_sample_df = whale.get_dataframe(sample_table_name) # expect samples only for intermediate trip destinatinos assert len(save_sample_df.index.get_level_values(0).unique()) == len( trips_df[trips_df.trip_num < trips_df.trip_count] diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index f8345650d..6e7f73d71 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -7,13 +7,14 @@ import pandas as pd from activitysim.abm.models.util import estimation +from activitysim.abm.models.util import probabilistic_scheduling as ps +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.abm.models.util.trip import cleanup_failed_trips, failed_trip_cohorts -from activitysim.core import chunk, config, inject, logit, pipeline, tracing +from activitysim.core import chunk, config, tracing, workflow from activitysim.core.util import reindex -from .util.school_escort_tours_trips import split_out_school_escorting_trips -from .util import probabilistic_scheduling as ps - logger = logging.getLogger(__name__) """ @@ -152,6 +153,7 @@ def update_tour_earliest(trips, outbound_choices): def schedule_trips_in_leg( + whale: workflow.Whale, outbound, trips, probs_spec, @@ -164,6 +166,7 @@ def schedule_trips_in_leg( Parameters ---------- + whale outbound trips probs_spec @@ -234,7 +237,6 @@ def schedule_trips_in_leg( first_trip_in_leg = True for i in range(trips.trip_num.min(), trips.trip_num.max() + 1): - if outbound or scheduling_mode == DURATION_MODE: # iterate in ascending trip_num order nth_trips = trips[trips.trip_num == i] @@ -245,6 +247,7 @@ def schedule_trips_in_leg( nth_trace_label = tracing.extend_trace_label(trace_label, "num_%s" % i) choices = ps.make_scheduling_choices( + whale, nth_trips, scheduling_mode, probs_spec, @@ -289,6 +292,7 @@ def schedule_trips_in_leg( def run_trip_scheduling( + whale: workflow.Whale, trips_chunk, tours, probs_spec, @@ -299,7 +303,6 @@ def run_trip_scheduling( trace_hh_id, trace_label, ): - set_tour_hour(trips_chunk, tours) set_stop_num(trips_chunk) @@ -314,6 +317,7 @@ def run_trip_scheduling( leg_chunk = trips_chunk[trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, "outbound") choices = schedule_trips_in_leg( + whale, outbound=True, trips=leg_chunk, probs_spec=probs_spec, @@ -334,6 +338,7 @@ def run_trip_scheduling( leg_chunk = trips_chunk[~trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, "inbound") choices = schedule_trips_in_leg( + whale, outbound=False, trips=leg_chunk, probs_spec=probs_spec, @@ -351,9 +356,8 @@ def run_trip_scheduling( return choices -@inject.step() -def trip_scheduling(trips, tours, chunk_size, trace_hh_id): - +@workflow.step +def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id): """ Trip scheduling assigns depart times for trips within the start, end limits of the tour. 
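run_trip_scheduling and the probabilistic scheduling helpers sample each trip's depart period from an exogenous probability lookup (PROBS_SPEC) joined to the choosers on PROBS_JOIN_COLS, using the pipeline's random number generator. Below is a stripped-down pandas/NumPy sketch of that join-then-sample idea; it is a stand-in for ps.make_scheduling_choices, and the join column, period labels, and probabilities are made up for the example.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)  # stands in for the pipeline-managed RNG channel

# hypothetical probability lookup: one row per purpose, one column per depart hour
probs_spec = pd.DataFrame(
    {"primary_purpose": ["work", "shop"], "7": [0.6, 0.1], "8": [0.4, 0.9]}
)

trips = pd.DataFrame(
    {"trip_id": [101, 102, 103], "primary_purpose": ["work", "work", "shop"]}
).set_index("trip_id")

# join each trip to its probability row (the model joins on PROBS_JOIN_COLS)
choosers = (
    trips.reset_index()
    .merge(probs_spec, on="primary_purpose", how="left")
    .set_index("trip_id")
)

period_cols = ["7", "8"]
probs = choosers[period_cols].to_numpy()

# one uniform draw per trip; choose the first period whose cumulative
# probability exceeds the draw (an inverse-CDF analogue of logit.make_choices)
draws = rng.random(len(choosers))
positions = (probs.cumsum(axis=1) < draws[:, None]).sum(axis=1)
positions = np.minimum(positions, len(period_cols) - 1)  # guard against rounding
choosers["depart"] = [int(period_cols[p]) for p in positions]
print(choosers["depart"])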
@@ -406,8 +410,8 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): trips_df = trips.to_frame() tours = tours.to_frame() - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if whale.is_table("school_escort_trips"): + school_escort_trips = whale.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -438,7 +442,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): estimator.write_choosers(trips_df[chooser_cols_for_estimation]) probs_spec = pd.read_csv( - config.config_file_path("trip_scheduling_probs.csv"), comment="#" + whale.filesystem.get_config_file_path("trip_scheduling_probs.csv"), comment="#" ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation @@ -465,13 +469,12 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): ) in chunk.adaptive_chunked_choosers_by_chunk_id( trips_df, chunk_size, trace_label, trace_label ): - i = 0 while (i < max_iterations) and not trips_chunk.empty: - # only chunk log first iteration since memory use declines with each iteration - with chunk.chunk_log(trace_label) if i == 0 else chunk.chunk_log_skip(): - + with chunk.chunk_log( + trace_label, settings=whale.settings + ) if i == 0 else chunk.chunk_log_skip(): i += 1 is_last_iteration = i == max_iterations @@ -484,6 +487,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): ) choices = run_trip_scheduling( + whale, trips_chunk, tours, probs_spec, @@ -509,7 +513,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): trips_df = trips.to_frame() - if pipeline.is_table("school_escort_trips"): + if whale.is_table("school_escort_trips"): # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -546,7 +550,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): trips_df["depart"] = choices - if pipeline.is_table("school_escort_trips"): + if whale.is_table("school_escort_trips"): # setting destination for school escort trips se_trips_df["depart"] = reindex(school_escort_trips.depart, se_trips_df.index) non_se_trips_df["depart"] = reindex(trips_df.depart, non_se_trips_df.index) @@ -562,4 +566,4 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): assert not trips_df.depart.isnull().any() - pipeline.replace_table("trips", trips_df) + whale.add_table("trips", trips_df) diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index 5a7abe1f5..7a8edb2c7 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -7,15 +7,7 @@ generate_alternative_sizes, get_time_windows, ) -from activitysim.core import ( - chunk, - config, - expressions, - inject, - pipeline, - simulate, - tracing, -) +from activitysim.core import chunk, config, expressions, simulate, tracing, workflow from activitysim.core.interaction_sample_simulate import _interaction_sample_simulate logger = logging.getLogger(__name__) @@ -216,9 +208,15 @@ def get_spec_for_segment(model_settings, spec_name, segment): def run_trip_scheduling_choice( - 
spec, tours, skims, locals_dict, chunk_size, trace_hh_id, trace_label + whale: workflow.Whale, + spec, + tours, + skims, + locals_dict, + chunk_size, + trace_hh_id, + trace_label, ): - NUM_TOUR_LEGS = 3 trace_label = tracing.extend_trace_label(trace_label, "interaction_sample_simulate") @@ -258,13 +256,11 @@ def run_trip_scheduling_choice( indirect_tours = tours.loc[tours[HAS_OB_STOPS] | tours[HAS_IB_STOPS]] if len(indirect_tours) > 0: - # Iterate through the chunks result_list = [] for i, choosers, chunk_trace_label in chunk.adaptive_chunked_choosers( - indirect_tours, chunk_size, trace_label + whale, indirect_tours, chunk_size, trace_label ): - # Sort the choosers and get the schedule alternatives choosers = choosers.sort_index() schedules = generate_schedule_alternatives(choosers).sort_index() @@ -319,9 +315,10 @@ def run_trip_scheduling_choice( return tours -@inject.step() -def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): - +@workflow.step +def trip_scheduling_choice( + whale: workflow.Whale, trips, tours, skim_dict, chunk_size, trace_hh_id +): trace_label = "trip_scheduling_choice" model_settings = config.read_model_settings("trip_scheduling_choice.yaml") spec = get_spec_for_segment(model_settings, "SPECIFICATION", "stage_one") @@ -378,6 +375,7 @@ def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): simulate.set_skim_wrapper_targets(tours_df, skims) expressions.assign_columns( + whale, df=tours_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -385,7 +383,7 @@ def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): ) tours_df = run_trip_scheduling_choice( - spec, tours_df, skims, locals_dict, chunk_size, trace_hh_id, trace_label + whale, spec, tours_df, skims, locals_dict, chunk_size, trace_hh_id, trace_label ) - pipeline.replace_table("tours", tours_df) + whale.add_table("tours", tours_df) diff --git a/activitysim/abm/models/util/annotate.py b/activitysim/abm/models/util/annotate.py index a7ef4f54c..e252e8124 100644 --- a/activitysim/abm/models/util/annotate.py +++ b/activitysim/abm/models/util/annotate.py @@ -1,12 +1,9 @@ -# ActivitySim -# See full license in LICENSE.txt. -import pandas as pd import logging -from activitysim.core import expressions -from activitysim.core import tracing -from activitysim.core import inject -from activitysim.core import pipeline +from activitysim.core import expressions, tracing, workflow + +# ActivitySim +# See full license in LICENSE.txt. """ Code for annotating tables @@ -15,7 +12,7 @@ logger = logging.getLogger(__name__) -def annotate_tours(model_settings, trace_label): +def annotate_tours(whale: workflow.Whale, model_settings, trace_label): """ Add columns to the tours table in the pipeline according to spec. @@ -24,16 +21,17 @@ def annotate_tours(model_settings, trace_label): model_settings : dict trace_label : str """ - tours = inject.get_table("tours").to_frame() + tours = whale.get_table("tours") expressions.assign_columns( + whale, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) -def annotate_trips(model_settings, trace_label): +def annotate_trips(whale: workflow.Whale, model_settings, trace_label): """ Add columns to the trips table in the pipeline according to spec. 
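annotate_trips and annotate_tours read a table out of the pipeline state, run expressions.assign_columns over it with the annotate_* block of the model settings, and write the result back with whale.add_table. The snippet below sketches the general idea of adding columns from a small expression spec using DataFrame.eval; it is a simplified stand-in for the expressions module, and the spec layout and column names are invented for the example.

import pandas as pd

trips = pd.DataFrame({"distance_km": [1.5, 3.0], "travel_time_min": [10.0, 25.0]})

# hypothetical annotation spec: new column name -> expression over existing columns
annotation_spec = {
    "distance_miles": "distance_km * 0.621371",
    "speed_kph": "distance_km / (travel_time_min / 60)",
}

# evaluate each expression against the table and assign the result,
# loosely mirroring what assign_columns does with a CSV expression file
for column, expression in annotation_spec.items():
    trips[column] = trips.eval(expression)

print(trips)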
@@ -42,10 +40,11 @@ def annotate_trips(model_settings, trace_label): model_settings : dict trace_label : str """ - trips = inject.get_table("trips").to_frame() + trips = whale.get_table("trips") expressions.assign_columns( + whale, df=trips, model_settings=model_settings.get("annotate_trips"), trace_label=tracing.extend_trace_label(trace_label, "annotate_trips"), ) - pipeline.replace_table("trips", trips) + whale.add_table("trips", trips) diff --git a/activitysim/abm/models/util/canonical_ids.py b/activitysim/abm/models/util/canonical_ids.py index 4e46e26aa..ca95169c4 100644 --- a/activitysim/abm/models/util/canonical_ids.py +++ b/activitysim/abm/models/util/canonical_ids.py @@ -5,11 +5,8 @@ import numpy as np import pandas as pd -import re -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate +from activitysim.core import config, simulate, workflow logger = logging.getLogger(__name__) @@ -58,9 +55,9 @@ def enumerate_tour_types(tour_flavors): return channels -def read_alts_file(file_name, set_index=None): +def read_alts_file(whale: workflow.Whale, file_name, set_index=None): try: - alts = simulate.read_model_alts(file_name, set_index=set_index) + alts = simulate.read_model_alts(whale, file_name, set_index=set_index) except (RuntimeError, FileNotFoundError): logger.warning(f"Could not find file {file_name} to determine tour flavors.") return pd.DataFrame() @@ -210,7 +207,7 @@ def determine_flavors_from_alts_file( return flavors -def canonical_tours(): +def canonical_tours(whale: workflow.Whale): """ create labels for every the possible tour by combining tour_type/tour_num. @@ -222,11 +219,11 @@ def canonical_tours(): # ---- non_mandatory_channels nm_model_settings_file_name = "non_mandatory_tour_frequency.yaml" nm_model_settings = config.read_model_settings(nm_model_settings_file_name) - nm_alts = read_alts_file("non_mandatory_tour_frequency_alternatives.csv") + nm_alts = read_alts_file(whale, "non_mandatory_tour_frequency_alternatives.csv") # first need to determine max extension try: - ext_probs_f = config.config_file_path( + ext_probs_f = whale.filesystem.get_config_file_path( "non_mandatory_tour_frequency_extension_probs.csv" ) extension_probs = pd.read_csv(ext_probs_f, comment="#") @@ -260,7 +257,7 @@ def canonical_tours(): mtf_model_settings_file_name = "mandatory_tour_frequency.yaml" mtf_model_settings = config.read_model_settings(mtf_model_settings_file_name) mtf_spec = mtf_model_settings.get("SPEC", "mandatory_tour_frequency.csv") - mtf_model_spec = read_alts_file(file_name=mtf_spec) + mtf_model_spec = read_alts_file(whale, file_name=mtf_spec) default_mandatory_tour_flavors = {"work": 2, "school": 2} mandatory_tour_flavors = determine_mandatory_tour_flavors( @@ -273,7 +270,7 @@ def canonical_tours(): # ---- atwork_subtour_channels atwork_model_settings_file_name = "atwork_subtour_frequency.yaml" atwork_model_settings = config.read_model_settings(atwork_model_settings_file_name) - atwork_alts = read_alts_file("atwork_subtour_frequency_alternatives.csv") + atwork_alts = read_alts_file(whale, "atwork_subtour_frequency_alternatives.csv") provided_atwork_flavors = atwork_model_settings.get("ATWORK_SUBTOUR_FLAVORS", None) default_atwork_flavors = {"eat": 1, "business": 2, "maint": 1} @@ -297,7 +294,7 @@ def canonical_tours(): # ---- joint_tour_channels jtf_model_settings_file_name = "joint_tour_frequency.yaml" jtf_model_settings = config.read_model_settings(jtf_model_settings_file_name) - jtf_alts = 
read_alts_file("joint_tour_frequency_alternatives.csv") + jtf_alts = read_alts_file(whale, "joint_tour_frequency_alternatives.csv") provided_joint_flavors = jtf_model_settings.get("JOINT_TOUR_FLAVORS", None) default_joint_flavors = { @@ -324,8 +321,8 @@ def canonical_tours(): # ---- school escort channels # only include if model is run - if pipeline.is_table("school_escort_tours") | ( - "school_escorting" in config.setting("models", default=[]) + if whale.is_table("school_escort_tours") | ( + "school_escorting" in whale.settings.models ): se_model_settings_file_name = "school_escorting.yaml" se_model_settings = config.read_model_settings(se_model_settings_file_name) @@ -417,7 +414,7 @@ def set_tour_index( return tours -def determine_max_trips_per_leg(default_max_trips_per_leg=4): +def determine_max_trips_per_leg(whale: workflow.Whale, default_max_trips_per_leg=4): model_settings_file_name = "stop_frequency.yaml" model_settings = config.read_model_settings(model_settings_file_name) @@ -426,7 +423,7 @@ def determine_max_trips_per_leg(default_max_trips_per_leg=4): # determine flavors from alternative file try: - alts = read_alts_file("stop_frequency_alternatives.csv") + alts = read_alts_file(whale, "stop_frequency_alternatives.csv") trips_per_leg = [ int(alts[c].max()) for c in alts.columns @@ -452,10 +449,10 @@ def determine_max_trips_per_leg(default_max_trips_per_leg=4): return default_max_trips_per_leg -def set_trip_index(trips, tour_id_column="tour_id"): +def set_trip_index(whale: workflow.Whale, trips, tour_id_column="tour_id"): # max number of trips per leg (inbound or outbound) of tour # = stops + 1 for primary half-tour destination - max_trips_per_leg = determine_max_trips_per_leg() + max_trips_per_leg = determine_max_trips_per_leg(whale) # canonical_trip_num: 1st trip out = 1, 2nd trip out = 2, 1st in = 5, etc. canonical_trip_num = (~trips.outbound * max_trips_per_leg) + trips.trip_num diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index 7e0f55186..c4993dcd0 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, config, inject, logit, pipeline, simulate, tracing +from activitysim.core import chunk, config, inject, logit, simulate, tracing, workflow logger = logging.getLogger(__name__) @@ -50,7 +50,9 @@ def add_pn(col, pnum): raise RuntimeError("add_pn col not list or str") -def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=None): +def assign_cdap_rank( + whale: workflow.Whale, persons, person_type_map, trace_hh_id=None, trace_label=None +): """ Assign an integer index, cdap_rank, to each household member. 
(Starting with 1, not 0) @@ -130,7 +132,7 @@ def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=Non # choose up to MAX_HHSIZE, choosing randomly others = persons[[_hh_id_, "cdap_rank"]].copy() - others["random_order"] = pipeline.get_rn_generator().random_for_df(persons) + others["random_order"] = whale.get_rn_generator().random_for_df(persons) others = ( others.sort_values(by=[_hh_id_, "random_order"], ascending=[True, True]) .groupby(_hh_id_) @@ -188,7 +190,7 @@ def individual_utilities( # calculate single person utilities indiv_utils = simulate.eval_utilities( - cdap_indiv_spec, persons, locals_d, trace_label=trace_label + whale, cdap_indiv_spec, persons, locals_d, trace_label=trace_label ) # add columns from persons to facilitate building household interactions @@ -671,7 +673,7 @@ def household_activity_choices( trace_label=trace_label, ) - utils = simulate.eval_utilities(spec, choosers, trace_label=trace_label) + utils = simulate.eval_utilities(whale, spec, choosers, trace_label=trace_label) if len(utils.index) == 0: return pd.Series(dtype="float64") @@ -680,7 +682,7 @@ def household_activity_choices( # select an activity pattern alternative for each household based on probability # result is a series indexed on _hh_index_ with the (0 based) index of the column from probs - idx_choices, rands = logit.make_choices(probs, trace_label=trace_label) + idx_choices, rands = logit.make_choices(whale, probs, trace_label=trace_label) # convert choice expressed as index into alternative name from util column label choices = pd.Series(utils.columns[idx_choices].values, index=utils.index) @@ -819,7 +821,7 @@ def extra_hh_member_choices( # select an activity pattern alternative for each person based on probability # idx_choices is a series (indexed on _persons_index_ ) with the chosen alternative represented # as the integer (0 based) index of the chosen column from probs - idx_choices, rands = logit.make_choices(probs, trace_label=trace_label) + idx_choices, rands = logit.make_choices(whale, probs, trace_label=trace_label) # convert choice from column index to activity name choices = pd.Series(probs.columns[idx_choices].values, index=probs.index) @@ -883,7 +885,7 @@ def _run_cdap( # assign integer cdap_rank to each household member # persons with cdap_rank 1..MAX_HHSIZE will be have their activities chose by CDAP model # extra household members, will have activities assigned by in fixed proportions - assign_cdap_rank(persons, person_type_map, trace_hh_id, trace_label) + assign_cdap_rank(whale, persons, person_type_map, trace_hh_id, trace_label) chunk.log_df(trace_label, "persons", persons) # Calculate CDAP utilities for each individual, ignoring interactions diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/abm/models/util/estimation.py index 6a5dbadf1..5a3ffd06c 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/abm/models/util/estimation.py @@ -9,7 +9,7 @@ import yaml from activitysim.abm.models.util import canonical_ids as cid -from activitysim.core import config, simulate +from activitysim.core import config, simulate, workflow from activitysim.core.util import reindex logger = logging.getLogger("estimation") @@ -29,11 +29,12 @@ def unlink_files(directory_path, file_types=("csv", "yaml")): print(e) -class Estimator(object): - def __init__(self, bundle_name, model_name, estimation_table_recipes): +class Estimator: + def __init__(self, whale, bundle_name, model_name, estimation_table_recipes): logger.info("Initialize Estimator 
for'%s'" % (model_name,)) + self.whale = whale self.bundle_name = bundle_name self.model_name = model_name self.settings_name = model_name @@ -41,16 +42,17 @@ def __init__(self, bundle_name, model_name, estimation_table_recipes): self.estimating = True # ensure the output data directory exists - output_dir = self.output_directory() + output_dir = self.output_directory(whale) if not os.path.exists(output_dir): os.makedirs(output_dir) # make directory if needed # delete estimation files - unlink_files(self.output_directory(), file_types=("csv", "yaml")) + unlink_files(self.output_directory(whale), file_types=("csv", "yaml")) if self.bundle_name != self.model_name: # kind of inelegant to always delete these, but ok as they are redundantly recreated for each sub model unlink_files( - self.output_directory(bundle_directory=True), file_types=("csv", "yaml") + self.output_directory(whale, bundle_directory=True), + file_types=("csv", "yaml"), ) # FIXME - not required? @@ -125,7 +127,8 @@ def output_directory(self, bundle_directory=False): assert self.model_name is not None dir = os.path.join( - config.output_file_path("estimation_data_bundle"), self.bundle_name + self.whale.filesystem.get_output_dir("estimation_data_bundle"), + self.bundle_name, ) if bundle_directory: @@ -288,7 +291,9 @@ def write_coefficients( def write_coefficients_template(self, model_settings): assert self.estimating - coefficients_df = simulate.read_model_coefficient_template(model_settings) + coefficients_df = simulate.read_model_coefficient_template( + self.whale, model_settings + ) tag = "coefficients_template" self.write_table(coefficients_df, tag, append=False) @@ -445,7 +450,7 @@ def write_spec( assert file_name is None file_name = model_settings[tag] - input_path = config.config_file_path(file_name) + input_path = whale.filesystem.get_config_file_path(file_name) table_name = tag # more readable than full spec file_name output_path = self.output_file_path(table_name, "csv", bundle_directory) @@ -462,14 +467,16 @@ def __init__(self): self.model_estimation_table_types = {} self.estimating = {} - def initialize_settings(self): + def initialize_settings(self, whale): # FIXME - can't we just initialize in init and handle no-presence of settings file as not enabled if self.settings_initialized: return assert not self.settings_initialized - settings = config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME) + settings = whale.filesystem.read_model_settings( + ESTIMATION_SETTINGS_FILE_NAME, mandatory=False + ) self.enabled = settings.get("enable", "True") self.bundles = settings.get("bundles", []) @@ -488,7 +495,7 @@ def initialize_settings(self): table_name, ESTIMATION_SETTINGS_FILE_NAME, ) - file_path = config.data_file_path( + file_path = whale.filesystem.data_file_path( table_info["file_name"], mandatory=True ) assert os.path.exists( @@ -507,7 +514,7 @@ def initialize_settings(self): self.settings_initialized = True - def begin_estimation(self, model_name, bundle_name=None): + def begin_estimation(self, whale, model_name, bundle_name=None) -> Estimator: """ begin estimating of model_name is specified as model to estimate, otherwise return False @@ -517,11 +524,11 @@ def begin_estimation(self, model_name, bundle_name=None): Returns ------- - + Estimator """ # load estimation settings file if not self.settings_initialized: - self.initialize_settings() + self.initialize_settings(whale) # global estimation setting if not self.enabled: diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py 
index f42c15c16..e4aff206f 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -32,6 +32,7 @@ def filter_chooser_columns(choosers, logsum_settings, model_settings): def compute_logsums( + whale, choosers, tour_purpose, logsum_settings, @@ -127,10 +128,14 @@ def compute_logsums( else: logger.error("Choosers table already has column 'duration'.") - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) - coefficients = simulate.get_segment_coefficients(logsum_settings, tour_purpose) + logsum_spec = simulate.read_model_spec(whale, file_name=logsum_settings["SPEC"]) + coefficients = simulate.get_segment_coefficients( + whale, logsum_settings, tour_purpose + ) - logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) + logsum_spec = simulate.eval_coefficients( + whale, logsum_spec, coefficients, estimator=None + ) nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) @@ -211,6 +216,7 @@ def compute_logsums( simulate.set_skim_wrapper_targets(choosers, skims) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -218,6 +224,7 @@ def compute_logsums( ) logsums = simulate.simple_simulate_logsums( + whale, choosers, logsum_spec, nest_spec, diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index f28713b2c..f050cbeef 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -4,7 +4,7 @@ import pandas as pd -from activitysim.core import config, expressions, simulate, tracing +from activitysim.core import config, expressions, simulate, tracing, workflow """ At this time, these utilities are mostly for transforming the mode choice @@ -83,6 +83,7 @@ def mode_choice_simulate( def run_tour_mode_choice_simulate( + whale: workflow.Whale, choosers, tour_purpose, model_settings, @@ -106,7 +107,7 @@ def run_tour_mode_choice_simulate( spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients = simulate.get_segment_coefficients(model_settings, tour_purpose) - spec = simulate.eval_coefficients(spec, coefficients, estimator) + spec = simulate.eval_coefficients(whale, spec, coefficients, estimator) nest_spec = config.get_logit_model_settings(model_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) diff --git a/activitysim/abm/models/util/probabilistic_scheduling.py b/activitysim/abm/models/util/probabilistic_scheduling.py index aab2e8d70..4de9a5686 100644 --- a/activitysim/abm/models/util/probabilistic_scheduling.py +++ b/activitysim/abm/models/util/probabilistic_scheduling.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, config, inject, logit, pipeline, simulate, tracing +from activitysim.core import chunk, logit, tracing, workflow logger = logging.getLogger(__name__) @@ -222,6 +222,7 @@ def _postprocess_scheduling_choices( def make_scheduling_choices( + whale: workflow.Whale, choosers_df, scheduling_mode, probs_spec, @@ -267,7 +268,7 @@ def make_scheduling_choices( ).set_index(choosers_df.index.name) chunk.log_df(trace_label, "choosers", choosers) - if trace_hh_id and tracing.has_trace_targets(choosers_df): + if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): tracing.trace_df(choosers, "%s.choosers" % trace_label) # different pre-processing is required based on the scheduling mode 
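The recurring change in this file is that logit.make_choices now takes the whale as its first argument, so random draws come from the pipeline-managed generator. A minimal sketch of that calling pattern (helper name and trace label are illustrative; the signature follows the changed calls in the hunk below):

    import pandas as pd
    from activitysim.core import logit, workflow

    def choose_one_alternative(whale: workflow.Whale, probs: pd.DataFrame, trace_label: str):
        # probs: one row per chooser, one column per alternative, rows summing to 1
        idx_choices, rands = logit.make_choices(whale, probs, trace_label=trace_label)
        # convert the (0-based) column index of the chosen alternative back to its label
        return pd.Series(probs.columns[idx_choices].values, index=probs.index)
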
@@ -284,17 +285,17 @@ def make_scheduling_choices( chunk.log_df(trace_label, "chooser_probs", chooser_probs) - if trace_hh_id and tracing.has_trace_targets(choosers_df): + if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): tracing.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) raw_choices, rands = logit.make_choices( - chooser_probs, trace_label=trace_label, trace_choosers=choosers + whale, chooser_probs, trace_label=trace_label, trace_choosers=choosers ) chunk.log_df(trace_label, "choices", raw_choices) chunk.log_df(trace_label, "rands", rands) - if trace_hh_id and tracing.has_trace_targets(choosers_df): + if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): tracing.trace_df( raw_choices, "%s.choices" % trace_label, @@ -324,7 +325,7 @@ def make_scheduling_choices( ) # trace before removing failures - if trace_hh_id and tracing.has_trace_targets(choosers_df): + if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): tracing.trace_df( choices, "%s.choices" % trace_label, columns=[None, trace_choice_col_name] ) diff --git a/activitysim/abm/models/util/school_escort_tours_trips.py b/activitysim/abm/models/util/school_escort_tours_trips.py index 778fb8645..3c72d0175 100644 --- a/activitysim/abm/models/util/school_escort_tours_trips.py +++ b/activitysim/abm/models/util/school_escort_tours_trips.py @@ -1,11 +1,11 @@ import logging -import pandas as pd -import numpy as np import warnings +import numpy as np +import pandas as pd + from activitysim.abm.models.util import canonical_ids -from activitysim.core import pipeline -from activitysim.core import inject +from activitysim.core import inject, workflow from activitysim.core.util import reindex from ..school_escorting import NUM_ESCORTEES @@ -394,10 +394,10 @@ def process_tours_after_escorting_model(escort_bundles, tours): return tours -def merge_school_escort_trips_into_pipeline(): - school_escort_trips = pipeline.get_table("school_escort_trips") - tours = pipeline.get_table("tours") - trips = pipeline.get_table("trips") +def merge_school_escort_trips_into_pipeline(whale: workflow.Whale): + school_escort_trips = whale.get_dataframe("school_escort_trips") + tours = whale.get_dataframe("tours") + trips = whale.get_dataframe("trips") # want to remove stops if school escorting takes place on that half tour so we can replace them with the actual stops out_se_tours = tours[ @@ -469,7 +469,7 @@ def merge_school_escort_trips_into_pipeline(): trips["destination"] = trips["destination"].astype(int) # updating trip_id now that we have all trips - trips = canonical_ids.set_trip_index(trips) + trips = canonical_ids.set_trip_index(whale, trips) school_escort_trip_id_map = { v: k for k, v in trips.loc[ @@ -492,10 +492,10 @@ def merge_school_escort_trips_into_pipeline(): trips.drop(columns="school_escort_trip_id", inplace=True) # replace trip table and pipeline and register with the random number generator - pipeline.replace_table("trips", trips) + whale.add_table("trips", trips) pipeline.get_rn_generator().drop_channel("trips") pipeline.get_rn_generator().add_channel("trips", trips) - pipeline.replace_table("school_escort_trips", school_escort_trips) + whale.add_table("school_escort_trips", school_escort_trips) # updating stop frequency in tours tabel to be consistent num_outbound_stops = ( @@ -510,7 +510,7 @@ def merge_school_escort_trips_into_pipeline(): tours.loc[stop_freq.index, "stop_frequency"] = stop_freq # no need to reset random number generator since no tours added - pipeline.replace_table("tours", 
tours) + whale.add_table("tours", tours) return trips @@ -528,7 +528,7 @@ def recompute_tour_count_statistics(): tours["tour_num"] = grouped.cumcount() + 1 tours["tour_count"] = tours["tour_num"] + grouped.cumcount(ascending=False) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) def create_pure_school_escort_tours(bundles): diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index 0e4bd6839..55b3ad23c 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -8,9 +8,8 @@ import pytest import yaml -from activitysim.core import chunk, config, inject, simulate - -from .. import cdap +from activitysim.abm.models.util import cdap +from activitysim.core import chunk, config, inject, simulate, workflow @pytest.fixture(scope="module") @@ -49,9 +48,9 @@ def setup_function(): def test_bad_coefficients(): - coefficients = pd.read_csv( - config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + whale.filesystem.get_config_file_path("cdap_interaction_coefficients.csv"), + comment="#", ) coefficients = cdap.preprocess_interaction_coefficients(coefficients) @@ -63,11 +62,10 @@ def test_bad_coefficients(): def test_assign_cdap_rank(people, model_settings): - person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_assign_cdap_rank", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log("test_assign_cdap_rank", base=True, settings=whale.settings): + cdap.assign_cdap_rank(whale, people, person_type_map) expected = pd.Series( [1, 1, 1, 2, 2, 1, 3, 1, 2, 1, 3, 2, 1, 3, 2, 4, 1, 3, 4, 2], index=people.index @@ -79,15 +77,16 @@ def test_assign_cdap_rank(people, model_settings): def test_individual_utilities(people, model_settings): - cdap_indiv_and_hhsize1 = simulate.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_individual_utilities", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log( + "test_individual_utilities", base=True, settings=whale.settings + ): + cdap.assign_cdap_rank(whale, people, person_type_map) individual_utils = cdap.individual_utilities( people, cdap_indiv_and_hhsize1, locals_d=None ) @@ -126,15 +125,15 @@ def test_individual_utilities(people, model_settings): ) -def test_build_cdap_spec_hhsize2(people, model_settings): - +def test_build_cdap_spec_hhsize2(whale: workflow.Whale, people, model_settings): hhsize = 2 cdap_indiv_and_hhsize1 = simulate.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) interaction_coefficients = pd.read_csv( - config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + whale.filesystem.get_config_file_path("cdap_interaction_coefficients.csv"), + comment="#", ) interaction_coefficients = cdap.preprocess_interaction_coefficients( interaction_coefficients @@ -142,8 +141,10 @@ def test_build_cdap_spec_hhsize2(people, model_settings): person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_build_cdap_spec_hhsize2", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log( + "test_build_cdap_spec_hhsize2", base=True, settings=whale.settings + ): + cdap.assign_cdap_rank(whale, people, person_type_map) indiv_utils = cdap.individual_utilities( people, cdap_indiv_and_hhsize1, locals_d=None ) diff --git 
a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index fed9c1557..b22f2e241 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -5,14 +5,13 @@ import numpy as np import pandas as pd +from activitysim.abm.models.util import logsums as logsum from activitysim.abm.tables.size_terms import tour_destination_size_terms -from activitysim.core import config, inject, los, pipeline, simulate, tracing +from activitysim.core import config, inject, los, simulate, tracing, workflow from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex -from . import logsums as logsum - logger = logging.getLogger(__name__) DUMP = False @@ -25,7 +24,6 @@ class SizeTermCalculator(object): """ def __init__(self, size_term_selector): - # do this once so they can request size_terms for various segments (tour_type or purpose) land_use = inject.get_table("land_use") size_terms = inject.get_injectable("size_terms") @@ -78,7 +76,6 @@ def _destination_sample( trace_label, zone_layer=None, ): - model_spec = simulate.spec_for_segment( model_settings, spec_id="SAMPLE_SPEC", @@ -112,6 +109,7 @@ def _destination_sample( log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample( + whale, choosers, alternatives=destination_size_terms, sample_size=sample_size, @@ -146,7 +144,6 @@ def destination_sample( chunk_size, trace_label, ): - chunk_tag = "tour_destination.sample" # create wrapper with keys for this lookup @@ -229,7 +226,7 @@ def aggregate_size_terms(dest_size_terms, network_los): return MAZ_size_terms, TAZ_size_terms -def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): +def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_label): """ Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ @@ -252,7 +249,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): # 542963 59 0.008628 1 13243 trace_hh_id = inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -364,7 +361,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1)) assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) - rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) + rands = whale.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) rands = rands.reshape(-1, 1) assert len(rands) == num_choosers * taz_sample_size assert len(rands) == maz_probs.shape[0] @@ -382,7 +379,6 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - taz_choices_trace_targets = tracing.trace_targets( taz_choices, slicer=CHOOSER_ID ) @@ -469,7 +465,6 @@ def destination_presample( chunk_size, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") chunk_tag = "tour_destination.presample" @@ -508,7 +503,7 @@ def 
destination_presample( ) # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total - maz_choices = choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label) + maz_choices = choose_MAZ_for_TAZ(whale, taz_sample, MAZ_size_terms, trace_label) assert DEST_MAZ in maz_choices maz_choices = maz_choices.rename(columns={DEST_MAZ: alt_dest_col_name}) @@ -527,7 +522,6 @@ def run_destination_sample( chunk_size, trace_label, ): - # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge) chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] @@ -561,7 +555,6 @@ def run_destination_sample( ) if pre_sample_taz: - logger.info( "Running %s destination_presample with %d tours" % (trace_label, len(tours)) ) @@ -652,6 +645,7 @@ def run_destination_logsums( tracing.dump_df(DUMP, choosers, trace_label, "choosers") logsums = logsum.compute_logsums( + whale, choosers, tour_purpose, logsum_settings, @@ -792,7 +786,6 @@ def run_tour_destination( trace_label, skip_choice=False, ): - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) # maps segment names to compact (integer) ids @@ -807,7 +800,6 @@ def run_tour_destination( choices_list = [] sample_list = [] for segment_name in segments: - segment_trace_label = tracing.extend_trace_label(trace_label, segment_name) if chooser_segment_column is not None: diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 91377bd6d..e774eef20 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -13,9 +13,9 @@ inject, logit, los, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate @@ -50,7 +50,6 @@ def create_od_id_col(df, origin_col, destination_col): def _get_od_cols_from_od_id( df, orig_col_name=None, dest_col_name=None, od_id_col="choice" ): - df[orig_col_name] = df[od_id_col].str.split("_").str[0].astype(int) df[dest_col_name] = df[od_id_col].str.split("_").str[1].astype(int) @@ -112,7 +111,9 @@ def _create_od_alts_from_dest_size_terms( return od_alts +@workflow.func def _od_sample( + whale: workflow.Whale, spec_segment_name, choosers, network_los, @@ -127,7 +128,6 @@ def _od_sample( chunk_tag, trace_label, ): - model_spec = simulate.spec_for_segment( model_settings, spec_id="SAMPLE_SPEC", @@ -186,6 +186,7 @@ def _od_sample( logger.error("Alts df is missing origin skim key column.") choices = interaction_sample( + whale, choosers, alternatives=od_alts_df, sample_size=sample_size, @@ -212,7 +213,6 @@ def od_sample( chunk_size, trace_label, ): - chunk_tag = "tour_od.sample" origin_col_name = model_settings["ORIG_COL_NAME"] @@ -275,7 +275,6 @@ def map_ext_maz_to_maz(s): def aggregate_size_terms(dest_size_terms, network_los): - # aggregate MAZ_size_terms to TAZ_size_terms MAZ_size_terms = dest_size_terms.copy() @@ -310,7 +309,9 @@ def aggregate_size_terms(dest_size_terms, network_los): return MAZ_size_terms, TAZ_size_terms +@workflow.func def choose_MAZ_for_TAZ( + whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_label, @@ -342,7 +343,7 @@ def choose_MAZ_for_TAZ( # 542963 59 0.008628 1 13243 trace_hh_id = inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: 
trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -464,7 +465,7 @@ def choose_MAZ_for_TAZ( maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1)) assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) - rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) + rands = whale.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) rands = rands.reshape(-1, 1) assert len(rands) == num_choosers * taz_sample_size assert len(rands) == maz_probs.shape[0] @@ -484,7 +485,6 @@ def choose_MAZ_for_TAZ( taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - taz_choices_trace_targets = tracing.trace_targets( taz_choices, slicer=CHOOSER_ID ) @@ -562,7 +562,9 @@ def choose_MAZ_for_TAZ( return taz_choices_w_maz +@workflow.func def od_presample( + whale: workflow.Whale, spec_segment_name, choosers, model_settings, @@ -572,7 +574,6 @@ def od_presample( chunk_size, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") chunk_tag = "tour_od.presample" @@ -617,6 +618,7 @@ def od_presample( # MAZ size_term fraction of TAZ total maz_choices = choose_MAZ_for_TAZ( + whale, orig_MAZ_dest_TAZ_sample, MAZ_size_terms, trace_label, @@ -643,7 +645,6 @@ class SizeTermCalculator(object): """ def __init__(self, size_term_selector): - # do this once so they can request size_terms for various segments (tour_type or purpose) land_use = inject.get_table("land_use") self.land_use = land_use @@ -685,6 +686,7 @@ def dest_size_terms_series(self, segment_name): def run_od_sample( + whale, spec_segment_name, tours, model_settings, @@ -694,7 +696,6 @@ def run_od_sample( chunk_size, trace_label, ): - model_spec = simulate.spec_for_segment( model_settings, spec_id="SAMPLE_SPEC", @@ -724,12 +725,12 @@ def run_od_sample( ) if pre_sample_taz: - logger.info( "Running %s destination_presample with %d tours" % (trace_label, len(tours)) ) choices = od_presample( + whale, spec_segment_name, choosers, model_settings, @@ -795,7 +796,6 @@ def run_od_logsums( # run trip mode choice to compute tour mode choice logsums if logsum_settings.get("COMPUTE_TRIP_MODE_CHOICE_LOGSUMS", False): - pseudo_tours = choosers.copy() trip_mode_choice_settings = config.read_model_settings("trip_mode_choice") @@ -865,9 +865,9 @@ def run_od_logsums( if col not in trips: logsum_trips[col] = reindex(pseudo_tours[col], logsum_trips.unique_id) - pipeline.replace_table("trips", logsum_trips) + whale.add_table("trips", logsum_trips) tracing.register_traceable_table("trips", logsum_trips) - pipeline.get_rn_generator().add_channel("trips", logsum_trips) + whale.get_rn_generator().add_channel("trips", logsum_trips) # run trip mode choice on pseudo-trips. 
use orca instead of pipeline to # execute the step because pipeline can only handle one open step at a time @@ -895,7 +895,7 @@ def run_od_logsums( choosers.reset_index(inplace=True) choosers.set_index(choosers_og_index, inplace=True) - pipeline.get_rn_generator().drop_channel("trips") + whale.get_rn_generator().drop_channel("trips") tracing.deregister_traceable_table("trips") assert (od_sample.index == choosers.index).all() @@ -903,6 +903,7 @@ def run_od_logsums( od_sample[col] = choosers[col] logsums = logsum.compute_logsums( + whale, choosers, spec_segment_name, logsum_settings, @@ -1030,6 +1031,7 @@ def run_od_simulate( def run_tour_od( + whale, tours, persons, want_logsums, @@ -1041,7 +1043,6 @@ def run_tour_od( trace_hh_id, trace_label, ): - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) preprocessor_settings = model_settings.get("preprocessor", None) origin_col_name = model_settings["ORIG_COL_NAME"] @@ -1057,7 +1058,6 @@ def run_tour_od( choices_list = [] sample_list = [] for segment_name in segments: - choosers = tours[tours[chooser_segment_column] == segment_name] choosers = pd.merge( @@ -1070,6 +1070,7 @@ def run_tour_od( # - annotate choosers if preprocessor_settings: expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, trace_label=trace_label, @@ -1090,6 +1091,7 @@ def run_tour_od( spec_segment_name = segment_name # spec_segment_name is segment_name od_sample_df = run_od_sample( + whale, spec_segment_name, choosers, model_settings, diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index 1d6de7316..ed6f89105 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -4,15 +4,15 @@ import pandas as pd -from activitysim.core import config, expressions, inject, simulate - -from . import estimation -from . 
import vectorize_tour_scheduling as vts +from activitysim.abm.models.util import estimation +from activitysim.abm.models.util import vectorize_tour_scheduling as vts +from activitysim.core import config, expressions, inject, simulate, workflow logger = logging.getLogger(__name__) def run_tour_scheduling( + whale: workflow.Whale, model_name, chooser_tours, persons_merged, @@ -21,7 +21,6 @@ def run_tour_scheduling( chunk_size, trace_hh_id, ): - trace_label = model_name model_settings_file_name = f"{model_name}.yaml" @@ -50,6 +49,7 @@ def run_tour_scheduling( locals_d.update(config.get_model_constants(model_settings)) expressions.assign_columns( + whale, df=chooser_tours, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -63,7 +63,6 @@ def run_tour_scheduling( specs = {} sharrow_skips = {} for spec_segment_name, spec_settings in spec_segment_settings.items(): - bundle_name = f"{model_name}_{spec_segment_name}" # estimator for this tour_segment @@ -75,7 +74,7 @@ def run_tour_scheduling( model_spec = simulate.read_model_spec(file_name=spec_file_name) coefficients_df = simulate.read_model_coefficients(spec_settings) specs[spec_segment_name] = simulate.eval_coefficients( - model_spec, coefficients_df, estimator + whale, model_spec, coefficients_df, estimator ) sharrow_skips[spec_segment_name] = spec_settings.get("sharrow_skip", False) @@ -115,7 +114,9 @@ def run_tour_scheduling( model_spec = simulate.read_model_spec(file_name=spec_file_name) sharrow_skip = model_settings.get("sharrow_skip", False) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) if estimator: estimators[None] = estimator # add to local list diff --git a/activitysim/abm/models/util/trip.py b/activitysim/abm/models/util/trip.py index 870801e27..0e9f1234f 100644 --- a/activitysim/abm/models/util/trip.py +++ b/activitysim/abm/models/util/trip.py @@ -151,7 +151,7 @@ def get_time_windows(residual, level): @inject.injectable() def stop_frequency_alts(): # alt file for building trips even though simulation is simple_simulate not interaction_simulate - file_path = config.config_file_path("stop_frequency_alternatives.csv") + file_path = whale.filesystem.get_config_file_path("stop_frequency_alternatives.csv") df = pd.read_csv(file_path, comment="#") df.set_index("alt", inplace=True) return df @@ -278,7 +278,7 @@ def initialize_from_tours(tours, stop_frequency_alts, addtl_tour_cols_to_preserv else: trip_index_tour_id = "tour_id" - set_trip_index(trips, trip_index_tour_id) + set_trip_index(whale, trips, trip_index_tour_id) del trips["tour_temp_index"] return trips diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 0882a6f74..b198f61a0 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -7,7 +7,7 @@ from activitysim.core import chunk, config, expressions, inject, los, simulate from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex @@ -92,7 +92,14 @@ def skims_for_logsums(tour_purpose, model_settings, trace_label): def _compute_logsums( - alt_tdd, tours_merged, tour_purpose, 
model_settings, network_los, skims, trace_label + whale: workflow.Whale, + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + network_los, + skims, + trace_label, ): """ compute logsums for tours using skims for alt_tdd out_period and in_period @@ -100,7 +107,7 @@ def _compute_logsums( trace_label = tracing.extend_trace_label(trace_label, "logsums") - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) choosers = alt_tdd.join(tours_merged, how="left", rsuffix="_chooser") logger.info( @@ -134,6 +141,7 @@ def _compute_logsums( simulate.set_skim_wrapper_targets(choosers, skims) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -143,7 +151,7 @@ def _compute_logsums( # - compute logsums logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) logsum_spec = simulate.eval_coefficients( - logsum_spec, coefficients, estimator=None + whale, logsum_spec, coefficients, estimator=None ) nest_spec = config.get_logit_model_settings(logsum_settings) @@ -152,6 +160,7 @@ def _compute_logsums( ) logsums = simulate.simple_simulate_logsums( + whale, choosers, logsum_spec, nest_spec, @@ -164,14 +173,17 @@ def _compute_logsums( return logsums -def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): +def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): tdd_segments = inject.get_injectable("tdd_alt_segments", None) alt_tdd_periods = None logger.info("tdd_alt_segments specified for representative logsums") - with chunk.chunk_log(tracing.extend_trace_label(trace_label, "dedupe_alt_tdd")): + with chunk.chunk_log( + tracing.extend_trace_label(trace_label, "dedupe_alt_tdd"), + settings=whale.settings, + ): if tdd_segments is not None: @@ -287,7 +299,7 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): def compute_logsums( - alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label + whale, alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label ): """ Compute logsums for the tour alt_tdds, which will differ based on their different start, stop @@ -319,7 +331,7 @@ def compute_logsums( # outside chunk_log context because we extend log_df call for alt_tdd made by our only caller _schedule_tours chunk.log_df(trace_label, "alt_tdd", alt_tdd) - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS: # compute logsums for all the tour alt_tdds (inefficient) @@ -336,7 +348,7 @@ def compute_logsums( index_name = alt_tdd.index.name deduped_alt_tdds, redupe_columns = dedupe_alt_tdd( - alt_tdd, tour_purpose, trace_label + whale, alt_tdd, tour_purpose, trace_label ) chunk.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) @@ -444,7 +456,13 @@ def get_previous_tour_by_tourid( def tdd_interaction_dataset( - tours, alts, timetable, choice_column, window_id_col, trace_label + whale: workflow.Whale, + tours, + alts, + timetable, + choice_column, + window_id_col, + trace_label, ): """ interaction_sample_simulate expects @@ -473,7 +491,7 @@ def tdd_interaction_dataset( trace_label = tracing.extend_trace_label(trace_label, "tdd_interaction_dataset") - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): alts_ids = np.tile(alts.index, len(tours.index)) chunk.log_df(trace_label, "alts_ids", alts_ids) @@ -574,6 +592,7 @@ def 
run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe alts = alts.copy() expressions.assign_columns( + whale, df=alts, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -584,6 +603,7 @@ def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe def _schedule_tours( + whale: workflow.Whale, tours, persons_merged, alts, @@ -673,7 +693,7 @@ def _schedule_tours( # indexed (not unique) on tour_id choice_column = TDD_CHOICE_COLUMN alt_tdd = tdd_interaction_dataset( - tours, alts, timetable, choice_column, window_id_col, tour_trace_label + whale, tours, alts, timetable, choice_column, window_id_col, tour_trace_label ) # print(f"tours {tours.shape} alts {alts.shape}") @@ -682,7 +702,13 @@ def _schedule_tours( # - add logsums if logsum_tour_purpose: logsums = compute_logsums( - alt_tdd, tours, logsum_tour_purpose, model_settings, skims, tour_trace_label + whale, + alt_tdd, + tours, + logsum_tour_purpose, + model_settings, + skims, + tour_trace_label, ) else: logsums = 0 @@ -758,6 +784,7 @@ def _schedule_tours( def schedule_tours( + whale: workflow.Whale, tours, persons_merged, alts, @@ -806,10 +833,11 @@ def schedule_tours( result_list = [] for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - tours, chunk_size, tour_trace_label, tour_chunk_tag + whale, tours, chunk_size, tour_trace_label, tour_chunk_tag ): choices = _schedule_tours( + whale, chooser_chunk, persons_merged, alts, @@ -959,6 +987,7 @@ def vectorize_tour_scheduling( ) choices = schedule_tours( + whale, nth_tours_in_segment, persons_merged, alts, @@ -989,6 +1018,7 @@ def vectorize_tour_scheduling( assert tour_segments.get("spec_segment_name") is None choices = schedule_tours( + whale, nth_tours, persons_merged, alts, @@ -1107,6 +1137,7 @@ def vectorize_subtour_scheduling( assert not nth_tours.parent_tour_id.duplicated().any() choices = schedule_tours( + whale, nth_tours, persons_merged, alts, @@ -1167,6 +1198,7 @@ def build_joint_tour_timetables( def vectorize_joint_tour_scheduling( + whale: workflow.Whale, joint_tours, joint_tour_participants, persons_merged, @@ -1256,6 +1288,7 @@ def vectorize_joint_tour_scheduling( ) choices = schedule_tours( + whale, nth_tours, persons_merged, alts, diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index e68abc386..227b78c79 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -1,34 +1,17 @@ # ActivitySim # See full license in LICENSE.txt. -import itertools import logging -import os -import numpy as np import pandas as pd -from activitysim.core import ( - assign, - config, - expressions, - inject, - logit, - los, - pipeline, - simulate, - tracing, -) -from activitysim.core.interaction_simulate import interaction_simulate -from activitysim.core.util import assign_in_place - -from .util import estimation -from .util.mode import mode_choice_simulate +from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger(__name__) -def annotate_vehicle_allocation(model_settings, trace_label): +def annotate_vehicle_allocation(whale: workflow.Whale, model_settings, trace_label): """ Add columns to the tours table in the pipeline according to spec. 
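annotate_vehicle_allocation above follows the same shape as annotate_trips, the annotate_vehicle_type_choice_* helpers, and the other annotate functions touched in this patch. A condensed sketch of that shape, using only calls that appear in these hunks; the table and settings key names are placeholders:

    from activitysim.core import expressions, tracing, workflow

    def annotate_example(whale: workflow.Whale, model_settings, trace_label):
        # pull the current copy of the table out of the pipeline state
        df = whale.get_dataframe("example_table")  # placeholder table name
        # evaluate the annotation expressions, threading whale through assign_columns
        expressions.assign_columns(
            whale,
            df=df,
            model_settings=model_settings.get("annotate_example_table"),
            trace_label=tracing.extend_trace_label(trace_label, "annotate_example_table"),
        )
        # write the annotated table back, replacing the previous copy
        whale.add_table("example_table", df)
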
@@ -37,13 +20,14 @@ def annotate_vehicle_allocation(model_settings, trace_label): model_settings : dict trace_label : str """ - tours = inject.get_table("tours").to_frame() + tours = whale.get_dataframe("tours") expressions.assign_columns( + whale, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) def get_skim_dict(network_los, choosers): @@ -89,8 +73,9 @@ def get_skim_dict(network_los, choosers): return skims -@inject.step() +@workflow.step def vehicle_allocation( + whale: workflow.Whale, persons, households, vehicles, @@ -113,6 +98,7 @@ def vehicle_allocation( Parameters ---------- + whale : workflow.Whale persons : orca.DataFrameWrapper households : orca.DataFrameWrapper vehicles : orca.DataFrameWrapper @@ -130,9 +116,11 @@ def vehicle_allocation( estimator = estimation.manager.begin_estimation("vehicle_allocation") - model_spec_raw = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec_raw, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec_raw, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -182,6 +170,7 @@ def vehicle_allocation( preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -243,7 +232,7 @@ def vehicle_allocation( estimator.write_override_choices(choices) estimator.end_estimation() - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) tracing.print_summary( "vehicle_allocation", tours[tours_veh_occup_cols], value_counts=True @@ -251,7 +240,7 @@ def vehicle_allocation( annotate_settings = model_settings.get("annotate_tours", None) if annotate_settings: - annotate_vehicle_allocation(model_settings, trace_label) + annotate_vehicle_allocation(whale, model_settings, trace_label) if trace_hh_id: tracing.trace_df(tours, label="vehicle_allocation", warn_if_empty=True) diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index 95723eb9d..d31f3278c 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -9,25 +9,24 @@ import pandas as pd from activitysim.core import ( - assign, config, expressions, inject, logit, - los, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_simulate import interaction_simulate -from activitysim.core.util import assign_in_place from .util import estimation logger = logging.getLogger(__name__) -def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_label): +def append_probabilistic_vehtype_type_choices( + whale: workflow.Whale, choices, model_settings, trace_label +): """ Select a fuel type for the provided body type and age of the vehicle. 
@@ -46,7 +45,9 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab table of chosen vehicle types """ probs_spec_file = model_settings.get("PROBS_SPEC", None) - probs_spec = pd.read_csv(config.config_file_path(probs_spec_file), comment="#") + probs_spec = pd.read_csv( + whale.filesystem.get_config_file_path(probs_spec_file), comment="#" + ) fleet_year = model_settings.get("FLEET_YEAR") probs_spec["age"] = (1 + fleet_year - probs_spec["vehicle_year"]).astype(int) @@ -75,7 +76,7 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab # make probabilistic choices prob_choices, rands = logit.make_choices( - chooser_probs, trace_label=trace_label, trace_choosers=choosers + whale, chooser_probs, trace_label=trace_label, trace_choosers=choosers ) # convert alt choice index to vehicle type attribute @@ -91,7 +92,9 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab return choices -def annotate_vehicle_type_choice_households(model_settings, trace_label): +def annotate_vehicle_type_choice_households( + whale: workflow.Whale, model_settings, trace_label +): """ Add columns to the households table in the pipeline according to spec. @@ -102,14 +105,17 @@ def annotate_vehicle_type_choice_households(model_settings, trace_label): """ households = inject.get_table("households").to_frame() expressions.assign_columns( + whale, df=households, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households) + whale.add_table("households", households) -def annotate_vehicle_type_choice_persons(model_settings, trace_label): +def annotate_vehicle_type_choice_persons( + whale: workflow.Whale, model_settings, trace_label +): """ Add columns to the persons table in the pipeline according to spec. @@ -120,14 +126,17 @@ def annotate_vehicle_type_choice_persons(model_settings, trace_label): """ persons = inject.get_table("persons").to_frame() expressions.assign_columns( + whale, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", households) + whale.add_table("persons", persons) -def annotate_vehicle_type_choice_vehicles(model_settings, trace_label): +def annotate_vehicle_type_choice_vehicles( + whale: workflow.Whale, model_settings, trace_label +): """ Add columns to the vehicles table in the pipeline according to spec. 
@@ -138,11 +147,12 @@ def annotate_vehicle_type_choice_vehicles(model_settings, trace_label): """ vehicles = inject.get_table("vehicles").to_frame() expressions.assign_columns( + whale, df=vehicles, model_settings=model_settings.get("annotate_vehicles"), trace_label=tracing.extend_trace_label(trace_label, "annotate_vehicles"), ) - pipeline.replace_table("vehicles", vehicles) + whale.add_table("vehicles", vehicles) def get_combinatorial_vehicle_alternatives(alts_cats_dict): @@ -200,7 +210,6 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da # merge vehicle type data to alternatives if data is provided if (vehicle_type_data is not None) and (probs_spec_file is None): - alts_wide = pd.merge( alts_wide, vehicle_type_data, @@ -250,7 +259,7 @@ def get_vehicle_type_data(model_settings, vehicle_type_data_file): table of vehicle type data with required body_type, age, and fuel_type columns """ vehicle_type_data = pd.read_csv( - config.config_file_path(vehicle_type_data_file), comment="#" + whale.filesystem.get_config_file_path(vehicle_type_data_file), comment="#" ) fleet_year = model_settings.get("FLEET_YEAR") @@ -267,6 +276,7 @@ def get_vehicle_type_data(model_settings, vehicle_type_data_file): def iterate_vehicle_type_choice( + whale: workflow.Whale, vehicles_merged, model_settings, model_spec, @@ -348,6 +358,7 @@ def iterate_vehicle_type_choice( preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -371,7 +382,7 @@ def iterate_vehicle_type_choice( simulation_type == "simple_simulate" ), "SIMULATION_TYPE needs to be interaction_simulate or simple_simulate" - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers if simulation_type == "interaction_simulate": assert ( @@ -379,6 +390,7 @@ def iterate_vehicle_type_choice( ), "Need to supply combinatorial_alts in yaml" choices = interaction_simulate( + whale, choosers=choosers, alternatives=alts_wide, spec=model_spec, @@ -423,7 +435,7 @@ def iterate_vehicle_type_choice( # STEP II: append probabilistic vehicle type attributes if probs_spec_file is not None: choices = append_probabilistic_vehtype_type_choices( - choices, model_settings, trace_label + whale, choices, model_settings, trace_label ) vehicles_merged.loc[choices.index, "already_owned_veh"] = choices[ @@ -449,9 +461,15 @@ def iterate_vehicle_type_choice( return all_choices, all_choosers -@inject.step() +@workflow.step def vehicle_type_choice( - persons, households, vehicles, vehicles_merged, chunk_size, trace_hh_id + whale: workflow.Whale, + persons, + households, + vehicles, + vehicles_merged, + chunk_size, + trace_hh_id, ): """Assign a vehicle type to each vehicle in the `vehicles` table. 
@@ -501,9 +519,11 @@ def vehicle_type_choice( estimator = estimation.manager.begin_estimation("vehicle_type") - model_spec_raw = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec_raw, coefficients_df, estimator) + model_spec = simulate.eval_coefficients( + whale, model_spec_raw, coefficients_df, estimator + ) constants = config.get_model_constants(model_settings) @@ -512,6 +532,7 @@ def vehicle_type_choice( locals_dict.update(coefficients_df) choices, choosers = iterate_vehicle_type_choice( + whale, vehicles_merged, model_settings, model_spec, @@ -550,15 +571,15 @@ def vehicle_type_choice( # update vehicles table # vehicles = pd.merge(vehicles.to_frame(), choices, left_index=True, right_index=True) vehicles = pd.concat([vehicles.to_frame(), choices], axis=1) - pipeline.replace_table("vehicles", vehicles) + whale.add_table("vehicles", vehicles) # - annotate tables if model_settings.get("annotate_households"): - annotate_vehicle_type_choice_households(model_settings, trace_label) + annotate_vehicle_type_choice_households(whale, model_settings, trace_label) if model_settings.get("annotate_persons"): - annotate_vehicle_type_choice_persons(model_settings, trace_label) + annotate_vehicle_type_choice_persons(whale, model_settings, trace_label) if model_settings.get("annotate_vehicles"): - annotate_vehicle_type_choice_vehicles(model_settings, trace_label) + annotate_vehicle_type_choice_vehicles(whale, model_settings, trace_label) tracing.print_summary( "vehicle_type_choice", vehicles.vehicle_type, value_counts=True diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index 97a80d301..ddd28d209 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -5,13 +5,15 @@ import numpy as np from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import config, expressions, simulate, tracing, workflow logger = logging.getLogger("activitysim") -@inject.step() -def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def work_from_home( + whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id +): """ This model predicts whether a person (worker) works from home. The output from this model is TRUE (if works from home) or FALSE (works away from home). 
@@ -38,19 +40,19 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) nest_spec = config.get_logit_model_settings(model_settings) @@ -77,7 +79,6 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): ) for iteration in range(iterations): - logger.info( "Running %s with %d persons iteration %d", trace_label, @@ -86,8 +87,10 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): ) # re-read spec to reset substitution - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) + model_spec = simulate.eval_coefficients( + whale, model_spec, coefficients_df, estimator + ) choices = simulate.simple_simulate( choosers=choosers, @@ -169,7 +172,7 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): persons.work_from_home == True, -1, persons[dest_choice_column_name] ) - pipeline.replace_table("persons", persons) + whale.add_table("persons", persons) tracing.print_summary("work_from_home", persons.work_from_home, value_counts=True) diff --git a/activitysim/abm/tables/accessibility.py b/activitysim/abm/tables/accessibility.py index a45cbac8c..e89a1bf1f 100644 --- a/activitysim/abm/tables/accessibility.py +++ b/activitysim/abm/tables/accessibility.py @@ -4,16 +4,14 @@ import pandas as pd +from activitysim.core import workflow from activitysim.core.input import read_input_table -from ...core.pipeline import Whale -from ...core.workflow import workflow_table - logger = logging.getLogger(__name__) -@workflow_table -def accessibility(whale: Whale): +@workflow.table +def accessibility(whale: workflow.Whale): """ If 'accessibility' is in input_tables list, then read it in, otherwise create skeleton table with same index as landuse. 
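The table providers below all move from the interim workflow_table decorator to workflow.table, each taking the whale as its only argument. A minimal sketch of that provider pattern (the table name is a placeholder; the explicit add_table call mirrors the land_use_taz provider later in this patch):

    from activitysim.core import workflow
    from activitysim.core.input import read_input_table

    @workflow.table
    def example_zone_table(whale: workflow.Whale):
        # read the raw input table via the whale's filesystem/settings
        df = read_input_table(whale, "example_zone_table")  # placeholder table name
        # register the dataframe in the pipeline state and hand it back
        whale.add_table("example_zone_table", df)
        return df
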
diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index 63a2e8685..d16eaee0f 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -8,9 +8,7 @@ import pandas.api.types as ptypes from sklearn.naive_bayes import CategoricalNB -from activitysim.core import config, inject, input, pipeline, util - -from ...core.workflow import workflow_cached_object, workflow_step, workflow_table +from activitysim.core import config, inject, input, util, workflow logger = logging.getLogger(__name__) @@ -84,13 +82,13 @@ def nearest_node(oz, zones_df): return matched_df.loc[_idx] -@workflow_cached_object -def disaggregate_suffixes(whale): +@workflow.cached_object +def disaggregate_suffixes(whale: workflow.Whale): return {"SUFFIX": None, "ROOTS": []} -@workflow_table -def maz_centroids(whale): +@workflow.table +def maz_centroids(whale: workflow.Whale): df = input.read_input_table(whale, "maz_centroids") if not df.index.is_monotonic_increasing: @@ -104,9 +102,8 @@ def maz_centroids(whale): return df -@workflow_table -def proto_disaggregate_accessibility(whale): - +@workflow.table +def proto_disaggregate_accessibility(whale: workflow.Whale): # Read existing accessibilities, but is not required to enable model compatibility df = input.read_input_table( whale, "proto_disaggregate_accessibility", required=False @@ -128,8 +125,8 @@ def proto_disaggregate_accessibility(whale): return df -@workflow_table -def disaggregate_accessibility(whale): +@workflow.table +def disaggregate_accessibility(whale: workflow.Whale): """ This step initializes pre-computed disaggregate accessibility and merges it onto the full synthetic population. Function adds merged all disaggregate accessibility tables to the pipeline but returns nothing. 
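Model steps follow the same move: inject.step becomes workflow.step and the whale is the explicit first parameter, with tables fetched and replaced through it rather than through orca. A compact sketch of that step-level shape (step, table, and column names are placeholders; only the decorator and accessors shown in the hunks above are assumed):

    from activitysim.core import tracing, workflow

    @workflow.step
    def example_step(whale: workflow.Whale, chunk_size, trace_hh_id):
        # fetch the current persons table from the pipeline state
        persons = whale.get_dataframe("persons")
        persons["example_flag"] = False  # placeholder model logic
        # write the updated table back so downstream steps and checkpoints see it
        whale.add_table("persons", persons)
        tracing.print_summary("example_step", persons.example_flag, value_counts=True)
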
diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index 53943ccca..f150aa5d1 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -6,17 +6,15 @@ import pandas as pd -from ...core import inject, mem, pipeline, tracing -from ...core.input import read_input_table -from ...core.pipeline import Whale -from ...core.workflow import workflow_table -from ..misc import override_hh_ids +from activitysim.abm.misc import override_hh_ids +from activitysim.core import inject, tracing, workflow +from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) -@workflow_table -def households(whale: Whale): +@workflow.table +def households(whale: workflow.Whale): households_sample_size = whale.settings.households_sample_size _override_hh_ids = override_hh_ids(whale) _trace_hh_id = whale.settings.trace_hh_id @@ -30,7 +28,6 @@ def households(whale: Whale): # only using households listed in override_hh_ids if _override_hh_ids is not None: - # trace_hh_id will not used if it is not in list of override_hh_ids logger.info( "override household list containing %s households" % len(_override_hh_ids) @@ -50,14 +47,12 @@ def households(whale: Whale): # if we are tracing hh exclusively elif _trace_hh_id and households_sample_size == 1: - # df contains only trace_hh (or empty if not in full store) df = tracing.slice_ids(df_full, _trace_hh_id) households_sliced = True # if we need a subset of full store elif tot_households > households_sample_size > 0: - logger.info( "sampling %s of %s households" % (households_sample_size, tot_households) ) diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py index 9865803b8..ae717b141 100644 --- a/activitysim/abm/tables/landuse.py +++ b/activitysim/abm/tables/landuse.py @@ -3,16 +3,14 @@ import io import logging -from activitysim.core import config, inject +from activitysim.core import inject, workflow from activitysim.core.input import read_input_table -from ...core.workflow import workflow_table - logger = logging.getLogger(__name__) -@workflow_table -def land_use(whale): +@workflow.table +def land_use(whale: workflow.Whale): df = read_input_table(whale, "land_use") sharrow_enabled = whale.settings.sharrow @@ -40,9 +38,8 @@ def land_use(whale): inject.broadcast("land_use", "households", cast_index=True, onto_on="home_zone_id") -@workflow_table -def land_use_taz(whale): - +@workflow.table +def land_use_taz(whale: workflow.Whale): df = read_input_table(whale, "land_use_taz") if not df.index.is_monotonic_increasing: @@ -54,6 +51,6 @@ def land_use_taz(whale): logger.debug("land_use_taz.info:\n" + buffer.getvalue()) # replace table function with dataframe - whale.tableset.store_data("land_use_taz", df) + whale.add_table("land_use_taz", df) return df diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index 0652efcac..859b4118a 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -5,9 +5,8 @@ import pandas as pd -from ...core import inject, pipeline, tracing -from ...core.input import read_input_table -from ...core.workflow import workflow_table +from activitysim.core import inject, tracing, workflow +from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -23,8 +22,8 @@ def read_raw_persons(whale, households): return df -@workflow_table -def persons(whale): +@workflow.table +def persons(whale: workflow.Whale): households = 
whale.get_dataframe("households") trace_hh_id = whale.settings.trace_hh_id df = read_raw_persons(whale, households) @@ -93,8 +92,8 @@ def persons(whale): # return inject.merge_tables(persons.name, tables=tables) -@workflow_table -def persons_merged(whale): +@workflow.table +def persons_merged(whale: workflow.Whale): land_use = whale.get_dataframe("land_use") households = whale.get_dataframe("households") @@ -102,34 +101,37 @@ def persons_merged(whale): persons = whale.get_dataframe("persons") disaggregate_accessibility = whale.get_dataframe("disaggregate_accessibility") - households = pd.merge( + def join(left, right, left_on): + intersection = set(left.columns).intersection(right.columns) + intersection.discard(left_on) # intersection is ok if it's the join key + right = right.drop(intersection, axis=1) + return pd.merge( + left, + right, + left_on=left_on, + right_index=True, + ) + + households = join( households, land_use, left_on="home_zone_id", - right_index=True, - suffixes=("_households", "_land_use"), ) - households = pd.merge( + households = join( households, accessibility, left_on="home_zone_id", - right_index=True, - suffixes=("_households", "_accessibility"), ) - persons = pd.merge( + persons = join( persons, households, left_on="household_id", - right_index=True, - suffixes=("_persons", "_households"), ) if not disaggregate_accessibility.empty: - persons = pd.merge( + persons = join( persons, disaggregate_accessibility, left_on="person_id", - right_index=True, - suffixes=("_persons", "_disaggregate_accessibility"), ) return persons diff --git a/activitysim/abm/tables/shadow_pricing.py b/activitysim/abm/tables/shadow_pricing.py index c9dbb4335..1316e4cca 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -9,12 +9,10 @@ import numpy as np import pandas as pd -from ...abm.tables.size_terms import tour_destination_size_terms -from ...core import config, inject, logit, tracing, util -from ...core.input import read_input_table -from ...core.pipeline import Whale -from ...core.workflow import workflow_step -from .size_terms import size_terms as get_size_terms +from activitysim.abm.tables.size_terms import size_terms as get_size_terms +from activitysim.abm.tables.size_terms import tour_destination_size_terms +from activitysim.core import logit, tracing, util, workflow +from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -84,7 +82,7 @@ def size_table_name(model_selector): class ShadowPriceCalculator(object): def __init__( self, - whale: Whale, + whale: workflow.Whale, model_settings, num_processes, shared_data=None, @@ -167,9 +165,7 @@ def __init__( self.use_shadow_pricing = False # - destination_size_table (desired_size) - self.desired_size = inject.get_table( - size_table_name(self.model_selector) - ).to_frame() + self.desired_size = whale.get_dataframe(size_table_name(self.model_selector)) self.desired_size = self.desired_size.sort_index() assert ( @@ -241,10 +237,9 @@ def __init__( self.use_shadow_pricing and self.shadow_settings["SHADOW_PRICE_METHOD"] == "simulation" ): - assert self.model_selector in ["workplace", "school"] self.target = {} - land_use = inject.get_table("land_use").to_frame() + land_use = whale.get_dataframe("land_use") if self.model_selector == "workplace": employment_targets = self.shadow_settings[ @@ -481,7 +476,6 @@ def set_choices(self, choices, segment_ids): modeled_size = pd.DataFrame(index=self.desired_size.index) for seg_name in self.desired_size: - 
segment_choices = choices[(segment_ids == self.segment_ids[seg_name])] modeled_size[seg_name] = segment_choices.value_counts() @@ -552,7 +546,6 @@ def check_fit(self, iteration): self.choices_by_iteration[iteration] = self.choices_synced if self.shadow_settings["SHADOW_PRICE_METHOD"] != "simulation": - modeled_size = self.modeled_size desired_size = self.desired_size @@ -648,7 +641,7 @@ def check_fit(self, iteration): return converged - def update_shadow_prices(self): + def update_shadow_prices(self, whale): """ Adjust shadow_prices based on relative values of modeled_size and desired_size. @@ -773,7 +766,7 @@ def update_shadow_prices(self): """ percent_tolerance = self.shadow_settings["PERCENT_TOLERANCE"] sampled_persons = pd.DataFrame() - persons_merged = inject.get_table("persons_merged").to_frame() + persons_merged = whale.get_dataframe("persons_merged") # need to join the segment to the choices to sample correct persons segment_to_name_dict = self.shadow_settings.get( @@ -845,7 +838,7 @@ def update_shadow_prices(self): index=choices.index, ) # using ActivitySim's RNG to make choices for repeatability - current_sample, rands = logit.make_choices(probs) + current_sample, rands = logit.make_choices(whale, probs) current_sample = current_sample[current_sample == 1] if len(sampled_persons) == 0: @@ -859,14 +852,12 @@ def update_shadow_prices(self): raise RuntimeError("unknown SHADOW_PRICE_METHOD %s" % shadow_price_method) def dest_size_terms(self, segment): - assert segment in self.segment_ids size_term_adjustment = 1 utility_adjustment = 0 if self.use_shadow_pricing: - shadow_price_method = self.shadow_settings["SHADOW_PRICE_METHOD"] if shadow_price_method == "ctramp": @@ -978,7 +969,6 @@ def buffers_for_shadow_pricing(shadow_pricing_info): data_buffers = {} for block_key, block_shape in block_shapes.items(): - # buffer_size must be int, not np.int64 buffer_size = util.iprod(block_shape) @@ -1004,7 +994,7 @@ def buffers_for_shadow_pricing(shadow_pricing_info): return data_buffers -def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): +def buffers_for_shadow_pricing_choice(whale, shadow_pricing_choice_info): """ Same as above buffers_for_shadow_price function except now we need to store the actual choices for the simulation based shadow pricing method @@ -1028,7 +1018,6 @@ def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): data_buffers = {} for block_key, block_shape in block_shapes.items(): - # buffer_size must be int, not np.int64 buffer_size = util.iprod(block_shape) @@ -1051,7 +1040,7 @@ def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): data_buffers[block_key + "_choice"] = shared_data_buffer - persons = read_input_table("persons") + persons = read_input_table(whale, "persons") sp_choice_df = persons.reset_index()["person_id"].to_frame() # declare a shared Array with data from sp_choice_df @@ -1164,7 +1153,7 @@ def shadow_price_data_from_buffers(data_buffers, shadow_pricing_info, model_sele return np.frombuffer(data.get_obj(), dtype=dtype).reshape(shape), data.get_lock() -def load_shadow_price_calculator(model_settings): +def load_shadow_price_calculator(whale, model_settings): """ Initialize ShadowPriceCalculator for model_selector (e.g. 
school or workplace) @@ -1180,20 +1169,20 @@ def load_shadow_price_calculator(model_settings): spc : ShadowPriceCalculator """ - num_processes = inject.get_injectable("num_processes", 1) + num_processes = whale.get_injectable("num_processes", 1) model_selector = model_settings["MODEL_SELECTOR"] # - get shared_data from data_buffers (if multiprocessing) - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = whale.get_injectable("data_buffers", None) if data_buffers is not None: logger.info("Using existing data_buffers for shadow_price") # - shadow_pricing_info - shadow_pricing_info = inject.get_injectable("shadow_pricing_info", None) + shadow_pricing_info = whale.get_injectable("shadow_pricing_info", None) assert shadow_pricing_info is not None - shadow_pricing_choice_info = inject.get_injectable( + shadow_pricing_choice_info = whale.get_injectable( "shadow_pricing_choice_info", None ) assert shadow_pricing_choice_info is not None @@ -1220,6 +1209,7 @@ def load_shadow_price_calculator(model_settings): # - ShadowPriceCalculator spc = ShadowPriceCalculator( + whale, model_settings, num_processes, data, @@ -1233,8 +1223,8 @@ def load_shadow_price_calculator(model_settings): # first define add_size_tables as an orca step with no scale argument at all. -@workflow_step -def add_size_tables(whale, disaggregate_suffixes): +@workflow.step +def add_size_tables(whale: workflow.Whale, disaggregate_suffixes): return _add_size_tables(whale, disaggregate_suffixes) @@ -1297,7 +1287,6 @@ def _add_size_tables(whale, disaggregate_suffixes, scale=True): # since these are scaled to model size, they have to be created while single-process for model_selector, model_name in shadow_pricing_models.items(): - model_settings = whale.filesystem.read_model_settings(model_name) if suffix is not None and roots: @@ -1331,7 +1320,6 @@ def _add_size_tables(whale, disaggregate_suffixes, scale=True): scale_size_table = scale and scale_size_table if (use_shadow_pricing and full_model_run) and scale_size_table: - # need to scale destination size terms because ctramp and daysim approaches directly # compare modeled size and target size when computing shadow prices # Does not apply to simulation approach which compares proportions. 
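Throughout shadow_pricing.py the orca-style global accessors are swapped for methods on the Whale that is now passed in. A short side-by-side sketch of the substitution used in the hunks above (the wrapper function is hypothetical; the two whale calls are taken verbatim from this patch):

# old pattern (removed in this patch):
#   land_use = inject.get_table("land_use").to_frame()
#   num_processes = inject.get_injectable("num_processes", 1)
def example_access(whale):
    # new pattern: all table and injectable access goes through the whale
    land_use = whale.get_dataframe("land_use")
    num_processes = whale.get_injectable("num_processes", 1)
    return land_use, num_processes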
@@ -1406,8 +1394,8 @@ def get_shadow_pricing_info(whale): block_shapes: dict {: } """ - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + land_use = whale.get_dataframe("land_use") + size_terms = whale.get_injectable("size_terms") shadow_settings = whale.filesystem.read_model_settings("shadow_pricing.yaml") @@ -1416,7 +1404,6 @@ def get_shadow_pricing_info(whale): blocks = OrderedDict() for model_selector in shadow_pricing_models: - sp_rows = len(land_use) sp_cols = len(size_terms[size_terms.model_selector == model_selector]) @@ -1450,7 +1437,7 @@ def get_shadow_pricing_choice_info(whale): block_shapes: dict {: } """ - persons = read_input_table("persons") + persons = read_input_table(whale, "persons") shadow_settings = whale.filesystem.read_model_settings("shadow_pricing.yaml") @@ -1459,7 +1446,6 @@ def get_shadow_pricing_choice_info(whale): blocks = OrderedDict() for model_selector in shadow_pricing_models: - # each person will have a work or school location choice sp_rows = len(persons) @@ -1482,21 +1468,19 @@ def get_shadow_pricing_choice_info(whale): return shadow_pricing_choice_info -@inject.injectable(cache=True) -def shadow_pricing_info(): - +@workflow.cached_object +def shadow_pricing_info(whale: workflow.Whale): # when multiprocessing with shared data mp_tasks has to call network_los methods # get_shadow_pricing_info() and buffers_for_shadow_pricing() logger.debug("loading shadow_pricing_info injectable") - return get_shadow_pricing_info() - + return get_shadow_pricing_info(whale) -@inject.injectable(cache=True) -def shadow_pricing_choice_info(): +@workflow.cached_object +def shadow_pricing_choice_info(whale: workflow.Whale): # when multiprocessing with shared data mp_tasks has to call network_los methods # get_shadow_pricing_info() and buffers_for_shadow_pricing() logger.debug("loading shadow_pricing_choice_info injectable") - return get_shadow_pricing_choice_info() + return get_shadow_pricing_choice_info(whale) diff --git a/activitysim/abm/tables/size_terms.py b/activitysim/abm/tables/size_terms.py index d6710afe9..82d819a62 100644 --- a/activitysim/abm/tables/size_terms.py +++ b/activitysim/abm/tables/size_terms.py @@ -5,13 +5,13 @@ import numpy as np import pandas as pd -from ...core.workflow import workflow_cached_object +from activitysim.core import workflow logger = logging.getLogger(__name__) -@workflow_cached_object -def size_terms(whale): +@workflow.cached_object +def size_terms(whale: workflow.Whale): f = whale.filesystem.get_config_file_path("destination_choice_size_terms.csv") return pd.read_csv(f, comment="#", index_col="segment") diff --git a/activitysim/abm/tables/skims.py b/activitysim/abm/tables/skims.py index 85cf719b1..07022ee11 100644 --- a/activitysim/abm/tables/skims.py +++ b/activitysim/abm/tables/skims.py @@ -3,11 +3,7 @@ import logging -from activitysim.core import config, inject, los -from activitysim.core.pathbuilder import TransitVirtualPathBuilder - -from ...core.pipeline import Whale -from ...core.workflow import workflow_cached_object +from activitysim.core import los, workflow logger = logging.getLogger(__name__) @@ -16,33 +12,31 @@ """ -@workflow_cached_object -def network_los_preload(whale) -> los.Network_LOS: - +@workflow.cached_object +def network_los_preload(whale: workflow.Whale) -> los.Network_LOS: # when multiprocessing with shared data mp_tasks has to call network_los methods # allocate_shared_skim_buffers() and load_shared_data() BEFORE network_los.load_data() logger.debug("loading 
network_los_without_data_loaded injectable") - nw_los = los.Network_LOS() - + nw_los = los.Network_LOS(whale) return nw_los -@workflow_cached_object -def network_los(whale, network_los_preload: los.Network_LOS) -> los.Network_LOS: - +@workflow.cached_object +def network_los( + whale: workflow.Whale, network_los_preload: los.Network_LOS +) -> los.Network_LOS: logger.debug("loading network_los injectable") network_los_preload.load_data() return network_los_preload -@workflow_cached_object -def skim_dict(whale, network_los): +@workflow.cached_object +def skim_dict(whale: workflow.Whale, network_los): return network_los.get_default_skim_dict() -@workflow_cached_object -def log_settings(whale): - +@workflow.cached_object +def log_settings(whale: workflow.Whale): # abm settings to log on startup return [ "households_sample_size", diff --git a/activitysim/abm/tables/time_windows.py b/activitysim/abm/tables/time_windows.py index 36c95e927..c1356ce07 100644 --- a/activitysim/abm/tables/time_windows.py +++ b/activitysim/abm/tables/time_windows.py @@ -1,21 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. import logging -import os import numpy as np import pandas as pd -from ...core import config, inject -from ...core import timetable as tt -from ...core.pipeline import Whale -from ...core.workflow import workflow_cached_object, workflow_table +from activitysim.core import timetable as tt +from activitysim.core import workflow logger = logging.getLogger(__name__) -@workflow_cached_object -def tdd_alts(whale) -> pd.DataFrame: +@workflow.cached_object +def tdd_alts(whale: workflow.Whale) -> pd.DataFrame: # right now this file just contains the start and end hour file_path = whale.filesystem.get_config_file_path( "tour_departure_and_duration_alternatives.csv" @@ -30,8 +27,8 @@ def tdd_alts(whale) -> pd.DataFrame: return df -@workflow_cached_object -def tdd_alt_segments(whale: Whale) -> pd.DataFrame: +@workflow.cached_object +def tdd_alt_segments(whale: workflow.Whale) -> pd.DataFrame: # tour_purpose,time_period,start,end # work,EA,3,5 # work,AM,6,8 @@ -56,9 +53,9 @@ def tdd_alt_segments(whale: Whale) -> pd.DataFrame: return df -@workflow_table +@workflow.table def person_windows( - whale: Whale, + whale: workflow.Whale, persons: pd.DataFrame, tdd_alts: pd.DataFrame, ) -> pd.DataFrame: @@ -67,7 +64,9 @@ def person_windows( return df -@inject.injectable() -def timetable(person_windows, tdd_alts): +@workflow.cached_object +def timetable( + whale: workflow.Whale, person_windows: pd.DataFrame, tdd_alts: pd.DataFrame +) -> tt.TimeTable: logging.debug("@inject timetable") - return tt.TimeTable(person_windows.to_frame(), tdd_alts, person_windows.name) + return tt.TimeTable(person_windows, tdd_alts, person_windows.name) diff --git a/activitysim/abm/tables/vehicles.py b/activitysim/abm/tables/vehicles.py index fdc886a25..498ce3faf 100644 --- a/activitysim/abm/tables/vehicles.py +++ b/activitysim/abm/tables/vehicles.py @@ -2,13 +2,15 @@ # See full license in LICENSE.txt. 
import logging -from activitysim.core import inject, pipeline, tracing +import pandas as pd + +from activitysim.core import inject, tracing, workflow logger = logging.getLogger(__name__) -@inject.table() -def vehicles(households): +@workflow.table +def vehicles(whale: workflow.Whale, households): """Creates the vehicles table and load it as an injectable This method initializes the `vehicles` table, where the number of rows @@ -37,14 +39,16 @@ def vehicles(households): # replace table function with dataframe inject.add_table("vehicles", vehicles) - pipeline.get_rn_generator().add_channel("vehicles", vehicles) + whale.get_rn_generator().add_channel("vehicles", vehicles) tracing.register_traceable_table("vehicles", vehicles) return vehicles -@inject.table() -def vehicles_merged(vehicles, households_merged): +@workflow.table +def vehicles_merged( + whale: workflow.Whale, vehicles: pd.DataFrame, households_merged: pd.DataFrame +): """Augments the vehicles table with household attributes Parameters diff --git a/activitysim/abm/test/test_misc/test_trip_departure_choice.py b/activitysim/abm/test/test_misc/test_trip_departure_choice.py index f36cf6df2..d4ec4631f 100644 --- a/activitysim/abm/test/test_misc/test_trip_departure_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_departure_choice.py @@ -141,9 +141,11 @@ def test_generate_alternative(trips): ) -def test_apply_stage_two_model(model_spec, trips): +def test_apply_stage_two_model(whale, model_spec, trips): setup_dirs() - departures = tdc.apply_stage_two_model(model_spec, trips, 0, "TEST Trip Departure") + departures = tdc.apply_stage_two_model( + whale, model_spec, trips, 0, "TEST Trip Departure" + ) assert len(departures) == len(trips) pd.testing.assert_index_equal(departures.index, trips.index) diff --git a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py index a3f6ffdd9..2edbb66d9 100644 --- a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py @@ -4,7 +4,7 @@ from activitysim.abm.models import trip_scheduling_choice as tsc from activitysim.abm.tables.skims import skim_dict -from activitysim.core import los +from activitysim.core import los, workflow from .setup_utils import setup_dirs @@ -167,13 +167,15 @@ def test_two_way_stop_patterns(tours): assert set(output_columns).issubset(windows.columns) -def test_run_trip_scheduling_choice(model_spec, tours, skims, locals_dict): +def test_run_trip_scheduling_choice( + whale: workflow.Whale, model_spec, tours, skims, locals_dict +): """ Test run the model. """ out_tours = tsc.run_trip_scheduling_choice( - model_spec, tours, skims, locals_dict, 2, None, "PyTest Trip Scheduling" + whale, model_spec, tours, skims, locals_dict, 2, None, "PyTest Trip Scheduling" ) assert len(tours) == len(out_tours) diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index f3c1cf812..0ae04807b 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -8,11 +8,10 @@ import pandas as pd import yaml -from ..cli.create import get_example -from ..cli.run import INJECTABLES, config, pipeline -from ..core import inject, tracing -from ..core.pipeline import Whale -from . 
import workspace +from activitysim.benchmarking import workspace +from activitysim.cli.create import get_example +from activitysim.cli.run import INJECTABLES, config +from activitysim.core import inject, tracing logger = logging.getLogger(__name__) diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index 89520ecdc..f8d601c92 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -9,10 +9,8 @@ import numpy as np -from activitysim.core import chunk, config, inject, mem, pipeline, tracing - -from ..core.configuration import FileSystem, Settings -from ..core.pipeline import Whale +from activitysim.core import chunk, config, inject, mem, tracing, workflow +from activitysim.core.configuration import FileSystem, Settings logger = logging.getLogger(__name__) @@ -108,7 +106,7 @@ def add_run_args(parser, multiprocess=True): ) -def validate_injectable(whale: Whale, name, make_if_missing=False): +def validate_injectable(whale: workflow.Whale, name, make_if_missing=False): try: dir_paths = whale.context.get_formatted(name) # dir_paths = inject.get_injectable(name) @@ -133,7 +131,7 @@ def validate_injectable(whale: Whale, name, make_if_missing=False): return dir_paths -def handle_standard_args(whale: Whale, args, multiprocess=True): +def handle_standard_args(whale: workflow.Whale, args, multiprocess=True): def inject_arg(name, value): assert name in INJECTABLES whale.context[name] = value @@ -252,7 +250,7 @@ def inject_arg(name, value): return whale -def cleanup_output_files(whale: Whale): +def cleanup_output_files(whale: workflow.Whale): tracing.delete_trace_files(whale) csv_ignore = [] @@ -282,7 +280,7 @@ def run(args): int: sys.exit exit code """ - whale = pipeline.Whale() + whale = workflow.Whale() # register abm steps and other abm-specific injectables # by default, assume we are running activitysim.abm @@ -396,7 +394,7 @@ def run(args): from activitysim.core import mp_tasks injectables = {k: inject.get_injectable(k) for k in INJECTABLES} - mp_tasks.run_multiprocess(injectables) + mp_tasks.run_multiprocess(whale, injectables) assert not whale.is_open diff --git a/activitysim/core/assign.py b/activitysim/core/assign.py index 5ab604e61..03f4a6c83 100644 --- a/activitysim/core/assign.py +++ b/activitysim/core/assign.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, pipeline, util +from activitysim.core import chunk, util, workflow logger = logging.getLogger(__name__) @@ -278,7 +278,6 @@ def to_series(x): n_randoms += 1 assignment_expressions.loc[expression_idx, "expression"] = expression if n_randoms: - from activitysim.core import pipeline try: random_draws = whale.get_rn_generator().normal_for_df( @@ -345,7 +344,6 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): continue try: - # FIXME - log any numpy warnings/errors but don't raise np_logger.target = str(target) np_logger.expression = str(expression) @@ -398,7 +396,6 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): _locals_dict[target] = expr_values if trace_results is not None: - trace_results = pd.DataFrame.from_dict(trace_results) trace_results.index = df[trace_rows].index @@ -409,11 +406,11 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): assert variables, "No non-temp variables were assigned." 
if chunk_log: - chunk.log_df(trace_label, "temps", temps) - chunk.log_df(trace_label, "variables", variables) + chunk_log.log_df(trace_label, "temps", temps) + chunk_log.log_df(trace_label, "variables", variables) # these are going away - let caller log result df - chunk.log_df(trace_label, "temps", None) - chunk.log_df(trace_label, "variables", None) + chunk_log.log_df(trace_label, "temps", None) + chunk_log.log_df(trace_label, "variables", None) # we stored result in dict - convert to df variables = util.df_from_dict(variables, index=df.index) diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 788aa41ba..c96d76ca4 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -1146,16 +1146,18 @@ def chunk_log_skip(): None -def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None): +def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_tag=None): # generator to iterate over choosers - if chunk_training_mode() == MODE_CHUNKLESS: + if whale.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, trace_label + yield 0, choosers, trace_label, ChunkSizer( + "chunkless", trace_label, 0, 0, whale.settings.chunk_training_mode + ) return chunk_tag = chunk_tag or trace_label @@ -1190,7 +1192,7 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) f"with {len(chooser_chunk)} of {num_choosers} choosers" ) - yield i, chooser_chunk, chunk_trace_label + yield i, chooser_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk diff --git a/activitysim/core/config.py b/activitysim/core/config.py index 1557c2457..97c924f32 100644 --- a/activitysim/core/config.py +++ b/activitysim/core/config.py @@ -10,10 +10,9 @@ import yaml -from ..core import inject, util -from .exceptions import SettingsFileNotFoundError -from .pipeline import Whale -from .workflow.util import get_formatted_or_default +from activitysim.core import inject, util, workflow +from activitysim.core.exceptions import SettingsFileNotFoundError +from activitysim.core.workflow.util import get_formatted_or_default logger = logging.getLogger(__name__) @@ -22,73 +21,73 @@ """ -@inject.injectable(cache=True) -def locutor(): +@workflow.cached_object +def locutor(whale: workflow.Whale): # when multiprocessing, sometimes you only want one process to write trace files # mp_tasks overrides this definition to designate a single sub-process as locutor return True -@inject.injectable(cache=True) -def configs_dir(): - if not os.path.exists("configs"): - raise RuntimeError("'configs' directory does not exist") - return "configs" - - -@inject.injectable(cache=True) -def data_dir(): - if not os.path.exists("data"): - raise RuntimeError("'data' directory does not exist") - return "data" - - -@inject.injectable(cache=True) -def output_dir(): - if not os.path.exists("output"): - print( - f"'output' directory does not exist - current working directory: {os.getcwd()}" - ) - raise RuntimeError("'output' directory does not exist") - return "output" - - -@inject.injectable() -def output_file_prefix(): - return "" - - -@inject.injectable(cache=True) -def pipeline_file_name(settings): - - pipeline_file_name = settings.get("pipeline_file_name", "pipeline.h5") - - return pipeline_file_name +# @inject.injectable(cache=True) +# def 
configs_dir(): +# if not os.path.exists("configs"): +# raise RuntimeError("'configs' directory does not exist") +# return "configs" +# +# +# @inject.injectable(cache=True) +# def data_dir(): +# if not os.path.exists("data"): +# raise RuntimeError("'data' directory does not exist") +# return "data" +# +# +# @inject.injectable(cache=True) +# def output_dir(): +# if not os.path.exists("output"): +# print( +# f"'output' directory does not exist - current working directory: {os.getcwd()}" +# ) +# raise RuntimeError("'output' directory does not exist") +# return "output" +# +# @inject.injectable() +# def output_file_prefix(): +# return "" -@inject.injectable() -def rng_base_seed(): - return setting("rng_base_seed", 0) +# +# @inject.injectable(cache=True) +# def pipeline_file_name(settings): +# +# pipeline_file_name = settings.get("pipeline_file_name", "pipeline.h5") +# +# return pipeline_file_name -@inject.injectable(cache=True) -def settings_file_name(): - return "settings.yaml" +# @inject.injectable() +# def rng_base_seed(): +# return setting("rng_base_seed", 0) -@inject.injectable(cache=True) -def settings(settings_file_name): - settings_dict = read_settings_file(settings_file_name, mandatory=True) +# @inject.injectable(cache=True) +# def settings_file_name(): +# return "settings.yaml" - # basic settings validation for sharrow - sharrow_enabled = settings_dict.get("sharrow", False) - recode_pipeline_columns = settings_dict.get("recode_pipeline_columns", True) - if sharrow_enabled and not recode_pipeline_columns: - warnings.warn( - "use of `sharrow` setting generally requires `recode_pipeline_columns`" - ) - return settings_dict +# @inject.injectable(cache=True) +# def settings(settings_file_name): +# settings_dict = read_settings_file(settings_file_name, mandatory=True) +# +# # basic settings validation for sharrow +# sharrow_enabled = settings_dict.get("sharrow", False) +# recode_pipeline_columns = settings_dict.get("recode_pipeline_columns", True) +# if sharrow_enabled and not recode_pipeline_columns: +# warnings.warn( +# "use of `sharrow` setting generally requires `recode_pipeline_columns`" +# ) +# +# return settings_dict # def testing(): @@ -97,46 +96,47 @@ def settings(settings_file_name): # return "PYTEST_CURRENT_TEST" in os.environ -def get_cache_dir(): - """ - return path of cache directory in output_dir (creating it, if need be) - - cache directory is used to store - skim memmaps created by skim+dict_factories - tvpb tap_tap table cache - - Returns - ------- - str path - """ - cache_dir = setting("cache_dir", default=None) - if cache_dir is None: - cache_dir = setting( - "cache_dir", os.path.join(inject.get_injectable("output_dir"), "cache") - ) - - if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) - assert os.path.isdir(cache_dir) - - # create a git-ignore in the cache dir if it does not exist. 
- # this helps prevent accidentally committing cache contents to git - gitignore = os.path.join(cache_dir, ".gitignore") - if not os.path.exists(gitignore): - with open(gitignore, "wt") as f: - f.write("/*") - - return cache_dir - - -def setting(key, default=None): - return inject.get_injectable("settings").get(key, default) - - -def override_setting(key, value): - new_settings = inject.get_injectable("settings") - new_settings[key] = value - inject.add_injectable("settings", new_settings) +# def get_cache_dir(): +# """ +# return path of cache directory in output_dir (creating it, if need be) +# +# cache directory is used to store +# skim memmaps created by skim+dict_factories +# tvpb tap_tap table cache +# +# Returns +# ------- +# str path +# """ +# cache_dir = setting("cache_dir", default=None) +# if cache_dir is None: +# cache_dir = setting( +# "cache_dir", os.path.join(inject.get_injectable("output_dir"), "cache") +# ) +# +# if not os.path.isdir(cache_dir): +# os.mkdir(cache_dir) +# assert os.path.isdir(cache_dir) +# +# # create a git-ignore in the cache dir if it does not exist. +# # this helps prevent accidentally committing cache contents to git +# gitignore = os.path.join(cache_dir, ".gitignore") +# if not os.path.exists(gitignore): +# with open(gitignore, "wt") as f: +# f.write("/*") +# +# return cache_dir +# +# +# def setting(key, default=None): +# return inject.get_injectable("settings").get(key, default) +# +# +# def override_setting(key, value): +# new_settings = inject.get_injectable("settings") +# new_settings[key] = value +# inject.add_injectable("settings", new_settings) +# def get_global_constants(): @@ -293,7 +293,7 @@ def data_file_path(file_name, mandatory=True, allow_glob=False): ) -def expand_input_file_list(input_files): +def expand_input_file_list(input_files, whale=None): """ expand list by unglobbing globs globs """ @@ -307,7 +307,12 @@ def expand_input_file_list(input_files): for file_name in input_files: - file_name = data_file_path(file_name, allow_glob=True) + if whale is None: + file_name = data_file_path(file_name, allow_glob=True) + else: + file_name = str( + whale.filesystem.get_data_file_path(file_name, allow_glob=True) + ) if os.path.isfile(file_name): expanded_files.append(file_name) @@ -390,7 +395,7 @@ def trace_file_path(file_name): return file_path -def log_file_path(file_name, prefix=True, whale: Whale = None): +def log_file_path(file_name, prefix=True, whale: workflow.Whale = None): if whale is not None: output_dir = whale.filesystem.get_output_dir() diff --git a/activitysim/core/configuration/filesystem.py b/activitysim/core/configuration/filesystem.py index 4ba64be87..2468fcae6 100644 --- a/activitysim/core/configuration/filesystem.py +++ b/activitysim/core/configuration/filesystem.py @@ -273,6 +273,58 @@ def _cascading_input_file_path( return Path(file_path) if file_path else None + def expand_input_file_list(self, input_files) -> list[Path]: + """ + expand list by unglobbing globs globs + """ + + # be nice and accept a string as well as a list of strings + if isinstance(input_files, (str, Path)): + input_files = [Path(input_files)] + else: + input_files = [Path(i) for i in input_files] + + expanded_files = [] + ungroked_files = 0 + + for file_name in input_files: + + file_name = self.get_data_file_path(file_name, allow_glob=True) + + if file_name.is_file(): + expanded_files.append(file_name) + continue + + if file_name.is_dir(): + logger.warning( + "WARNING: _expand_input_file_list skipping directory: " + f"(use glob instead): {file_name}", + 
) + ungroked_files += 1 + continue + + # - not an exact match, could be a glob pattern + logger.debug(f"expand_input_file_list trying {file_name} as glob") + globbed_files = glob.glob(str(file_name)) + for globbed_file in globbed_files: + if os.path.isfile(globbed_file) or os.path.islink(globbed_file): + expanded_files.append(Path(globbed_file)) + else: + logger.warning( + "WARNING: expand_input_file_list skipping: " + f"(does not grok) {file_name}" + ) + ungroked_files += 1 + + if len(globbed_files) == 0: + logger.warning( + f"WARNING: expand_input_file_list file/glob not found: {file_name}", + ) + + assert ungroked_files == 0, f"{ungroked_files} ungroked file names" + + return sorted(expanded_files) + def get_configs_dir(self) -> tuple[Path]: """ Get the configs directories. diff --git a/activitysim/core/exceptions.py b/activitysim/core/exceptions.py index c25e1547a..08ab53b5b 100644 --- a/activitysim/core/exceptions.py +++ b/activitysim/core/exceptions.py @@ -2,7 +2,7 @@ class PipelineError(ValueError): """General class for errors in using a Pipeline.""" -class PipelineAccessError(PipelineError): +class WhaleAccessError(PipelineError): """Error trying to access a pipeline feature that is not yet initialized.""" diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index a72361740..9055f96c1 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -2,7 +2,7 @@ # See full license in LICENSE.txt. import logging -from . import assign, config, simulate, tracing +from . import assign, config, simulate, tracing, workflow from .util import assign_in_place, parse_suffix_args, suffix_expressions_df_str logger = logging.getLogger(__name__) @@ -122,7 +122,9 @@ def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None) return results -def assign_columns(whale, df, model_settings, locals_dict={}, trace_label=None): +def assign_columns( + whale: workflow.Whale, df, model_settings, locals_dict={}, trace_label=None +): """ Evaluate expressions in context of df and assign resulting target columns to df diff --git a/activitysim/core/flow.py b/activitysim/core/flow.py index 88189ae76..df7cbf6b4 100644 --- a/activitysim/core/flow.py +++ b/activitysim/core/flow.py @@ -11,11 +11,10 @@ import numpy as np import pandas as pd -from .. import __version__ -from ..core import tracing -from . import config, inject -from .simulate_consts import SPEC_EXPRESSION_NAME, SPEC_LABEL_NAME -from .timetable import ( +from activitysim import __version__ +from activitysim.core import tracing, workflow +from activitysim.core.simulate_consts import SPEC_EXPRESSION_NAME, SPEC_LABEL_NAME +from activitysim.core.timetable import ( sharrow_tt_adjacent_window_after, sharrow_tt_adjacent_window_before, sharrow_tt_max_time_block_available, @@ -133,7 +132,13 @@ def only_simple(x, exclude_keys=()): def get_flow( - spec, local_d, trace_label=None, choosers=None, interacts=None, zone_layer=None + whale, + spec, + local_d, + trace_label=None, + choosers=None, + interacts=None, + zone_layer=None, ): extra_vars = only_simple(local_d) orig_col_name = local_d.get("orig_col_name", None) @@ -161,6 +166,7 @@ def get_flow( else: aux_vars = {} flow = new_flow( + whale, spec, extra_vars, orig_col_name, @@ -208,7 +214,7 @@ def should_invalidate_cache_file(cache_filename, *source_filenames): return False -def scan_for_unused_names(tokens): +def scan_for_unused_names(whale: workflow.Whale, tokens): """ Scan all spec files to find unused skim variable names. 
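The new FileSystem.expand_input_file_list shown above accepts literal paths and glob patterns, warns about directories and unmatched globs, and returns a sorted list of Path objects, while the legacy config.expand_input_file_list gains an optional whale argument for path resolution. A hedged usage sketch (the file names are hypothetical):

def example_expand(whale):
    # resolve a literal file plus a glob against the configured data directories
    files = whale.filesystem.expand_input_file_list(
        ["land_use.csv", "maz_to_maz_*.csv"]
    )
    return files  # sorted list of pathlib.Path objects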
@@ -220,11 +226,11 @@ def scan_for_unused_names(tokens): ------- Set[str] """ - configs_dir_list = inject.get_injectable("configs_dir") + configs_dir_list = whale.filesystem.get_configs_dir() configs_dir_list = ( [configs_dir_list] if isinstance(configs_dir_list, str) else configs_dir_list ) - assert isinstance(configs_dir_list, list) + assert isinstance(configs_dir_list, (list, tuple)) for directory in configs_dir_list: logger.debug(f"scanning for unused skims in {directory}") @@ -242,14 +248,15 @@ def scan_for_unused_names(tokens): return tokens -@inject.injectable(cache=True) -def skim_dataset_dict(skim_dataset): +@workflow.cached_object +def skim_dataset_dict(whale: workflow.Whale, skim_dataset): from .skim_dataset import SkimDataset return SkimDataset(skim_dataset) def skims_mapping( + whale: workflow.Whale, orig_col_name, dest_col_name, timeframe="tour", @@ -263,7 +270,7 @@ def skims_mapping( logger.info(f"- dest_col_name: {dest_col_name}") logger.info(f"- stop_col_name: {stop_col_name}") logger.info(f"- primary_origin_col_name: {primary_origin_col_name}") - skim_dataset = inject.get_injectable("skim_dataset") + skim_dataset = whale.get_injectable("skim_dataset") if zone_layer == "maz" or zone_layer is None: odim = "omaz" if "omaz" in skim_dataset.dims else "otaz" ddim = "dmaz" if "dmaz" in skim_dataset.dims else "dtaz" @@ -435,6 +442,7 @@ def skims_mapping( def new_flow( + whale: workflow.Whale, spec, extra_vars, orig_col_name, @@ -512,13 +520,10 @@ def new_flow( else: chooser_cols = list(choosers.columns) - cache_dir = os.path.join( - config.get_cache_dir(), - "__sharrowcache__", - ) - os.makedirs(cache_dir, exist_ok=True) + cache_dir = whale.filesystem.get_cache_dir("__sharrowcache__") logger.debug(f"flow.cache_dir: {cache_dir}") skims_mapping_ = skims_mapping( + whale, orig_col_name, dest_col_name, timeframe, @@ -719,6 +724,7 @@ def size_terms_on_flow(locals_d): def apply_flow( + whale, spec, choosers, locals_d=None, @@ -773,6 +779,7 @@ def apply_flow( with logtime("apply_flow"): try: flow = get_flow( + whale, spec, locals_d, trace_label, diff --git a/activitysim/core/input.py b/activitysim/core/input.py index ec9bd0ff7..ed7bdb454 100644 --- a/activitysim/core/input.py +++ b/activitysim/core/input.py @@ -3,12 +3,11 @@ import logging import os -import warnings import pandas as pd -from ..core import inject, util -from ..core.configuration import FileSystem, InputTable, Settings +from activitysim.core import util, workflow +from activitysim.core.configuration import InputTable logger = logging.getLogger(__name__) @@ -20,7 +19,7 @@ def canonical_table_index_name(table_name): return table_index_names and table_index_names.get(table_name, None) -def read_input_table(whale, tablename, required=True): +def read_input_table(whale: workflow.Whale, tablename, required=True): """Reads input table name and returns cleaned DataFrame. Uses settings found in input_table_list in global settings file diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 62cabd861..f10cb389c 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -5,8 +5,8 @@ import numpy as np import pandas as pd -from . 
import chunk, config, interaction_simulate, logit, pipeline, tracing -from .simulate import set_skim_wrapper_targets +from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow +from activitysim.core.simulate import set_skim_wrapper_targets logger = logging.getLogger(__name__) @@ -14,6 +14,7 @@ def make_sample_choices( + whale: workflow.Whale, choosers, probs, alternatives, @@ -22,6 +23,7 @@ def make_sample_choices( alt_col_name, allow_zero_probs, trace_label, + chunk_sizer, ): """ @@ -61,13 +63,13 @@ def make_sample_choices( choosers = choosers[~zero_probs] # get sample_size rands for each chooser - rands = pipeline.get_rn_generator().random_for_df(probs, n=sample_size) + rands = whale.get_rn_generator().random_for_df(probs, n=sample_size) # transform as we iterate over alternatives # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr # i.e rands[i] is a 2-D array of one alt choice rand for each chooser # rands = rands.T #.reshape(sample_size, -1, 1) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "rands", rands) # TODO: is `sample_choices_maker` more efficient? The order of samples changes, might change repro-randoms from .choosing import sample_choices_maker_preserve_ordering @@ -78,8 +80,8 @@ def make_sample_choices( alternatives.index.values, ) - chunk.log_df(trace_label, "choices_array", choices_array) - chunk.log_df(trace_label, "choice_probs_array", choice_probs_array) + chunk_sizer.log_df(trace_label, "choices_array", choices_array) + chunk_sizer.log_df(trace_label, "choice_probs_array", choice_probs_array) # explode to one row per chooser.index, alt_zone_id choices_df = pd.DataFrame( @@ -91,22 +93,23 @@ def make_sample_choices( } ) - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) del choices_array - chunk.log_df(trace_label, "choices_array", None) + chunk_sizer.log_df(trace_label, "choices_array", None) del rands - chunk.log_df(trace_label, "rands", None) + chunk_sizer.log_df(trace_label, "rands", None) del choice_probs_array - chunk.log_df(trace_label, "choice_probs_array", None) + chunk_sizer.log_df(trace_label, "choice_probs_array", None) # handing this off to caller - chunk.log_df(trace_label, "choices_df", None) + chunk_sizer.log_df(trace_label, "choices_df", None) return choices_df def _interaction_sample( + whale: workflow.Whale, choosers, alternatives, spec, @@ -118,6 +121,7 @@ def _interaction_sample( locals_d=None, trace_label=None, zone_layer=None, + chunk_sizer=None, ): """ Run a MNL simulation in the situation in which alternatives must @@ -174,7 +178,7 @@ def _interaction_sample( number of duplicate picks for chooser, alt """ - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) trace_ids = None trace_rows = None num_choosers = len(choosers.index) @@ -201,7 +205,7 @@ def _interaction_sample( chooser_index_id = interaction_simulate.ALT_CHOOSER_ID if log_alt_losers else None - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow # - cross join choosers and alternatives (cartesian product) # for every chooser, there will be a row for each alternative @@ -216,6 +220,7 @@ def _interaction_sample( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + whale, spec, choosers, locals_d, @@ -226,7 +231,7 @@ def _interaction_sample( extra_data=alternatives, zone_layer=zone_layer, ) - 
chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) if sharrow_enabled == "test" or True: interaction_utilities_sh, trace_eval_results_sh = ( interaction_utilities, @@ -240,7 +245,7 @@ def _interaction_sample( chooser_index_id=chooser_index_id, ) - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) assert alternative_count == len(interaction_df.index) / len(choosers.index) @@ -271,6 +276,7 @@ def _interaction_sample( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + whale, spec, interaction_df, locals_d, @@ -280,12 +286,12 @@ def _interaction_sample( log_alt_losers=log_alt_losers, zone_layer=zone_layer, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # ########### HWM - high water mark (point of max observed memory usage) del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if sharrow_enabled == "test": try: @@ -363,10 +369,10 @@ def _interaction_sample( interaction_utilities.values.reshape(len(choosers), alternative_count), index=choosers.index, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) del interaction_utilities - chunk.log_df(trace_label, "interaction_utilities", None) + chunk_sizer.log_df(trace_label, "interaction_utilities", None) if have_trace_targets: tracing.trace_df( @@ -385,10 +391,10 @@ def _interaction_sample( trace_label=trace_label, trace_choosers=choosers, ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: tracing.trace_df( @@ -420,6 +426,7 @@ def _interaction_sample( return choices_df else: choices_df = make_sample_choices( + whale, choosers, probs, alternatives, @@ -428,12 +435,13 @@ def _interaction_sample( alt_col_name, allow_zero_probs=allow_zero_probs, trace_label=trace_label, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) # pick_count and pick_dup # pick_count is number of duplicate picks @@ -450,7 +458,7 @@ def _interaction_sample( # drop the duplicates choices_df = choices_df[~choices_df["pick_dup"]] del choices_df["pick_dup"] - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) # set index after groupby so we can trace on it choices_df.set_index(choosers.index.name, inplace=True) @@ -467,7 +475,7 @@ def _interaction_sample( # don't need this after tracing del choices_df["rand"] - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) # - NARROW choices_df["prob"] = choices_df["prob"].astype(np.float32) @@ -478,6 +486,7 @@ def _interaction_sample( def interaction_sample( + whale, choosers, alternatives, spec, @@ -561,11 +570,17 @@ def interaction_sample( sample_size = min(sample_size, len(alternatives.index)) result_list = [] - for i, chooser_chunk, chunk_trace_label in 
chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label, chunk_tag + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + whale, choosers, chunk_size, trace_label, chunk_tag ): choices = _interaction_sample( + whale, chooser_chunk, alternatives, spec=spec, @@ -577,13 +592,14 @@ def interaction_sample( locals_d=locals_d, trace_label=chunk_trace_label, zone_layer=zone_layer, + chunk_sizer=chunk_sizer, ) if choices.shape[0] > 0: # might not be any if allow_zero_probs result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 13e65f384..53852bb8b 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -12,6 +12,7 @@ def _interaction_sample_simulate( + whale, choosers, alternatives, spec, @@ -96,7 +97,7 @@ def _interaction_sample_simulate( alternatives.index[last_repeat] ) - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) if have_trace_targets: tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) @@ -160,6 +161,7 @@ def _interaction_sample_simulate( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + whale, spec, interaction_df, locals_d, @@ -277,7 +279,7 @@ def _interaction_sample_simulate( # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + whale, probs, trace_label=trace_label, trace_choosers=choosers ) chunk.log_df(trace_label, "positions", positions) diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index 38075e5ad..dbbe7a981 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -19,6 +19,7 @@ def eval_interaction_utilities( + whale, spec, df, locals_d, @@ -72,7 +73,7 @@ def eval_interaction_utilities( trace_label = tracing.extend_trace_label(trace_label, "eval_interaction_utils") logger.info("Running eval_interaction_utilities on %s rows" % df.shape[0]) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow if locals_d is not None and locals_d.get("_sharrow_skip", False): sharrow_enabled = False @@ -84,7 +85,7 @@ def eval_interaction_utilities( trace_eval_results = None - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer: assert len(spec.columns) == 1 @@ -170,6 +171,7 @@ def replace_in_index_level(mi, level, *repls): timelogger.mark("sharrow preamble", True, logger, trace_label) sh_util, sh_flow = apply_flow( + whale, spec_sh, df, locals_d, @@ -178,12 +180,12 @@ def replace_in_index_level(mi, level, *repls): zone_layer=zone_layer, ) if sh_util is not None: - chunk.log_df(trace_label, "sh_util", sh_util) + chunk_sizer.log_df(trace_label, "sh_util", sh_util) utilities = pd.DataFrame( {"utility": sh_util.reshape(-1)}, index=df.index if extra_data is None else None, ) - chunk.log_df(trace_label, "sh_util", None) # hand off to 
caller + chunk_sizer.log_df(trace_label, "sh_util", None) # hand off to caller timelogger.mark("sharrow flow", True, logger, trace_label) else: @@ -218,7 +220,7 @@ def to_series(x): utilities = pd.DataFrame({"utility": 0.0}, index=df.index) - chunk.log_df(trace_label, "eval.utilities", utilities) + chunk_sizer.log_df(trace_label, "eval.utilities", utilities) no_variability = has_missing_vals = 0 @@ -262,7 +264,7 @@ def to_series(x): # update locals to allows us to ref previously assigned targets locals_d[target] = v - chunk.log_df( + chunk_sizer.log_df( trace_label, target, v ) # track temps stored in locals @@ -342,7 +344,7 @@ def to_series(x): trace_eval_results[k] = v[trace_rows] * coefficient del v - # chunk.log_df(trace_label, 'v', None) + # chunk_sizer.log_df(trace_label, 'v', None) except Exception as err: logger.exception( @@ -379,11 +381,15 @@ def to_series(x): trace_eval_results = pd.concat( [df[trace_rows], trace_eval_results], axis=1 ) - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) - chunk.log_df(trace_label, "v", None) - chunk.log_df(trace_label, "eval.utilities", None) # out of out hands... - chunk.log_df(trace_label, "eval.trace_eval_results", None) + chunk_sizer.log_df(trace_label, "v", None) + chunk_sizer.log_df( + trace_label, "eval.utilities", None + ) # out of out hands... + chunk_sizer.log_df(trace_label, "eval.trace_eval_results", None) timelogger.mark("regular interact flow", True, logger, trace_label) else: @@ -441,7 +447,9 @@ def to_series(x): trace_eval_results.index = df[trace_rows].index except ValueError: pass - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) else: # in test mode, trace from non-sharrow exists trace_eval_results = pd.concat( @@ -456,7 +464,9 @@ def to_series(x): axis=1, ) trace_eval_results.index = df[trace_rows].index - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) # sh_utility_fat1 = np.dot(sh_utility_fat, spec.values) # sh_utility_fat2 = sh_flow.dot( @@ -582,6 +592,7 @@ def _interaction_simulate( trace_choice_name=None, log_alt_losers=False, estimator=None, + chunk_sizer=None, ): """ Run a MNL simulation in the situation in which alternatives must @@ -632,7 +643,7 @@ def _interaction_simulate( """ trace_label = tracing.extend_trace_label(trace_label, "interaction_simulate") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) if have_trace_targets: tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) @@ -686,6 +697,7 @@ def _interaction_simulate( trace_rows = trace_ids = None interaction_utilities, trace_eval_results = eval_interaction_utilities( + whale, spec, choosers, locals_d, @@ -699,7 +711,7 @@ def _interaction_simulate( # set this index here as this is how later code extracts the chosen alt id's interaction_utilities.index = np.tile(alternatives.index, len(choosers)) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # mem.trace_memory_info(f"{trace_label}.init interaction_utilities sh", force_garbage_collect=True) if sharrow_enabled == "test" or True: interaction_utilities_sh, 
trace_eval_results_sh = ( @@ -725,7 +737,7 @@ def _interaction_simulate( alt_index_id=alt_index_id, chooser_index_id=chooser_index_id, ) - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) if skims is not None: simulate.set_skim_wrapper_targets(interaction_df, skims) @@ -750,6 +762,7 @@ def _interaction_simulate( trace_rows = trace_ids = None interaction_utilities, trace_eval_results = eval_interaction_utilities( + whale, spec, interaction_df, locals_d, @@ -758,14 +771,14 @@ def _interaction_simulate( estimator=estimator, log_alt_losers=log_alt_losers, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # mem.trace_memory_info(f"{trace_label}.init interaction_utilities", force_garbage_collect=True) # print(f"interaction_df {interaction_df.shape}") # print(f"interaction_utilities {interaction_utilities.shape}") del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if have_trace_targets: tracing.trace_interaction_eval_results( @@ -787,7 +800,7 @@ def _interaction_simulate( interaction_utilities.values.reshape(len(choosers), sample_size), index=choosers.index, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: tracing.trace_df( @@ -803,10 +816,10 @@ def _interaction_simulate( probs = logit.utils_to_probs( utilities, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: tracing.trace_df( @@ -819,10 +832,10 @@ def _interaction_simulate( # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + whale, probs, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "positions", positions) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "positions", positions) + chunk_sizer.log_df(trace_label, "rands", rands) # need to get from an integer offset into the alternative sample to the alternative index # that is, we want the index value of the row that is offset by rows into the @@ -834,7 +847,7 @@ def _interaction_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) if have_trace_targets: tracing.trace_df( @@ -852,6 +865,7 @@ def _interaction_simulate( def interaction_simulate( + whale, choosers, alternatives, spec, @@ -919,9 +933,12 @@ def interaction_simulate( assert len(choosers) > 0 result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label - ): + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label): choices = _interaction_simulate( chooser_chunk, @@ -934,11 +951,12 @@ def interaction_simulate( 
trace_choice_name=trace_choice_name, log_alt_losers=log_alt_losers, estimator=estimator, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index ff3128450..de32fd593 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -6,8 +6,8 @@ import numpy as np import pandas as pd -from . import config, pipeline, tracing -from .choosing import choice_maker +from activitysim.core import config, tracing, workflow +from activitysim.core.choosing import choice_maker logger = logging.getLogger(__name__) @@ -214,7 +214,13 @@ def utils_to_probs( return probs -def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=False): +def make_choices( + whale: workflow.Whale, + probs, + trace_label=None, + trace_choosers=None, + allow_bad_probs=False, +): """ Make choices for each chooser from among a set of alternatives. @@ -259,7 +265,7 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F trace_choosers=trace_choosers, ) - rands = pipeline.get_rn_generator().random_for_df(probs) + rands = whale.get_rn_generator().random_for_df(probs) choices = pd.Series(choice_maker(probs.values, rands), index=probs.index) diff --git a/activitysim/core/los.py b/activitysim/core/los.py index 5f7268767..cd6a5345d 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -7,13 +7,11 @@ import numpy as np import pandas as pd -from . import skim_dataset # noqa: F401 -from . import config, inject, pathbuilder, skim_dictionary, tracing, util -from .cleaning import recode_based_on_table -from .exceptions import SettingsFileNotFoundError -from .pipeline import Whale -from .skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory -from .skim_dictionary import NOT_IN_SKIM_ZONE_ID +from activitysim.core import config, pathbuilder, skim_dictionary, tracing, util +from activitysim.core.cleaning import recode_based_on_table +from activitysim.core.configuration.network import NetworkSettings, TAZ_Settings +from activitysim.core.skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory +from activitysim.core.skim_dictionary import NOT_IN_SKIM_ZONE_ID skim_factories = { "NumpyArraySkimFactory": NumpyArraySkimFactory, @@ -72,7 +70,7 @@ class Network_LOS(object): """ def __init__(self, whale, los_settings_file_name=LOS_SETTINGS_FILE_NAME): - + self.whale = whale # Note: we require all skims to be of same dtype so they can share buffer - is that ok? # fixme is it ok to require skims be all the same type? if so, is this the right choice? 
self.skim_dtype_name = "float32" @@ -93,7 +91,8 @@ def __init__(self, whale, los_settings_file_name=LOS_SETTINGS_FILE_NAME): self.tvpb = None self.los_settings_file_name = los_settings_file_name - self.load_settings(whale) + self.load_settings() + self.sharrow_enabled = whale.settings.sharrow # dependency injection of skim factory (of type specified in skim_dict_factory setting) skim_dict_factory_name = self.setting("skim_dict_factory") @@ -119,7 +118,6 @@ def rebuild_tvpb_cache(self): return self.setting("rebuild_tvpb_cache") def setting(self, keys, default=""): - # if they dont specify a default, check the default defaults default = ( DEFAULT_SETTINGS.get(keys, "") @@ -131,32 +129,44 @@ def setting(self, keys, default=""): key_list = keys.split(".") s = self.los_settings for key in key_list[:-1]: - s = s.get(key) - if default == "": - assert isinstance( - s, dict - ), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" + if isinstance(s, dict): + s = s.get(key, None) + else: + s = getattr(s, key, None) + if default == "" and s is None: + raise ValueError( + f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" + ) + # assert isinstance( + # s, dict + # ), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" key = key_list[-1] # last key if default == "": - assert ( - key in s - ), f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" + if isinstance(s, dict): + assert ( + key in s + ), f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" + else: + assert hasattr(s, key) if isinstance(s, dict): return s.get(key, default) else: - return default + return getattr(s, key, default) - def load_settings(self, whale: Whale): + def load_settings(self): """ Read setting file and initialize object variables (see class docstring for list of object variables) """ - self.los_settings = whale.filesystem.read_settings_file( - self.los_settings_file_name, mandatory=True + self.los_settings = self.whale.filesystem.read_settings_file( + self.los_settings_file_name, + mandatory=True, + validator_class=NetworkSettings, ) + self.whale.network_settings = self.los_settings # validate skim_time_periods - self.skim_time_periods = whale.network_settings.skim_time_periods + self.skim_time_periods = self.whale.network_settings.skim_time_periods if "hours" in self.skim_time_periods: self.skim_time_periods["periods"] = self.skim_time_periods.pop("hours") warnings.warn( @@ -202,7 +212,9 @@ def load_skim_info(self): """ assert self.skim_dict_factory is not None # load taz skim_info - self.skims_info["taz"] = self.skim_dict_factory.load_skim_info("taz") + self.skims_info["taz"] = self.skim_dict_factory.load_skim_info( + self.whale, "taz" + ) if self.zone_system == THREE_ZONE: # load tap skim_info @@ -224,7 +236,6 @@ def load_data(self): # load maz tables if self.zone_system in [TWO_ZONE, THREE_ZONE]: - # maz file_name = self.setting("maz") self.maz_taz_df = pd.read_csv( @@ -252,7 +263,6 @@ def load_data(self): else maz_to_maz_tables ) for file_name in maz_to_maz_tables: - df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) # recode MAZs if needed @@ -281,7 +291,6 @@ def load_data(self): # load tap tables if self.zone_system == THREE_ZONE: - # tap_df should already have been loaded by load_skim_info because, # during multiprocessing, it is required by TapTapUidCalculator to size TVPBCache # self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True)) @@ -289,7 +298,6 @@ def 
load_data(self): # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them seperate for mode, maz_to_tap_settings in self.setting("maz_to_tap").items(): - assert ( "table" in maz_to_tap_settings ), f"Expected setting maz_to_tap.{mode}.table not found in in {LOS_SETTINGS_FILE_NAME}" @@ -305,7 +313,6 @@ def load_data(self): # to only include the nearest tap to origin when more than one tap serves the same line distance_col = maz_to_tap_settings.get("tap_line_distance_col") if distance_col: - if self.tap_lines_df is None: # load tap_lines on demand (required if they specify tap_line_distance_col) tap_lines_file_name = self.setting( @@ -389,11 +396,11 @@ def load_data(self): self.maz_to_tap_dfs[mode] = df # create taz skim dict - if not config.setting("sharrow", False): + if not self.sharrow_enabled: assert "taz" not in self.skim_dicts # If offset_preprocessing was completed, then TAZ values # will be pre-offset and there's no need to re-offset them. - if config.setting("offset_preprocessing", False): + if self.whale.settings.offset_preprocessing: _override_offset_int = 0 else: _override_offset_int = None @@ -407,7 +414,7 @@ def load_data(self): # create MazSkimDict facade if self.zone_system in [TWO_ZONE, THREE_ZONE]: - if not config.setting("sharrow", False): + if not self.sharrow_enabled: # create MazSkimDict facade skim_dict # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df) assert "maz" not in self.skim_dicts @@ -427,7 +434,7 @@ def load_data(self): # create tap skim dict if self.zone_system == THREE_ZONE: - if not config.setting("sharrow", False): + if not self.sharrow_enabled: assert "tap" not in self.skim_dicts tap_skim_dict = self.create_skim_dict("tap") self.skim_dicts["tap"] = tap_skim_dict @@ -439,7 +446,7 @@ def load_data(self): else: self.skim_dicts["tap"] = self.get_skim_dict("tap") - def create_skim_dict(self, skim_tag, _override_offset_int=None): + def create_skim_dict(self, whale, skim_tag, _override_offset_int=None): """ Create a new SkimDict of type specified by skim_tag (e.g. 'taz', 'maz' or 'tap') @@ -471,7 +478,7 @@ def create_skim_dict(self, skim_tag, _override_offset_int=None): else: skim_info = self.skims_info[skim_tag] skim_data = self.skim_dict_factory.get_skim_data(skim_tag, skim_info) - skim_dict = skim_dictionary.SkimDict(skim_tag, skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict(whale, skim_tag, skim_info, skim_data) logger.debug(f"create_skim_dict {skim_tag} omx_shape {skim_dict.omx_shape}") @@ -495,6 +502,8 @@ def omx_file_names(self, skim_tag): list of str """ file_names = self.setting(f"{skim_tag}_skims") + if isinstance(file_names, TAZ_Settings): + file_names = file_names.omx if isinstance(file_names, dict): for i in ("file", "files", "omx"): if i in file_names: @@ -644,12 +653,12 @@ def get_skim_dict(self, skim_tag): ------- SkimDict or subclass (e.g. 
MazSkimDict) """ - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = self.sharrow_enabled if sharrow_enabled and skim_tag in ("taz", "maz"): - skim_dataset = inject.get_injectable("skim_dataset") # non-global import avoids circular references from .skim_dataset import SkimDataset + skim_dataset = self.whale.get_injectable("skim_dataset") if skim_tag == "maz": return SkimDataset(skim_dataset) else: @@ -660,7 +669,7 @@ def get_skim_dict(self, skim_tag): del skim_dataset.attrs[f"dim_redirection_{dd}"] return SkimDataset(skim_dataset) elif sharrow_enabled and skim_tag in ("tap"): - tap_dataset = inject.get_injectable("tap_dataset") + tap_dataset = self.whale.get_injectable("tap_dataset") from .skim_dataset import SkimDataset return SkimDataset(tap_dataset) @@ -812,13 +821,13 @@ def skim_time_period_label(self, time_period): return result - def get_tazs(self): + def get_tazs(self, whale): # FIXME - should compute on init? if self.zone_system == ONE_ZONE: - tazs = inject.get_table("land_use").index.values + tazs = whale.get_dataframe("land_use").index.values else: try: - land_use_taz = inject.get_table("land_use_taz").to_frame() + land_use_taz = whale.get_dataframe("land_use_taz").to_frame() except (RuntimeError, KeyError): # land_use_taz is missing, use fallback tazs = self.maz_taz_df.TAZ.unique() @@ -844,17 +853,15 @@ def get_taps(self): assert isinstance(taps, np.ndarray) return taps - @property - def get_maz_to_taz_series(self): + def get_maz_to_taz_series(self, whale): """ pd.Series: Index is the MAZ, value is the corresponding TAZ """ - sharrow_enabled = config.setting("sharrow", False) - if sharrow_enabled: + if self.sharrow_enabled: # FIXME:SHARROW - this assumes that both MAZ and TAZ have been recoded to # zero-based indexes, but what if that was not done? # Should we check it and error out here or bravely march forward? 
- skim_dataset = inject.get_injectable("skim_dataset") + skim_dataset = whale.get_injectable("skim_dataset") maz_to_taz = skim_dataset["_digitized_otaz_of_omaz"].to_series() else: maz_to_taz = self.maz_taz_df[["MAZ", "TAZ"]].set_index("MAZ").TAZ @@ -879,7 +886,7 @@ def map_maz_to_taz(self, s): input_was_series = False else: input_was_series = True - out = s.map(self.get_maz_to_taz_series) + out = s.map(self.get_maz_to_taz_series(self.whale)) if np.issubdtype(out, np.floating): if out.isna().any(): raise KeyError("failed in mapping MAZ to TAZ") diff --git a/activitysim/core/mp_tasks.py b/activitysim/core/mp_tasks.py index ebbf1cb37..1ee1485c4 100644 --- a/activitysim/core/mp_tasks.py +++ b/activitysim/core/mp_tasks.py @@ -697,13 +697,13 @@ def coalesce_pipelines(sub_proc_names, slice_info): for table_name in mirrored_tables: df = mirrored_tables[table_name] info(f"adding mirrored table {table_name} {df.shape}") - pipeline.replace_table(table_name, df) + whale.add_table(table_name, df) # - concatenate omnibus tables and add them to pipeline for table_name in omnibus_tables: df = pd.concat(omnibus_tables[table_name], sort=False) info(f"adding omnibus table {table_name} {df.shape}") - pipeline.replace_table(table_name, df) + whale.add_table(table_name, df) pipeline.add_checkpoint(checkpoint_name) @@ -1071,7 +1071,7 @@ def allocate_shared_shadow_pricing_buffers(): return shadow_pricing_buffers -def allocate_shared_shadow_pricing_buffers_choice(): +def allocate_shared_shadow_pricing_buffers_choice(whale): """ This is called by the main process to allocate memory buffer to share with subprocs @@ -1090,7 +1090,9 @@ def allocate_shared_shadow_pricing_buffers_choice(): from activitysim.abm.tables import shadow_pricing shadow_pricing_buffers_choice = ( - shadow_pricing.buffers_for_shadow_pricing_choice(shadow_pricing_choice_info) + shadow_pricing.buffers_for_shadow_pricing_choice( + whale, shadow_pricing_choice_info + ) ) else: shadow_pricing_buffers_choice = {} @@ -1372,7 +1374,7 @@ def drop_breadcrumb(step_name, crumb, value=True): write_breadcrumbs(breadcrumbs) -def run_multiprocess(injectables): +def run_multiprocess(whale, injectables): """ run the steps in run_list, possibly resuming after checkpoint specified by resume_after @@ -1448,7 +1450,7 @@ def find_breadcrumb(crumb, default=None): # combine shared_shadow_pricing_buffers to pool choices across all processes t0 = tracing.print_elapsed_time() - shared_data_buffers.update(allocate_shared_shadow_pricing_buffers_choice()) + shared_data_buffers.update(allocate_shared_shadow_pricing_buffers_choice(whale)) t0 = tracing.print_elapsed_time("allocate shared shadow_pricing choice buffer", t0) mem.trace_memory_info("allocate_shared_shadow_pricing_buffers_choice.completed") diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index 1199f19c5..3ca78cc18 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -18,6 +18,7 @@ pathbuilder_cache, simulate, tracing, + workflow, ) from activitysim.core.pathbuilder_cache import memo from activitysim.core.util import reindex @@ -36,6 +37,7 @@ def compute_utilities( + whale: workflow.Whale, network_los, model_settings, choosers, @@ -49,8 +51,7 @@ def compute_utilities( """ trace_label = tracing.extend_trace_label(trace_label, "compute_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): logger.debug( f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers" ) @@ -67,11 +68,11 @@ def 
compute_utilities( # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("PREPROCESSOR") if preprocessor_settings: - # don't want to alter caller's dataframe choosers = choosers.copy() expressions.assign_columns( + whale, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -79,6 +80,7 @@ def compute_utilities( ) utilities = simulate.eval_utilities( + whale, spec, choosers, locals_d=locals_dict, @@ -96,7 +98,6 @@ class TransitVirtualPathBuilder(object): """ def __init__(self, network_los): - self.network_los = network_los self.uid_calculator = pathbuilder_cache.TapTapUidCalculator(network_los) @@ -146,10 +147,9 @@ def units_for_recipe(self, recipe): def compute_maz_tap_utilities( self, recipe, maz_od_df, chooser_attributes, leg, mode, trace_label, trace ): - trace_label = tracing.extend_trace_label(trace_label, f"maz_tap_utils.{leg}") - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): maz_tap_settings = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}" ) @@ -195,7 +195,6 @@ def compute_maz_tap_utilities( chunk.log_df(trace_label, "utilities_df", utilities_df) if self.units_for_recipe(recipe) == "utility": - utilities_df[leg] = compute_utilities( self.network_los, maz_tap_settings, @@ -209,9 +208,10 @@ def compute_maz_tap_utilities( chunk.log_df(trace_label, "utilities_df", utilities_df) # annotated else: - assignment_spec = assign.read_assignment_spec( - file_name=config.config_file_path(maz_tap_settings["SPEC"]) + file_name=whale.filesystem.get_config_file_path( + maz_tap_settings["SPEC"] + ) ) results, _, _ = assign.assign_variables( @@ -233,7 +233,6 @@ def compute_maz_tap_utilities( def all_transit_paths( self, access_df, egress_df, chooser_attributes, trace_label, trace ): - trace_label = tracing.extend_trace_label(trace_label, "all_transit_paths") # deduped transit_df has one row per chooser for each boarding (btap) and alighting (atap) pair @@ -295,8 +294,7 @@ def compute_tap_tap_utilities( trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): model_constants = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.CONSTANTS" ) @@ -320,7 +318,6 @@ def compute_tap_tap_utilities( # deduplicate transit_df to unique_transit_df with memo("#TVPB compute_tap_tap_utilities deduplicate transit_df"): - attribute_segments = self.network_los.setting( "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments" ) @@ -392,7 +389,6 @@ def compute_tap_tap_utilities( # redupe unique_transit_df back into transit_df with memo("#TVPB compute_tap_tap_utilities redupe transit_df"): - # idx = transit_df.index transit_df = pd.merge( transit_df, unique_utilities_df, left_on="uid", right_index=True @@ -453,8 +449,7 @@ def lookup_tap_tap_utilities( trace_label = tracing.extend_trace_label(trace_label, "lookup_tap_tap_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): with memo("#TVPB CACHE lookup_tap_tap_utilities all_transit_paths"): transit_df = self.all_transit_paths( access_df, egress_df, chooser_attributes, trace_label, trace=False @@ -522,11 +517,9 @@ def compute_tap_tap_time( trace_label, trace, ): - trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_time") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): 
model_constants = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.CONSTANTS" ) @@ -547,12 +540,13 @@ def compute_tap_tap_time( locals_dict.update(model_constants) assignment_spec = assign.read_assignment_spec( - file_name=config.config_file_path(tap_tap_settings["SPEC"]) + file_name=whale.filesystem.get_config_file_path( + tap_tap_settings["SPEC"] + ) ) DEDUPE = True if DEDUPE: - # assign uid for reduping max_atap = transit_df.atap.max() + 1 transit_df["uid"] = transit_df.btap * max_atap + transit_df.atap @@ -621,9 +615,7 @@ def compute_tap_tap( trace_label, trace, ): - if self.units_for_recipe(recipe) == "utility": - if not self.tap_cache.is_open: with memo("#TVPB compute_tap_tap tap_cache.open"): self.tap_cache.open() @@ -675,11 +667,9 @@ def best_paths( trace_label, trace=False, ): - trace_label = tracing.extend_trace_label(trace_label, "best_paths") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): path_settings = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.path_types.{path_type}" ) @@ -771,7 +761,6 @@ def build_virtual_path( trace=False, override_choices=None, ): - trace_label = tracing.extend_trace_label(trace_label, "build_virtual_path") # Tracing is implemented as a seperate, second call that operates ONLY on filter_targets @@ -921,7 +910,6 @@ def build_virtual_path( chunk.log_df(trace_label, "transit_df", None) if units == "utility": - # logsums with memo("#TVPB build_virtual_path logsums"): # one row per seq with utilities in columns @@ -980,12 +968,10 @@ def build_virtual_path( ) if want_choices: - # orig index to identify appropriate random number channel to use making choices utilities_df.index = orig.index with memo("#TVPB build_virtual_path make_choices"): - probs = logit.utils_to_probs( utilities_df, allow_zero_probs=True, trace_label=trace_label ) @@ -1000,9 +986,8 @@ def build_virtual_path( probs["choices"] = choices self.trace_df(probs, trace_label, "probs") else: - choices, rands = logit.make_choices( - probs, allow_bad_probs=True, trace_label=trace_label + whale, probs, allow_bad_probs=True, trace_label=trace_label ) chunk.log_df(trace_label, "rands", rands) @@ -1031,7 +1016,6 @@ def build_virtual_path( logsum_df["logsum"] = logsums else: - assert len(logsums) == len(orig) logsum_df = pd.DataFrame({"logsum": logsums}, index=orig.index) @@ -1080,13 +1064,11 @@ def get_tvpb_logsum( recipe="tour_mode_choice", trace_label=None, ): - # assume they have given us a more specific name (since there may be more than one active wrapper) trace_label = trace_label or "get_tvpb_logsum" trace_label = tracing.extend_trace_label(trace_label, path_type) - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(trace_label, settings=whale.settings): logsum_df = self.build_virtual_path( recipe, path_type, @@ -1124,14 +1106,13 @@ def get_tvpb_logsum( return logsum_df def get_tvpb_best_transit_time(self, orig, dest, tod): - # FIXME lots of pathological knowledge here as we are only called by accessibility directly from expressions trace_label = tracing.extend_trace_label("accessibility.tvpb_best_time", tod) recipe = "accessibility" path_type = "WTW" - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): result = self.build_virtual_path( recipe, path_type, @@ -1173,7 +1154,6 @@ def wrap_logsum( trace_label=None, tag=None, ): - return TransitVirtualPathLogsumWrapper( self, orig_key, @@ -1204,7 +1184,6 @@ def __init__( trace_label, tag, ): - self.tvpb = pathbuilder assert 
hasattr(pathbuilder, "get_tvpb_logsum") @@ -1309,7 +1288,6 @@ def __getitem__(self, path_type): ) if (self.cache_choices) and (not all(logsum_df["logsum"] == UNAVAILABLE)): - # not tested on duplicate index because not currently needed # caching strategy does not require unique indexes but care would need to be taken to maintain alignment assert not orig.index.duplicated().any() diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 6c0fb902d..bc2a6da55 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -11,8 +11,17 @@ import numpy as np import pandas as pd -from . import assign, chunk, config, logit, pathbuilder, pipeline, tracing, util -from .simulate_consts import ( +from activitysim.core import ( + assign, + chunk, + config, + logit, + pathbuilder, + tracing, + util, + workflow, +) +from activitysim.core.simulate_consts import ( ALT_LOSER_UTIL, SPEC_DESCRIPTION_NAME, SPEC_EXPRESSION_NAME, @@ -22,11 +31,11 @@ logger = logging.getLogger(__name__) -def random_rows(df, n): +def random_rows(whale: workflow.Whale, df, n): # only sample if df has more than n rows if len(df.index) > n: - prng = pipeline.get_rn_generator().get_global_rng() + prng = whale.get_rn_generator().get_global_rng() return df.take(prng.choice(len(df), size=n, replace=False)) else: @@ -49,15 +58,15 @@ def uniquify_spec_index(spec): assert spec.index.is_unique -def read_model_alts(file_name, set_index=None): - file_path = config.config_file_path(file_name) +def read_model_alts(whale: workflow.Whale, file_name, set_index=None): + file_path = whale.filesystem.get_config_file_path(file_name) df = pd.read_csv(file_path, comment="#") if set_index: df.set_index(set_index, inplace=True) return df -def read_model_spec(file_name): +def read_model_spec(whale: workflow.Whale, file_name: str): """ Read a CSV model specification into a Pandas DataFrame or Series. 
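As the simulate.py hunks above show, the spec-reading helpers now take the workflow state as their first argument and resolve files through whale.filesystem.get_config_file_path rather than config.config_file_path. A hedged usage sketch of the new calling convention, assuming a configured whale object is already in scope and that the named CSV files (hypothetical here) exist in its configs directories:

from activitysim.core import simulate

# spec and coefficient files are resolved via whale.filesystem, not the config module
spec = simulate.read_model_spec(whale, file_name="example_model.csv")
coefficients = simulate.read_model_coefficients(
    whale, file_name="example_model_coefficients.csv"
)

# coefficient names in the spec are evaluated against the coefficients table;
# with whale.settings.sharrow enabled, zero-valued rows are retained so fewer
# distinct flows need to be compiled
spec = simulate.eval_coefficients(whale, spec, coefficients, estimator=None)
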
@@ -93,7 +102,7 @@ def read_model_spec(file_name): if not file_name.lower().endswith(".csv"): file_name = "%s.csv" % (file_name,) - file_path = config.config_file_path(file_name) + file_path = whale.filesystem.get_config_file_path(file_name) try: spec = pd.read_csv(file_path, comment="#") @@ -121,7 +130,7 @@ def read_model_spec(file_name): return spec -def read_model_coefficients(model_settings=None, file_name=None): +def read_model_coefficients(whale, model_settings=None, file_name=None): """ Read the coefficient file specified by COEFFICIENTS model setting """ @@ -138,7 +147,7 @@ def read_model_coefficients(model_settings=None, file_name=None): file_name = model_settings["COEFFICIENTS"] logger.debug(f"read_model_coefficients file_name {file_name}") - file_path = config.config_file_path(file_name) + file_path = whale.filesystem.get_config_file_path(file_name) try: coefficients = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -161,7 +170,10 @@ def read_model_coefficients(model_settings=None, file_name=None): return coefficients -def spec_for_segment(model_settings, spec_id, segment_name, estimator): +@workflow.func +def spec_for_segment( + whale: workflow.Whale, model_settings, spec_id, segment_name, estimator +): """ Select spec for specified segment from omnibus spec containing columns for each segment @@ -179,7 +191,7 @@ def spec_for_segment(model_settings, spec_id, segment_name, estimator): """ spec_file_name = model_settings[spec_id] - spec = read_model_spec(file_name=spec_file_name) + spec = read_model_spec(whale, file_name=spec_file_name) if len(spec.columns) > 1: # if spec is segmented @@ -203,14 +215,14 @@ def spec_for_segment(model_settings, spec_id, segment_name, estimator): return spec - coefficients = read_model_coefficients(model_settings) + coefficients = read_model_coefficients(whale, model_settings) - spec = eval_coefficients(spec, coefficients, estimator) + spec = eval_coefficients(whale, spec, coefficients, estimator) return spec -def read_model_coefficient_template(model_settings): +def read_model_coefficient_template(whale: workflow.Whale, model_settings): """ Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting """ @@ -223,7 +235,7 @@ def read_model_coefficient_template(model_settings): coefficients_file_name = model_settings["COEFFICIENT_TEMPLATE"] - file_path = config.config_file_path(coefficients_file_name) + file_path = whale.filesystem.get_config_file_path(coefficients_file_name) try: template = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -250,13 +262,13 @@ def read_model_coefficient_template(model_settings): return template -def dump_mapped_coefficients(model_settings): +def dump_mapped_coefficients(whale: workflow.Whale, model_settings): """ dump template_df with coefficient values """ - coefficients_df = read_model_coefficients(model_settings) - template_df = read_model_coefficient_template(model_settings) + coefficients_df = read_model_coefficients(whale, model_settings) + template_df = read_model_coefficient_template(whale, model_settings) for c in template_df.columns: template_df[c] = template_df[c].map(coefficients_df.value) @@ -272,7 +284,8 @@ def dump_mapped_coefficients(model_settings): logger.info(f"wrote raw coefficients to {file_path}") -def get_segment_coefficients(model_settings, segment_name): +@workflow.func +def get_segment_coefficients(whale: workflow.Whale, model_settings, segment_name): """ Return a dict mapping generic coefficient 
names to segment-specific coefficient values @@ -325,7 +338,9 @@ def get_segment_coefficients(model_settings, segment_name): if legacy: constants = config.get_model_constants(model_settings) - legacy_coeffs_file_path = config.config_file_path(model_settings[legacy]) + legacy_coeffs_file_path = whale.filesystem.get_config_file_path( + model_settings[legacy] + ) omnibus_coefficients = pd.read_csv( legacy_coeffs_file_path, comment="#", index_col="coefficient_name" ) @@ -333,8 +348,8 @@ def get_segment_coefficients(model_settings, segment_name): omnibus_coefficients[segment_name], constants=constants ) else: - coefficients_df = read_model_coefficients(model_settings) - template_df = read_model_coefficient_template(model_settings) + coefficients_df = read_model_coefficients(whale, model_settings) + template_df = read_model_coefficient_template(whale, model_settings) coefficients_col = ( template_df[segment_name].map(coefficients_df.value).astype(float) ) @@ -380,7 +395,12 @@ def replace_coefficients(nest): return nest_spec -def eval_coefficients(spec, coefficients, estimator): +def eval_coefficients( + whale: workflow.Whale, + spec: pd.DataFrame, + coefficients: dict | pd.DataFrame, + estimator, +): spec = spec.copy() # don't clobber input spec @@ -399,7 +419,7 @@ def eval_coefficients(spec, coefficients, estimator): spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32) ) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow if sharrow_enabled: # keep all zero rows, reduces the number of unique flows to compile and store. return spec @@ -418,6 +438,7 @@ def eval_coefficients(spec, coefficients, estimator): def eval_utilities( + whale, spec, choosers, locals_d=None, @@ -475,7 +496,7 @@ def eval_utilities( """ start_time = time.time() - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow expression_values = None @@ -500,6 +521,7 @@ def eval_utilities( if locals_d is not None: locals_dict.update(locals_d) sh_util, sh_flow = apply_flow( + whale, spec_sh, choosers, locals_dict, @@ -875,46 +897,47 @@ def set_skim_wrapper_targets(df, skims): pass -def _check_for_variability(expression_values, trace_label): - """ - This is an internal method which checks for variability in each - expression - under the assumption that you probably wouldn't be using a - variable (in live simulations) if it had no variability. This is a - warning to the user that they might have constructed the variable - incorrectly. It samples 1000 rows in order to not hurt performance - - it's likely that if 1000 rows have no variability, the whole dataframe - will have no variability. - """ - - if trace_label is None: - trace_label = "_check_for_variability" - - sample = random_rows(expression_values, min(1000, len(expression_values))) - - no_variability = has_missing_vals = 0 - for i in range(len(sample.columns)): - v = sample.iloc[:, i] - if v.min() == v.max(): - col_name = sample.columns[i] - logger.info( - "%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name) - ) - no_variability += 1 - # FIXME - how could this happen? Not sure it is really a problem? 
- if np.count_nonzero(v.isnull().values) > 0: - col_name = sample.columns[i] - logger.info("%s: missing values in: %s" % (trace_label, col_name)) - has_missing_vals += 1 - - if no_variability > 0: - logger.warning( - "%s: %s columns have no variability" % (trace_label, no_variability) - ) - - if has_missing_vals > 0: - logger.warning( - "%s: %s columns have missing values" % (trace_label, has_missing_vals) - ) +# +# def _check_for_variability(expression_values, trace_label): +# """ +# This is an internal method which checks for variability in each +# expression - under the assumption that you probably wouldn't be using a +# variable (in live simulations) if it had no variability. This is a +# warning to the user that they might have constructed the variable +# incorrectly. It samples 1000 rows in order to not hurt performance - +# it's likely that if 1000 rows have no variability, the whole dataframe +# will have no variability. +# """ +# +# if trace_label is None: +# trace_label = "_check_for_variability" +# +# sample = random_rows(expression_values, min(1000, len(expression_values))) +# +# no_variability = has_missing_vals = 0 +# for i in range(len(sample.columns)): +# v = sample.iloc[:, i] +# if v.min() == v.max(): +# col_name = sample.columns[i] +# logger.info( +# "%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name) +# ) +# no_variability += 1 +# # FIXME - how could this happen? Not sure it is really a problem? +# if np.count_nonzero(v.isnull().values) > 0: +# col_name = sample.columns[i] +# logger.info("%s: missing values in: %s" % (trace_label, col_name)) +# has_missing_vals += 1 +# +# if no_variability > 0: +# logger.warning( +# "%s: %s columns have no variability" % (trace_label, no_variability) +# ) +# +# if has_missing_vals > 0: +# logger.warning( +# "%s: %s columns have missing values" % (trace_label, has_missing_vals) +# ) def compute_nested_exp_utilities(raw_utilities, nest_spec): @@ -1100,12 +1123,13 @@ def eval_mnl( assert not want_logsums trace_label = tracing.extend_trace_label(trace_label, "eval_mnl") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) if have_trace_targets: tracing.trace_df(choosers, "%s.choosers" % trace_label) utilities = eval_utilities( + whale, spec, choosers, locals_d, @@ -1145,7 +1169,7 @@ def eval_mnl( probs=probs, choosers=choosers, spec=spec, trace_label=trace_label ) else: - choices, rands = logit.make_choices(probs, trace_label=trace_label) + choices, rands = logit.make_choices(whale, probs, trace_label=trace_label) del probs chunk.log_df(trace_label, "probs", None) @@ -1211,7 +1235,7 @@ def eval_nl( trace_label = tracing.extend_trace_label(trace_label, "eval_nl") assert trace_label - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) logit.validate_nest_spec(nest_spec, trace_label) @@ -1221,6 +1245,7 @@ def eval_nl( choosers, spec_sh = _preprocess_tvpb_logsums_on_choosers(choosers, spec, locals_d) raw_utilities = eval_utilities( + whale, spec_sh, choosers, locals_d, @@ -1314,7 +1339,9 @@ def eval_nl( trace_label=trace_label, ) else: - choices, rands = logit.make_choices(base_probabilities, trace_label=trace_label) + choices, rands = logit.make_choices( + whale, base_probabilities, trace_label=trace_label + ) del base_probabilities chunk.log_df(trace_label, "base_probabilities", None) @@ -1476,7 +1503,7 @@ def simple_simulate( result_list = [] # segment by person type and pick the 
right spec for each person type for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label + whale, choosers, chunk_size, trace_label ): choices = _simple_simulate( @@ -1568,7 +1595,7 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # FIXME - untested and not currently used by any models... trace_label = tracing.extend_trace_label(trace_label, "eval_mnl_logsums") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) logger.debug("running eval_mnl_logsums") @@ -1577,7 +1604,7 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): tracing.trace_df(choosers, "%s.choosers" % trace_label) utilities = eval_utilities( - spec, choosers, locals_d, trace_label, have_trace_targets + whale, spec, choosers, locals_d, trace_label, have_trace_targets ) chunk.log_df(trace_label, "utilities", utilities) @@ -1682,7 +1709,9 @@ def _replace_in_level(multiindex, level_name, *args, **kwargs): return choosers, spec_sh -def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): +def eval_nl_logsums( + whale: workflow.Whale, choosers, spec, nest_spec, locals_d, trace_label=None +): """ like eval_nl except return logsums instead of making choices @@ -1693,7 +1722,7 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): """ trace_label = tracing.extend_trace_label(trace_label, "eval_nl_logsums") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = tracing.has_trace_targets(whale, choosers) logit.validate_nest_spec(nest_spec, trace_label) @@ -1704,6 +1733,7 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): tracing.trace_df(choosers, "%s.choosers" % trace_label) raw_utilities = eval_utilities( + whale, spec_sh, choosers, locals_d, @@ -1751,7 +1781,13 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): def _simple_simulate_logsums( - choosers, spec, nest_spec, skims=None, locals_d=None, trace_label=None + whale: workflow.Whale, + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + trace_label=None, ): """ like simple_simulate except return logsums instead of making choices @@ -1769,13 +1805,15 @@ def _simple_simulate_logsums( logsums = eval_mnl_logsums(choosers, spec, locals_d, trace_label=trace_label) else: logsums = eval_nl_logsums( - choosers, spec, nest_spec, locals_d, trace_label=trace_label + whale, choosers, spec, nest_spec, locals_d, trace_label=trace_label ) return logsums +@workflow.func def simple_simulate_logsums( + whale: workflow.Whale, choosers, spec, nest_spec, @@ -1799,17 +1837,22 @@ def simple_simulate_logsums( result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label, chunk_tag + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + whale, choosers, chunk_size, trace_label, chunk_tag ): logsums = _simple_simulate_logsums( - chooser_chunk, spec, nest_spec, skims, locals_d, chunk_trace_label + whale, chooser_chunk, spec, nest_spec, skims, locals_d, chunk_trace_label ) result_list.append(logsums) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: logsums = pd.concat(result_list) diff --git a/activitysim/core/skim_dataset.py 
b/activitysim/core/skim_dataset.py index 9c28082fc..0cae4062b 100644 --- a/activitysim/core/skim_dataset.py +++ b/activitysim/core/skim_dataset.py @@ -1,15 +1,17 @@ import glob import logging import os +from pathlib import Path import numpy as np import openmatrix import pandas as pd import sharrow as sh +import xarray as xr -from . import config -from . import flow as __flow # noqa, keep this here for side effects? -from . import inject +from activitysim.core import config +from activitysim.core import flow as __flow +from activitysim.core import workflow logger = logging.getLogger(__name__) @@ -432,7 +434,7 @@ def _use_existing_backing_if_valid(backing, omx_file_paths, skim_tag): def _dedupe_time_periods(network_los_preload): - raw_time_periods = network_los_preload.los_settings["skim_time_periods"]["labels"] + raw_time_periods = network_los_preload.los_settings.skim_time_periods["labels"] # deduplicate time period names time_periods = [] for t in raw_time_periods: @@ -490,22 +492,27 @@ def _apply_digital_encoding(dataset, digital_encodings): return dataset -def _scan_for_unused_names(tokens): +def _scan_for_unused_names(whale, tokens): """ Scan all spec files to find unused skim variable names. Parameters ---------- + whale : Whale tokens : Collection[str] Returns ------- Set[str] """ - configs_dir_list = inject.get_injectable("configs_dir") + configs_dir_list = whale.filesystem.get_configs_dir() configs_dir_list = ( - [configs_dir_list] if isinstance(configs_dir_list, str) else configs_dir_list + [configs_dir_list] + if isinstance(configs_dir_list, (str, Path)) + else configs_dir_list ) + if isinstance(configs_dir_list, tuple): + configs_dir_list = list(configs_dir_list) assert isinstance(configs_dir_list, list) for directory in configs_dir_list: @@ -524,10 +531,10 @@ def _scan_for_unused_names(tokens): return tokens -def _drop_unused_names(dataset): +def _drop_unused_names(whale, dataset): logger.info("scanning for unused skims") tokens = set(dataset.variables.keys()) - set(dataset.coords.keys()) - unused_tokens = _scan_for_unused_names(tokens) + unused_tokens = _scan_for_unused_names(whale, tokens) if unused_tokens: baggage = dataset.digital_encoding.baggage(None) unused_tokens -= baggage @@ -583,7 +590,6 @@ def load_sparse_maz_skims( data_file_resolver = config.data_file_path if zone_system in [TWO_ZONE, THREE_ZONE]: - # maz maz_taz = pd.read_csv(data_file_resolver(maz2taz_file_name, mandatory=True)) maz_taz = maz_taz[["MAZ", "TAZ"]].set_index("MAZ").sort_index() @@ -623,7 +629,6 @@ def load_sparse_maz_skims( max_blend_distance = {"DEFAULT": max_blend_distance} for file_name in maz_to_maz_tables: - df = pd.read_csv(data_file_resolver(file_name, mandatory=True)) if remapper is not None: df.OMAZ = df.OMAZ.map(remapper.get) @@ -647,12 +652,13 @@ def load_sparse_maz_skims( return dataset -def load_skim_dataset_to_shared_memory(skim_tag="taz"): +def load_skim_dataset_to_shared_memory(whale, skim_tag="taz") -> xr.Dataset: """ Load skims from disk into shared memory. Parameters ---------- + whale : Whale skim_tag : str, default "taz" Returns @@ -662,17 +668,17 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): from ..core.los import ONE_ZONE # TODO:SHARROW: taz and maz are the same - network_los_preload = inject.get_injectable("network_los_preload", None) + network_los_preload = whale.get_injectable("network_los_preload", None) if network_los_preload is None: raise ValueError("missing network_los_preload") # find which OMX files are to be used. 
- omx_file_paths = config.expand_input_file_list( + omx_file_paths = whale.filesystem.expand_input_file_list( network_los_preload.omx_file_names(skim_tag), ) zarr_file = network_los_preload.zarr_file_name(skim_tag) - if config.setting("disable_zarr", False): + if whale.settings.disable_zarr: # we can disable the zarr optimizations by setting the `disable_zarr` # flag in the master config file to True zarr_file = None @@ -694,10 +700,10 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): ) backing = f"memmap:{mmap_file}" - land_use = inject.get_table("land_use") + land_use = whale.get_dataframe("land_use") - if f"_original_{land_use.index.name}" in land_use.to_frame(): - land_use_zone_ids = land_use.to_frame()[f"_original_{land_use.index.name}"] + if f"_original_{land_use.index.name}" in land_use: + land_use_zone_ids = land_use[f"_original_{land_use.index.name}"] remapper = dict(zip(land_use_zone_ids, land_use_zone_ids.index)) else: remapper = None @@ -766,7 +772,7 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): ), ) - d = _drop_unused_names(d) + d = _drop_unused_names(whale, d) # apply non-zarr dependent digital encoding d = _apply_digital_encoding(d, skim_digital_encoding) @@ -817,11 +823,11 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): return d.shm.to_shared_memory(backing, mode="r") -@inject.injectable(cache=True) -def skim_dataset(): - return load_skim_dataset_to_shared_memory() +@workflow.cached_object +def skim_dataset(whale: workflow.Whale) -> xr.Dataset: + return load_skim_dataset_to_shared_memory(whale) -@inject.injectable(cache=True) -def tap_dataset(): - return load_skim_dataset_to_shared_memory("tap") +@workflow.cached_object +def tap_dataset(whale: workflow.Whale) -> xr.Dataset: + return load_skim_dataset_to_shared_memory(whale, "tap") diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py index 450b98d25..e1638fc6a 100644 --- a/activitysim/core/skim_dict_factory.py +++ b/activitysim/core/skim_dict_factory.py @@ -52,7 +52,7 @@ def shape(self): class SkimInfo(object): - def __init__(self, skim_tag, network_los): + def __init__(self, whale, skim_tag, network_los): """ skim_tag: str (e.g. 'TAZ') @@ -89,9 +89,9 @@ def __init__(self, skim_tag, network_los): self.block_offsets = None if skim_tag: - self.load_skim_info(skim_tag) + self.load_skim_info(whale, skim_tag) - def load_skim_info(self, skim_tag): + def load_skim_info(self, whale, skim_tag): """ Read omx files for skim (e.g. 
'TAZ') and build skim_info dict @@ -103,7 +103,7 @@ def load_skim_info(self, skim_tag): omx_file_names = self.network_los.omx_file_names(skim_tag) - self.omx_file_paths = config.expand_input_file_list(omx_file_names) + self.omx_file_paths = whale.filesystem.expand_input_file_list(omx_file_names) # ignore any 3D skims not in skim_time_periods # specifically, load all skims except those with key2 not in dim3_tags_to_load @@ -265,8 +265,8 @@ def _skim_data_from_buffer(self, skim_info, skim_buffer): def _memmap_skim_data_path(self, skim_tag): return os.path.join(config.get_cache_dir(), f"cached_{skim_tag}.mmap") - def load_skim_info(self, skim_tag): - return SkimInfo(skim_tag, self.network_los) + def load_skim_info(self, whale, skim_tag): + return SkimInfo(whale, skim_tag, self.network_los) def _read_skims_from_omx(self, skim_info, skim_data): """ diff --git a/activitysim/core/skim_dictionary.py b/activitysim/core/skim_dictionary.py index a1897b536..b3ff80780 100644 --- a/activitysim/core/skim_dictionary.py +++ b/activitysim/core/skim_dictionary.py @@ -146,7 +146,7 @@ def map(self, zone_ids): return offsets -class SkimDict(object): +class SkimDict: """ A SkimDict object is a wrapper around a dict of multiple skim objects, where each object is identified by a key. @@ -154,7 +154,7 @@ class SkimDict(object): Note that keys are either strings or tuples of two strings (to support stacking of skims.) """ - def __init__(self, skim_tag, skim_info, skim_data): + def __init__(self, whale, skim_tag, skim_info, skim_data): logger.info(f"SkimDict init {skim_tag}") @@ -162,8 +162,8 @@ def __init__(self, skim_tag, skim_info, skim_data): self.skim_info = skim_info self.usage = set() # track keys of skims looked up - self.offset_mapper = ( - self._offset_mapper() + self.offset_mapper = self._offset_mapper( + whale ) # (in function so subclass can override) self.omx_shape = skim_info.omx_shape @@ -184,7 +184,7 @@ def __init__(self, skim_tag, skim_info, skim_data): f"SkimDict.build_3d_skim_block_offset_table registered {len(self.skim_dim3)} 3d keys" ) - def _offset_mapper(self): + def _offset_mapper(self, whale): """ Return an OffsetMapper to set self.offset_mapper for use with skims This allows subclasses (e.g. MazSkimDict) to 'tweak' the parent offset mapper. 
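The skim_dataset.py hunk above replaces @inject.injectable(cache=True) with @workflow.cached_object, and consumers elsewhere in this patch fetch the result with whale.get_injectable("skim_dataset"). A sketch of that round trip, under the assumption that the decorator registers the function under its own name and caches its return value on the whale, with made-up dataset contents and a hypothetical resource name:

import xarray as xr
from activitysim.core import workflow

@workflow.cached_object
def example_skims(whale: workflow.Whale) -> xr.Dataset:
    # hypothetical shared resource: built once per run, then reused from the whale's context
    return xr.Dataset({"DIST": (("otaz", "dtaz"), [[0.0, 1.5], [1.5, 0.0]])})

# consumers ask the workflow state for it by name, as get_skim_dict does above
dist = whale.get_injectable("example_skims")["DIST"]
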
@@ -671,7 +671,7 @@ def __init__(self, skim_tag, network_los, taz_skim_dict): ) self.sparse_key_usage = set() - def _offset_mapper(self): + def _offset_mapper(self, whale): """ return an OffsetMapper to map maz zone_ids to taz skim indexes Specifically, an offset_series with MAZ zone_id index and TAZ skim array offset values @@ -684,13 +684,13 @@ def _offset_mapper(self): """ # use taz offset_mapper to create series mapping directly from MAZ to TAZ skim index - taz_offset_mapper = super()._offset_mapper() - maz_taz = self.network_los.get_maz_to_taz_series + taz_offset_mapper = super()._offset_mapper(whale) + maz_taz = self.network_los.get_maz_to_taz_series(whale) maz_to_skim_offset = taz_offset_mapper.map(maz_taz) if isinstance(maz_to_skim_offset, np.ndarray): maz_to_skim_offset = pd.Series( - maz_to_skim_offset, self.network_los.get_maz_to_taz_series.index + maz_to_skim_offset, self.network_los.get_maz_to_taz_series(whale).index ) # bug # MAZ diff --git a/activitysim/core/steps/output.py b/activitysim/core/steps/output.py index 8b8a8e3be..04de513b1 100644 --- a/activitysim/core/steps/output.py +++ b/activitysim/core/steps/output.py @@ -6,8 +6,7 @@ import numpy as np import pandas as pd -from activitysim.core import config, inject, pipeline -from activitysim.core.config import setting +from activitysim.core import config, inject, workflow logger = logging.getLogger(__name__) @@ -32,7 +31,6 @@ def track_skim_usage(output_dir): mode = "wb" if sys.version_info < (3,) else "w" with open(config.output_file_path("skim_usage.txt"), mode) as output_file: - print("\n### skim_dict usage", file=output_file) for key in skim_dict.get_skim_usage(): print(key, file=output_file) @@ -52,7 +50,7 @@ def track_skim_usage(output_dir): print(key, file=output_file) -def previous_write_data_dictionary(output_dir): +def previous_write_data_dictionary(whale: workflow.Whale, output_dir): """ Write table_name, number of rows, columns, and bytes for each checkpointed table @@ -67,13 +65,12 @@ def previous_write_data_dictionary(output_dir): csv_format = model_settings.get("csv_format", "data_dict.csv") if txt_format: - output_file_path = config.output_file_path(txt_format) pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 - output_tables = pipeline.checkpointed_tables() + output_tables = whale.checkpointed_tables() # write data dictionary for all checkpointed_tables @@ -156,11 +153,9 @@ def write_data_dictionary(output_dir): # annotate schema.info with name of checkpoint columns were first seen for _, row in pipeline.get_checkpoints().iterrows(): - checkpoint_name = row[pipeline.CHECKPOINT_NAME] for table_name in table_names: - # no change to table in this checkpoint if row.get(table_name, None) != checkpoint_name: continue @@ -190,7 +185,6 @@ def write_data_dictionary(output_dir): if txt_format: with open(config.output_file_path(txt_format), "w") as output_file: - # get max schema column widths from omnibus table col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df} @@ -215,7 +209,7 @@ def write_data_dictionary(output_dir): print(f"{info}\n", file=output_file) -def write_tables(output_dir): +def write_tables(whale, output_dir): """ Write pipeline tables as csv files (in output directory) as specified by output_tables list in settings file. 
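The output steps follow the same conversion as the rest of the patch: module-level config.setting(...) lookups become attribute access on the typed settings object carried by the workflow state, for example whale.settings.output_tables in the next hunk and whale.settings.sharrow in los.py above. A rough before/after sketch, with the settings fields taken from this patch but the surrounding setup (a whale in scope) assumed:

# old style: stringly-typed lookups against the global settings file
# output_tables_settings = setting("output_tables")
# sharrow_enabled = config.setting("sharrow", False)

# new style: attribute access on the Settings object attached to the workflow state
output_tables_settings = whale.settings.output_tables
sharrow_enabled = whale.settings.sharrow
zarr_disabled = whale.settings.disable_zarr
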
@@ -258,9 +252,7 @@ def write_tables(output_dir): """ - output_tables_settings_name = "output_tables" - - output_tables_settings = setting(output_tables_settings_name) + output_tables_settings = whale.settings.output_tables if output_tables_settings is None: logger.info("No output_tables specified in settings file. Nothing to write.") @@ -285,7 +277,6 @@ def write_tables(output_dir): ) for table_name in output_tables_list: - if not isinstance(table_name, str): table_decode_cols = table_name.get("decode_columns", {}) table_name = table_name["tablename"] diff --git a/activitysim/core/test/extensions/steps.py b/activitysim/core/test/extensions/steps.py index baa894c69..05eaa79fe 100644 --- a/activitysim/core/test/extensions/steps.py +++ b/activitysim/core/test/extensions/steps.py @@ -1,31 +1,31 @@ import pandas as pd -from activitysim.core import inject, pipeline, tracing +from activitysim.core import inject, pipeline, tracing, workflow -@inject.step() -def step1(): +@workflow.step +def step1(whale: workflow.Whale): table1 = pd.DataFrame({"c": [1, 2, 3]}) inject.add_table("table1", table1) -@inject.step() -def step2(): +@workflow.step +def step2(whale: workflow.Whale): table1 = pd.DataFrame({"c": [2, 4, 6]}) inject.add_table("table2", table1) -@inject.step() -def step3(): +@workflow.step +def step3(whale: workflow.Whale): table1 = pd.DataFrame({"c": [3, 6, 9]}) inject.add_table("table3", table1) -@inject.step() -def step_add_col(): +@workflow.step +def step_add_col(whale: workflow.Whale): table_name = inject.get_step_arg("table_name") assert table_name is not None @@ -39,11 +39,11 @@ def step_add_col(): table[col_name] = table.index + (1000 * len(table.columns)) - pipeline.replace_table(table_name, table) + whale.add_table(table_name, table) -@inject.step() -def step_forget_tab(): +@workflow.step +def step_forget_tab(whale: workflow.Whale): table_name = inject.get_step_arg("table_name") assert table_name is not None @@ -53,8 +53,8 @@ def step_forget_tab(): pipeline.drop_table(table_name) -@inject.step() -def create_households(trace_hh_id): +@workflow.step +def create_households(whale: workflow.Whale, trace_hh_id): df = pd.DataFrame({"household_id": [1, 2, 3], "home_zone_id": {100, 100, 101}}) inject.add_table("households", df) diff --git a/activitysim/core/test/test_assign.py b/activitysim/core/test/test_assign.py index 381871106..b6b92764a 100644 --- a/activitysim/core/test/test_assign.py +++ b/activitysim/core/test/test_assign.py @@ -52,7 +52,9 @@ def data(data_name): def test_read_model_spec(): - spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) + spec = assign.read_assignment_spec( + whale.filesystem.get_config_file_path("assignment_spec.csv") + ) assert len(spec) == 8 @@ -61,7 +63,9 @@ def test_read_model_spec(): def test_assign_variables(capsys, data): - spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) + spec = assign.read_assignment_spec( + whale.filesystem.get_config_file_path("assignment_spec.csv") + ) locals_d = {"CONSTANT": 7, "_shadow": 99} @@ -111,7 +115,7 @@ def test_assign_variables(capsys, data): def test_assign_variables_aliased(capsys, data): spec = assign.read_assignment_spec( - config.config_file_path("assignment_spec_alias_df.csv") + whale.filesystem.get_config_file_path("assignment_spec_alias_df.csv") ) locals_d = {"CONSTANT": 7, "_shadow": 99} @@ -156,7 +160,7 @@ def test_assign_variables_failing(capsys, data): tracing.config_logger(basic=True) spec = assign.read_assignment_spec( - 
config.config_file_path("assignment_spec_failing.csv") + whale.filesystem.get_config_file_path("assignment_spec_failing.csv") ) locals_d = { diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index 8253149e1..9f881181d 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -114,7 +114,7 @@ def test_make_choices_only_one(): probs = pd.DataFrame( [[1, 0, 0], [0, 1, 0]], columns=["a", "b", "c"], index=["x", "y"] ) - choices, rands = logit.make_choices(probs) + choices, rands = logit.make_choices(whale, probs) pdt.assert_series_equal( choices, pd.Series([0, 1], index=["x", "y"]), check_dtype=False @@ -123,7 +123,7 @@ def test_make_choices_only_one(): def test_make_choices_real_probs(utilities): probs = logit.utils_to_probs(utilities, trace_label=None) - choices, rands = logit.make_choices(probs) + choices, rands = logit.make_choices(whale, probs) pdt.assert_series_equal( choices, diff --git a/activitysim/core/test/test_skim.py b/activitysim/core/test/test_skim.py index a1e47779a..e235b82c2 100644 --- a/activitysim/core/test/test_skim.py +++ b/activitysim/core/test/test_skim.py @@ -35,7 +35,7 @@ def test_skims(data): skim_info.omx_shape = omx_shape skim_info.dtype_name = "int" - skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict(whale, "taz", skim_info, skim_data) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims = skim_dict.wrap("taz_l", "taz_r") @@ -73,7 +73,7 @@ def test_3dskims(data): skim_info.dtype_name = "int" skim_info.key1_block_offsets = {"SOV": 0} - skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict(whale, "taz", skim_info, skim_data) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims3d = skim_dict.wrap_3d(orig_key="taz_l", dest_key="taz_r", dim3_key="period") diff --git a/activitysim/core/test/test_timetable.py b/activitysim/core/test/test_timetable.py index e8ac8e555..0e02e05af 100644 --- a/activitysim/core/test/test_timetable.py +++ b/activitysim/core/test/test_timetable.py @@ -56,7 +56,7 @@ def tdd_alts(): def test_basic(persons, tdd_alts): - with chunk.chunk_log("test_basic", base=True): + with chunk.chunk_log("test_basic", base=True, settings=whale.settings): person_windows = tt.create_timetable_windows(persons, tdd_alts) diff --git a/activitysim/core/timetable.py b/activitysim/core/timetable.py index a2106bbe2..c70d80893 100644 --- a/activitysim/core/timetable.py +++ b/activitysim/core/timetable.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, pipeline +from activitysim.core import chunk, workflow logger = logging.getLogger(__name__) @@ -442,7 +442,7 @@ def get_windows_df(self): # assert (self.windows_df.values == self.windows).all() return self.windows_df - def replace_table(self): + def replace_table(self, whale: workflow.Whale): """ Save or replace windows_df DataFrame to pipeline with saved table name (specified when object instantiated.) 
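The test extension steps above illustrate the new step convention: @workflow.step instead of @inject.step(), with the workflow state passed in as the first argument, and tables written back via whale.add_table, which replaces pipeline.replace_table as in the timetable hunk that follows. A minimal sketch of a user-defined step under those conventions (the step name, table names, and column are hypothetical):

import pandas as pd
from activitysim.core import workflow

@workflow.step
def summarize_households(whale: workflow.Whale):
    # read a table that an earlier step has checkpointed
    households = whale.get_dataframe("households")
    summary = households.groupby("home_zone_id").size().to_frame("n_households")
    # hand the result back to the pipeline so it is checkpointed with this step
    whale.add_table("household_summary", summary)
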
@@ -464,7 +464,7 @@ def replace_table(self): # get windows_df from bottleneck function in case updates to self.person_window # do not write through to pandas dataframe - pipeline.replace_table(self.windows_table_name, self.get_windows_df()) + whale.add_table(self.windows_table_name, self.get_windows_df()) def tour_available(self, window_row_ids, tdds): """ @@ -632,7 +632,7 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): assert len(window_row_ids) == len(periods) trace_label = "tt.adjacent_window_run_length" - with chunk.chunk_log(trace_label): + with chunk.chunk_log(trace_label, settings=whale.settings): available_run_length = _available_run_length_2( self.windows, self.window_row_ix._mapper, diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index d00fe1614..1aa416380 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -14,7 +14,8 @@ import pandas as pd import yaml -from ..core.workflow.steps import workflow_cached_object, workflow_step +from activitysim.core import workflow + from . import config # Configurations @@ -177,7 +178,7 @@ def config_logger(basic=False, whale=None): log_config_file = None else: if whale is None: - log_config_file = config.config_file_path( + log_config_file = whale.filesystem.get_config_file_path( LOGGING_CONF_FILE_NAME, mandatory=False ) else: @@ -256,8 +257,8 @@ def print_summary(label, df, describe=False, value_counts=False): logger.info("%s summary:\n%s" % (label, df.describe())) -@workflow_step(inplace=True) -def initialize_traceable_tables(whale): +@workflow.step +def initialize_traceable_tables(whale: workflow.Whale): whale.set("traceable_table_ids", {}) @@ -538,15 +539,15 @@ def get_trace_target(whale, df, slicer, column=None): Parameters ---------- + whale : Whale df: pandas.DataFrame dataframe to slice slicer: str name of column or index to use for slicing + column : Any Returns ------- - (target, column) tuple - target : int or list of ints id or ids that identify tracer target rows column : str @@ -589,7 +590,8 @@ def get_trace_target(whale, df, slicer, column=None): return target_ids, column -def trace_targets(whale, df, slicer=None, column=None): +@workflow.func +def trace_targets(whale: workflow.Whale, df, slicer=None, column=None): target_ids, column = get_trace_target(whale, df, slicer, column) @@ -606,9 +608,10 @@ def trace_targets(whale, df, slicer=None, column=None): return targets -def has_trace_targets(df, slicer=None, column=None): +@workflow.func +def has_trace_targets(whale: workflow.Whale, df, slicer=None, column=None): - target_ids, column = get_trace_target(df, slicer, column) + target_ids, column = get_trace_target(whale, df, slicer, column) if target_ids is None: found = False diff --git a/activitysim/core/workflow/__init__.py b/activitysim/core/workflow/__init__.py index e746ba454..e8f9db45d 100644 --- a/activitysim/core/workflow/__init__.py +++ b/activitysim/core/workflow/__init__.py @@ -1 +1,5 @@ -from .steps import workflow_cached_object, workflow_step, workflow_table +from .state import Whale +from .steps import func +from .steps import workflow_cached_object as cached_object +from .steps import workflow_step as step +from .steps import workflow_table as table diff --git a/activitysim/core/pipeline.py b/activitysim/core/workflow/state.py similarity index 87% rename from activitysim/core/pipeline.py rename to activitysim/core/workflow/state.py index 9c24e5f02..e36d5cd5c 100644 --- a/activitysim/core/pipeline.py +++ b/activitysim/core/workflow/state.py @@ 
-1,6 +1,5 @@ # ActivitySim # See full license in LICENSE.txt. -import contextlib import datetime as dt import logging import os @@ -9,12 +8,12 @@ from typing import Any import pandas as pd -from pypyr.context import Context +import xarray as xr +from pypyr.context import Context, KeyNotInContextError -from ..core.configuration import FileSystem, NetworkSettings, Settings -from ..core.exceptions import PipelineAccessError -from ..core.workflow.steps import run_named_step -from ..core.workflow.util import get_formatted_or_default +from activitysim.core.configuration import FileSystem, NetworkSettings, Settings +from activitysim.core.exceptions import WhaleAccessError +from activitysim.core.workflow.steps import run_named_step logger = logging.getLogger(__name__) @@ -58,6 +57,28 @@ def split_arg(s, sep, default=""): return arg, val +class WhaleAttr: + def __init__(self, member_type): + self.member_type = member_type + + def __set_name__(self, owner, name): + self.name = name + + def __get__(self, instance, objtype=None): + try: + return instance.context[self.name] + except (KeyError, AttributeError): + raise WhaleAccessError(f"{self.name} not initialized for this whale") + + def __set__(self, instance, value): + if not isinstance(value, self.member_type): + raise TypeError(f"{self.name} must be {self.member_type} not {type(value)}") + instance.context[self.name] = value + + def __delete__(self, instance): + self.__set__(instance, None) + + class Whale: def __init__(self, context=None): if context is None: @@ -75,7 +96,7 @@ def init_state(self, pipeline_file_format="parquet"): # array of checkpoint dicts self.checkpoints = [] - from .random import Random + from activitysim.core.random import Random # TOP? self.context["prng"] = Random() @@ -84,52 +105,56 @@ def init_state(self, pipeline_file_format="parquet"): self.pipeline_store = None self._is_open = False - from .tracing import initialize_traceable_tables + from activitysim.core.tracing import initialize_traceable_tables # TOP? 
initialize_traceable_tables(self) self.context["_salient_tables"] = {} - @property - def filesystem(self) -> FileSystem: - try: - return self.context["filesystem"] - except KeyError: - raise PipelineAccessError("filesystem not initialized for this pipeline") - - @filesystem.setter - def filesystem(self, fs: FileSystem): - if not isinstance(fs, FileSystem): - raise TypeError(f"filesystem must be FileSystem not {type(fs)}") - self.context["filesystem"] = fs - - @property - def settings(self) -> Settings: - try: - return self.context["settings"] - except KeyError: - raise PipelineAccessError("settings not initialized for this pipeline") - - @settings.setter - def settings(self, s: Settings): - if not isinstance(s, Settings): - raise TypeError(f"settings must be Settings not {type(s)}") - self.context["settings"] = s - - @property - def network_settings(self) -> NetworkSettings: - try: - return self.context["network_settings"] - except KeyError: - raise PipelineAccessError( - "network_settings not initialized for this pipeline" - ) + filesystem = WhaleAttr(FileSystem) + settings = WhaleAttr(Settings) + network_settings = WhaleAttr(NetworkSettings) - @network_settings.setter - def network_settings(self, s: NetworkSettings): - if not isinstance(s, NetworkSettings): - raise TypeError(f"settings must be NetworkSettings not {type(s)}") - self.context["network_settings"] = s + # @property + # def filesystem(self) -> FileSystem: + # try: + # return self.context["filesystem"] + # except KeyError: + # raise WhaleAccessError("filesystem not initialized for this pipeline") + # + # @filesystem.setter + # def filesystem(self, fs: FileSystem): + # if not isinstance(fs, FileSystem): + # raise TypeError(f"filesystem must be FileSystem not {type(fs)}") + # self.context["filesystem"] = fs + # + # @property + # def settings(self) -> Settings: + # try: + # return self.context["settings"] + # except KeyError: + # raise WhaleAccessError("settings not initialized for this pipeline") + # + # @settings.setter + # def settings(self, s: Settings): + # if not isinstance(s, Settings): + # raise TypeError(f"settings must be Settings not {type(s)}") + # self.context["settings"] = s + # + # @property + # def network_settings(self) -> NetworkSettings: + # try: + # return self.context["network_settings"] + # except KeyError: + # raise WhaleAccessError( + # "network_settings not initialized for this pipeline" + # ) + # + # @network_settings.setter + # def network_settings(self, s: NetworkSettings): + # if not isinstance(s, NetworkSettings): + # raise TypeError(f"settings must be NetworkSettings not {type(s)}") + # self.context["network_settings"] = s _RUNNABLE_STEPS = {} _LOADABLE_TABLES = {} @@ -144,7 +169,7 @@ def existing_table_names(self): return self.existing_table_status.keys() @property - def existing_table_status(self): + def existing_table_status(self) -> dict: return self.context["_salient_tables"] def uncheckpointed_table_names(self): @@ -179,7 +204,7 @@ def load_table(self, tablename, overwrite=False, swallow_errors=False): logger.debug(f"loading table {tablename}") try: t = self._LOADABLE_TABLES[tablename](self.context) - except PipelineAccessError: + except WhaleAccessError: if not swallow_errors: raise else: @@ -206,15 +231,21 @@ def access(self, key, initializer): def get(self, key, default: Any = NO_DEFAULT): if default == NO_DEFAULT: try: - return self.context.get_formatted(key) - except KeyError: - alt_result = getattr(self.filesystem, key, NO_DEFAULT) - if alt_result == NO_DEFAULT: + result = 
self.context[key] + except (KeyError, KeyNotInContextError): + result = getattr(self.filesystem, key, None) + if result is None: + if key in self._LOADABLE_TABLES: + result = self._LOADABLE_TABLES[key](self.context) + elif key in self._LOADABLE_OBJECTS: + result = self._LOADABLE_OBJECTS[key](self.context) + if result is None: raise - else: - return alt_result else: - return get_formatted_or_default(self.context, key, default) + result = self.context.get(key, default) + if not isinstance(result, (xr.Dataset, xr.DataArray, pd.DataFrame, pd.Series)): + result = self.context.get_formatted_value(result) + return result def set(self, key, value): self.context[key] = value @@ -627,6 +658,11 @@ def load_checkpoint(self, checkpoint_name): logger.debug("adding channel %s" % (table_name,)) self.rng().add_channel(table_name, loaded_tables[table_name]) + @property + def current_model_name(self) -> str: + """Name of the currently running model.""" + return self.rng().step_name + def run_model(self, model_name): """ Run the specified model and add checkpoint for model_name @@ -674,7 +710,7 @@ def run_model(self, model_name): self.trace_memory_info(f"pipeline.run_model {model_name} start") - from .tracing import print_elapsed_time + from activitysim.core.tracing import print_elapsed_time t0 = print_elapsed_time() logger.info(f"#run_model running step {step_name}") @@ -807,7 +843,7 @@ def intermediate_checkpoint(self, checkpoint_name=None): return checkpoint_name in checkpoints def trace_memory_info(self, event): - from .mem import trace_memory_info + from activitysim.core.mem import trace_memory_info return trace_memory_info(event, whale=self) @@ -834,7 +870,7 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): returns: nothing, but with pipeline open """ - from .tracing import print_elapsed_time + from activitysim.core.tracing import print_elapsed_time t0 = print_elapsed_time() @@ -867,7 +903,7 @@ def run(self, models, resume_after=None, memory_sidecar_process=None): self.run_model(model) self.trace_memory_info(f"pipeline.run after {model}") - from .tracing import log_runtime + from activitysim.core.tracing import log_runtime log_runtime(self, model_name=model, start_time=t1) @@ -988,42 +1024,42 @@ def get_checkpoints(self): return df - def replace_table(self, table_name, df): - """ - Add or replace a orca table, removing any existing added orca columns - - The use case for this function is a method that calls to_frame on an orca table, modifies - it and then saves the modified. - - orca.to_frame returns a copy, so no changes are saved, and adding multiple column with - add_column adds them in an indeterminate order. - - Simply replacing an existing the table "behind the pipeline's back" by calling orca.add_table - risks pipeline to failing to detect that it has changed, and thus not checkpoint the changes. - - Parameters - ---------- - table_name : str - orca/pipeline table name - df : pandas DataFrame - """ - - assert self.is_open, f"Pipeline is not open." 
- - if df.columns.duplicated().any(): - logger.error( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) - - raise RuntimeError( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) - - self.rewrap(table_name, df) - - self.replaced_tables[table_name] = True + # def replace_table(self, table_name, df): + # """ + # Add or replace a orca table, removing any existing added orca columns + # + # The use case for this function is a method that calls to_frame on an orca table, modifies + # it and then saves the modified. + # + # orca.to_frame returns a copy, so no changes are saved, and adding multiple column with + # add_column adds them in an indeterminate order. + # + # Simply replacing an existing the table "behind the pipeline's back" by calling orca.add_table + # risks pipeline to failing to detect that it has changed, and thus not checkpoint the changes. + # + # Parameters + # ---------- + # table_name : str + # orca/pipeline table name + # df : pandas DataFrame + # """ + # + # assert self.is_open, f"Pipeline is not open." + # + # if df.columns.duplicated().any(): + # logger.error( + # "replace_table: dataframe '%s' has duplicate columns: %s" + # % (table_name, df.columns[df.columns.duplicated()]) + # ) + # + # raise RuntimeError( + # "replace_table: dataframe '%s' has duplicate columns: %s" + # % (table_name, df.columns[df.columns.duplicated()]) + # ) + # + # self.rewrap(table_name, df) + # + # self.replaced_tables[table_name] = True def extend_table(self, table_name, df, axis=0): """ @@ -1066,7 +1102,7 @@ def extend_table(self, table_name, df, axis=0): for c in missing_df_str_columns: df[c] = df[c].fillna("") - self.replace_table(table_name, df) + self.add_table(table_name, df) return df @@ -1076,11 +1112,7 @@ def drop_table(self, table_name): if self.is_table(table_name): logger.debug("drop_table dropping orca table '%s'" % table_name) self.context.pop(table_name, None) - self._TABLES.pop(table_name, None) - - if table_name in self.replaced_tables: - logger.debug("drop_table forgetting replaced_tables '%s'" % table_name) - del self.replaced_tables[table_name] + self.existing_table_status.pop(table_name) if table_name in self.last_checkpoint: logger.debug( @@ -1158,7 +1190,7 @@ def cleanup_pipeline(self): final_pipeline_file_path.joinpath(CHECKPOINT_TABLE_NAME, "None.parquet") ) - from .tracing import delete_output_files + from activitysim.core.tracing import delete_output_files logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") delete_output_files(self, "h5", ignore=[final_pipeline_file_path]) @@ -1167,6 +1199,6 @@ def cleanup_pipeline(self): # @contextlib.contextmanager def chunk_log(self, *args, **kwargs): - from .chunk import chunk_log + from activitysim.core.chunk import chunk_log return chunk_log(*args, **kwargs, settings=self.settings) diff --git a/activitysim/core/workflow/steps.py b/activitysim/core/workflow/steps.py index 843152be6..5457d3a8b 100644 --- a/activitysim/core/workflow/steps.py +++ b/activitysim/core/workflow/steps.py @@ -3,7 +3,7 @@ import importlib.util import logging import time -from inspect import getfullargspec +from inspect import get_annotations, getfullargspec from typing import Callable, Mapping from pypyr.context import Context @@ -165,8 +165,9 @@ def __call__(self, wrapped_func): The function being decorated. It should return a dictionary of context updates. 
""" - from ..pipeline import Whale + from activitysim.core.workflow import Whale + _validate_workflow_function(wrapped_func) if self._step_name is None: self._step_name = wrapped_func.__name__ logger.debug(f"found workflow_{self._kind}: {self._step_name}") @@ -314,6 +315,36 @@ def __new__(cls, wrapped_func=None, *, step_name=None): ) +def _validate_workflow_function(f): + from activitysim.core.workflow import Whale + + argspec = getfullargspec(f) + if argspec.args[0] != "whale": + raise SyntaxError("workflow.func must have `whale` as the first argument") + if argspec.annotations.get("whale") is not Whale: + raise SyntaxError( + "workflow.func must have `Whale` as the first argument annotation" + ) + + +def func(function): + """ + Wrapper for a simple workflow function. + """ + from activitysim.core.workflow import Whale + + _validate_workflow_function(function) + + def wrapper(whale, *args, **kwargs): + if not isinstance(whale, Whale): + raise TypeError( + "workflow functions must have a Whale as the first argument" + ) + return function(whale, *args, **kwargs) + + return wrapper + + # def workflow_table(func): # """ # Decorator for functions that initialize tables. diff --git a/activitysim/estimation/larch/cdap.py b/activitysim/estimation/larch/cdap.py index fdb801de0..b1e40a752 100644 --- a/activitysim/estimation/larch/cdap.py +++ b/activitysim/estimation/larch/cdap.py @@ -326,7 +326,7 @@ def read_yaml(filename, **kwargs): if person_type_map is None: raise KeyError("PERSON_TYPE_MAP missing from cdap_settings.yaml") - person_rank = cdap.assign_cdap_rank(persons, person_type_map) + person_rank = cdap.assign_cdap_rank(whale, persons, person_type_map) coefficients = read_csv( coefficients_file, diff --git a/activitysim/examples/example_estimation/scripts/infer.py b/activitysim/examples/example_estimation/scripts/infer.py index f60b94900..1075de496 100644 --- a/activitysim/examples/example_estimation/scripts/infer.py +++ b/activitysim/examples/example_estimation/scripts/infer.py @@ -10,7 +10,7 @@ import yaml from activitysim.abm.models.util import canonical_ids as cid -from activitysim.abm.models.util import tour_frequency as tf +from activitysim.core import workflow from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -69,7 +69,6 @@ def unmangle_ids(ids): def infer_cdap_activity(persons, tours, joint_tour_participants): - mandatory_tour_types = ["work", "school"] non_mandatory_tour_types = [ "escort", @@ -118,7 +117,6 @@ def infer_cdap_activity(persons, tours, joint_tour_participants): def infer_mandatory_tour_frequency(persons, tours): - num_work_tours = ( tours[tours.tour_type == "work"] .groupby("person_id") @@ -404,7 +402,6 @@ def read_tdd_alts(): def patch_tour_ids(persons, tours, joint_tour_participants): def set_tour_index(tours, parent_tour_num_col, is_joint): - group_cols = ["person_id", "tour_category", "tour_type"] if "parent_tour_num" in tours: @@ -565,7 +562,6 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): def infer_atwork_subtour_frequency(configs_dir, tours): - # first column is 'atwork_subtour_frequency' nickname, remaining columns are trip type counts alts = pd.read_csv( os.path.join(configs_dir, "atwork_subtour_frequency_alternatives.csv"), @@ -640,7 +636,7 @@ def infer_atwork_subtour_frequency(configs_dir, tours): return atwork_subtour_frequency -def patch_trip_ids(tours, trips): +def patch_trip_ids(whale: workflow.Whale, tours, trips): """ replace survey trip_ids with asim standard trip_id replace survey tour_id foreign key with 
asim standard tour_id @@ -672,7 +668,7 @@ def patch_trip_ids(tours, trips): + 1 ) - cid.set_trip_index(trips) + cid.set_trip_index(whale, trips) assert trips.index.name == ASIM_TRIP_ID trips = trips.reset_index().rename(columns={"trip_id": ASIM_TRIP_ID}) @@ -681,7 +677,6 @@ def patch_trip_ids(tours, trips): def infer_stop_frequency(configs_dir, tours, trips): - # alt,out,in # 0out_0in,0,0 # 0out_1in,0,1 @@ -707,7 +702,6 @@ def infer_stop_frequency(configs_dir, tours, trips): def read_tables(input_dir, tables): - for table, info in tables.items(): table = pd.read_csv( os.path.join(input_dir, info["file_name"]), index_col=info.get("index") @@ -730,7 +724,6 @@ def read_tables(input_dir, tables): def check_controls(table_name, column_name): - table = survey_tables[table_name].get("table") c_table = control_tables[table_name].get("table") @@ -755,7 +748,6 @@ def check_controls(table_name, column_name): def infer(configs_dir, input_dir, output_dir): - households, persons, tours, joint_tour_participants, trips = read_tables( input_dir, survey_tables ) diff --git a/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py b/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py index 74e909856..594102c8c 100644 --- a/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py +++ b/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py @@ -5,13 +5,13 @@ import numpy as np import pandas as pd -from activitysim.core import config, inject, pipeline +from activitysim.core import config, workflow logger = logging.getLogger(__name__) -@inject.step() -def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): +@workflow.step +def reassign_tour_purpose_by_poe(whale: workflow.Whale, tours, chunk_size, trace_hh_id): """ Simulates tour purpose choices after tour origin has been assigned. 
This @@ -20,7 +20,9 @@ def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): """ trace_label = "reassign_tour_purpose_by_poe" - probs_df = pd.read_csv(config.config_file_path("tour_purpose_probs_by_poe.csv")) + probs_df = pd.read_csv( + whale.filesystem.get_config_file_path("tour_purpose_probs_by_poe.csv") + ) probs_df.columns = [ col if col in ["Purpose", "Description"] else int(col) for col in probs_df.columns @@ -36,7 +38,7 @@ def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): num_tours = len(group) purpose_probs = probs_df[poe] purpose_cum_probs = purpose_probs.values.cumsum() - rands = pipeline.get_rn_generator().random_for_df(group) + rands = whale.get_rn_generator().random_for_df(group) purpose_scaled_probs = np.subtract(purpose_cum_probs, rands) purpose = np.argmax((purpose_scaled_probs + 1.0).astype("i4"), axis=1) tours_df.loc[group.index, "purpose_id"] = purpose @@ -48,6 +50,6 @@ def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): tours["tour_category"] = "non_mandatory" tours.loc[tours["tour_type"].isin(["home", "work"]), "tour_category"] = "mandatory" - pipeline.replace_table("tours", tours) + whale.add_table("tours", tours) return From 55c805c2d7ac9390e728fa79de24aa033b2858cf Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 10 Feb 2023 15:08:28 -0600 Subject: [PATCH 007/419] refactoring [ci-skip] --- activitysim/core/simulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index bc2a6da55..e94a3fd51 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -517,7 +517,7 @@ def eval_utilities( from .flow import apply_flow # import inside func to prevent circular imports locals_dict = {} - locals_dict.update(config.get_global_constants()) + locals_dict.update(whale.get_global_constants()) if locals_d is not None: locals_dict.update(locals_d) sh_util, sh_flow = apply_flow( From a794002fa48e9d1886692c3fede5323961c4ed13 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 10 Feb 2023 17:19:58 -0600 Subject: [PATCH 008/419] refactoring [skip ci] --- activitysim/abm/misc.py | 14 -- activitysim/abm/models/accessibility.py | 2 +- .../abm/models/atwork_subtour_destination.py | 11 +- .../abm/models/atwork_subtour_frequency.py | 10 +- .../abm/models/atwork_subtour_mode_choice.py | 6 +- .../abm/models/atwork_subtour_scheduling.py | 6 +- activitysim/abm/models/auto_ownership.py | 8 +- activitysim/abm/models/cdap.py | 6 +- .../abm/models/disaggregate_accessibility.py | 11 +- activitysim/abm/models/free_parking.py | 8 +- activitysim/abm/models/initialize.py | 6 +- activitysim/abm/models/initialize_tours.py | 7 +- .../abm/models/joint_tour_composition.py | 8 +- .../abm/models/joint_tour_destination.py | 9 +- .../abm/models/joint_tour_frequency.py | 8 +- .../abm/models/joint_tour_participation.py | 9 +- .../abm/models/joint_tour_scheduling.py | 6 +- activitysim/abm/models/location_choice.py | 50 +++--- .../abm/models/mandatory_scheduling.py | 8 +- .../abm/models/mandatory_tour_frequency.py | 8 +- .../abm/models/non_mandatory_destination.py | 12 +- .../abm/models/non_mandatory_scheduling.py | 9 +- .../models/non_mandatory_tour_frequency.py | 9 +- .../abm/models/parking_location_choice.py | 6 +- activitysim/abm/models/school_escorting.py | 13 +- activitysim/abm/models/stop_frequency.py | 5 +- .../abm/models/telecommute_frequency.py | 3 +- activitysim/abm/models/tour_mode_choice.py | 2 +- activitysim/abm/models/tour_od_choice.py | 9 +- 
.../models/tour_scheduling_probabilistic.py | 4 +- .../abm/models/transit_pass_ownership.py | 3 +- .../abm/models/transit_pass_subsidy.py | 3 +- .../abm/models/trip_departure_choice.py | 1 + activitysim/abm/models/trip_destination.py | 22 +-- activitysim/abm/models/trip_mode_choice.py | 2 +- activitysim/abm/models/trip_purpose.py | 6 +- .../models/trip_purpose_and_destination.py | 9 +- activitysim/abm/models/trip_scheduling.py | 3 +- .../abm/models/trip_scheduling_choice.py | 9 +- activitysim/abm/models/util/cdap.py | 48 ++++-- activitysim/abm/models/util/estimation.py | 4 +- activitysim/abm/models/util/mode.py | 1 + .../models/util/probabilistic_scheduling.py | 2 +- activitysim/abm/models/util/test/test_cdap.py | 16 +- .../test/test_vectorize_tour_scheduling.py | 2 +- .../abm/models/util/tour_destination.py | 24 ++- activitysim/abm/models/util/tour_od.py | 6 +- .../abm/models/util/tour_scheduling.py | 5 +- .../models/util/vectorize_tour_scheduling.py | 3 +- activitysim/abm/models/vehicle_allocation.py | 3 +- activitysim/abm/models/vehicle_type_choice.py | 3 +- activitysim/abm/models/work_from_home.py | 9 +- .../abm/tables/disaggregate_accessibility.py | 4 +- activitysim/abm/tables/shadow_pricing.py | 6 +- activitysim/abm/tables/skims.py | 5 +- activitysim/abm/tables/table_dict.py | 22 +-- activitysim/abm/tables/vehicles.py | 2 +- activitysim/benchmarking/componentwise.py | 14 +- activitysim/cli/run.py | 6 +- activitysim/core/chunk.py | 81 +++------- activitysim/core/configuration/filesystem.py | 34 ++++ activitysim/core/expressions.py | 11 +- .../core/interaction_sample_simulate.py | 55 ++++--- activitysim/core/interaction_simulate.py | 4 +- activitysim/core/logit.py | 2 +- activitysim/core/los.py | 5 +- activitysim/core/mem.py | 6 +- activitysim/core/mp_tasks.py | 2 +- activitysim/core/pathbuilder.py | 3 +- activitysim/core/simulate.py | 153 ++++++++++++------ activitysim/core/skim_dataset.py | 4 +- activitysim/core/steps/output.py | 2 +- activitysim/core/test/extensions/steps.py | 8 +- activitysim/core/test/test_simulate.py | 4 +- activitysim/core/test/test_tracing.py | 2 +- activitysim/core/tracing.py | 23 ++- other_resources/scripts/simulation.py | 2 +- other_resources/verification/simulation.py | 2 +- 78 files changed, 512 insertions(+), 407 deletions(-) diff --git a/activitysim/abm/misc.py b/activitysim/abm/misc.py index b5cf20c67..99c9579d1 100644 --- a/activitysim/abm/misc.py +++ b/activitysim/abm/misc.py @@ -60,20 +60,6 @@ def override_hh_ids(whale: workflow.Whale): return household_ids -# @workflow_object -# def trace_hh_id(whale: Whale): -# -# id = whale.settings.trace_hh_id -# -# if id and not isinstance(id, int): -# logger.warning( -# "setting trace_hh_id is wrong type, should be an int, but was %s" % type(id) -# ) -# id = None -# -# return id - - @workflow.cached_object def trace_od(whale: workflow.Whale): diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index 32d61abb9..47f848e78 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -108,7 +108,7 @@ def compute_accessibilities_for_zones( if trace_assigned_locals: tracing.write_csv( - trace_assigned_locals, file_name="accessibility_locals" + whale, trace_assigned_locals, file_name="accessibility_locals" ) return accessibility_df diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 9ec233800..98bb2ba41 100644 --- 
a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -12,7 +12,7 @@ @workflow.step def atwork_subtour_destination( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size ): trace_label = "atwork_subtour_destination" model_settings_file_name = "atwork_subtour_destination.yaml" @@ -33,8 +33,7 @@ def atwork_subtour_destination( sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) persons_merged = persons_merged.to_frame() @@ -47,7 +46,7 @@ def atwork_subtour_destination( tracing.no_results("atwork_subtour_destination") return - estimator = estimation.manager.begin_estimation("atwork_subtour_destination") + estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') @@ -62,6 +61,7 @@ def atwork_subtour_destination( estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( + whale, subtours, persons_merged, want_logsums, @@ -70,7 +70,6 @@ def atwork_subtour_destination( network_los, estimator, chunk_size, - trace_hh_id, trace_label, ) @@ -100,7 +99,7 @@ def atwork_subtour_destination( # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) whale.extend_table(sample_table_name, save_sample_df) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df( tours, label="atwork_subtour_destination", columns=["destination"] ) diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index 34354498a..e72c57ca0 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -19,9 +19,7 @@ def add_null_results(whale, trace_label, tours): @workflow.step -def atwork_subtour_frequency( - whale: workflow.Whale, tours, persons_merged, chunk_size, trace_hh_id -): +def atwork_subtour_frequency(whale: workflow.Whale, tours, persons_merged, chunk_size): """ This model predicts the frequency of making at-work subtour tours (alternatives for this model come from a separate csv file which is @@ -30,8 +28,7 @@ def atwork_subtour_frequency( trace_label = "atwork_subtour_frequency" model_settings_file_name = "atwork_subtour_frequency.yaml" - - tours = tours.to_frame() + trace_hh_id = whale.settings.trace_hh_id work_tours = tours[tours.tour_type == "work"] # - if no work_tours @@ -40,7 +37,7 @@ def atwork_subtour_frequency( return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("atwork_subtour_frequency") + estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_frequency") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) @@ -80,6 +77,7 @@ def atwork_subtour_frequency( estimator.write_choosers(work_tours) choices = simulate.simple_simulate( + whale, choosers=work_tours, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py 
b/activitysim/abm/models/atwork_subtour_mode_choice.py index 32c40b20a..05faa174d 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -15,7 +15,7 @@ @workflow.step def atwork_subtour_mode_choice( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size ): """ At-work subtour mode choice simulate @@ -23,6 +23,8 @@ def atwork_subtour_mode_choice( trace_label = "atwork_subtour_mode_choice" + trace_hh_id = whale.settings.trace_hh_id + model_settings_file_name = "tour_mode_choice.yaml" model_settings = config.read_model_settings(model_settings_file_name) @@ -119,7 +121,7 @@ def atwork_subtour_mode_choice( network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") ) - estimator = estimation.manager.begin_estimation("atwork_subtour_mode_choice") + estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index c5c41f3dc..7cbe03b28 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -27,7 +27,6 @@ def atwork_subtour_scheduling( tdd_alts, skim_dict, chunk_size, - trace_hh_id, ): """ This model predicts the departure time and duration of each activity for at work subtours tours @@ -35,8 +34,7 @@ def atwork_subtour_scheduling( trace_label = "atwork_subtour_scheduling" model_settings_file_name = "tour_scheduling_atwork.yaml" - - tours = tours.to_frame() + trace_hh_id = whale.settings.trace_hh_id subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours @@ -45,7 +43,7 @@ def atwork_subtour_scheduling( return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("atwork_subtour_scheduling") + estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_scheduling") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip") diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 234c327ac..646bc5b7d 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -10,7 +10,7 @@ @workflow.step def auto_ownership_simulate( - whale: workflow.Whale, households, households_merged, chunk_size, trace_hh_id + whale: workflow.Whale, households, households_merged, chunk_size ): """ Auto ownership is a standard model which predicts how many cars a household @@ -19,8 +19,9 @@ def auto_ownership_simulate( trace_label = "auto_ownership_simulate" model_settings_file_name = "auto_ownership.yaml" model_settings = config.read_model_settings(model_settings_file_name) + trace_hh_id = whale.settings.trace_hh_id - estimator = estimation.manager.begin_estimation("auto_ownership") + estimator = estimation.manager.begin_estimation(whale, "auto_ownership") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) @@ -41,9 +42,10 @@ def auto_ownership_simulate( estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - log_alt_losers = 
config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/cdap.py b/activitysim/abm/models/cdap.py index b8b811d7b..f7eb40020 100644 --- a/activitysim/abm/models/cdap.py +++ b/activitysim/abm/models/cdap.py @@ -13,7 +13,7 @@ @workflow.step def cdap_simulate( - whale: workflow.Whale, persons_merged, persons, households, chunk_size, trace_hh_id + whale: workflow.Whale, persons_merged, persons, households, chunk_size ): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of @@ -27,11 +27,12 @@ def cdap_simulate( trace_label = "cdap" model_settings = config.read_model_settings("cdap.yaml") + trace_hh_id = whale.settings.trace_hh_id person_type_map = model_settings.get("PERSON_TYPE_MAP", None) assert ( person_type_map is not None ), f"Expected to find PERSON_TYPE_MAP setting in cdap.yaml" - estimator = estimation.manager.begin_estimation("cdap") + estimator = estimation.manager.begin_estimation(whale, "cdap") cdap_indiv_spec = simulate.read_model_spec( file_name=model_settings["INDIV_AND_HHSIZE1_SPEC"] @@ -130,6 +131,7 @@ def cdap_simulate( logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) choices = cdap.run_cdap( + whale, persons=persons_merged, person_type_map=person_type_map, cdap_indiv_spec=cdap_indiv_spec, diff --git a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index 4ca3ce11b..da07a73cf 100644 --- a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -524,7 +524,7 @@ def inject_tables(self): inject.get_injectable("traceable_tables") + list(self.proto_pop.keys()), ) for tablename, df in self.proto_pop.items(): - inject.add_table(tablename, df) + whale.add_table(tablename, df) self.whale.get_rn_generator().add_channel(tablename, df) tracing.register_traceable_table(tablename, df) @@ -567,7 +567,7 @@ def merge_persons(self): self.proto_pop["proto_persons_merged"] = persons_merged # Store in pipeline - inject.add_table("proto_persons_merged", persons_merged) + whale.add_table("proto_persons_merged", persons_merged) def get_disaggregate_logsums( @@ -592,10 +592,10 @@ def get_disaggregate_logsums( model_settings["SAMPLE_SIZE"] = disagg_model_settings.get( "DESTINATION_SAMPLE_SIZE" ) - estimator = estimation.manager.begin_estimation(trace_label) + estimator = estimation.manager.begin_estimation(whale, trace_label) if estimator: location_choice.write_estimation_specs( - estimator, model_settings, model_name + ".yaml" + whale, estimator, model_settings, model_name + ".yaml" ) # Append table references in settings with "proto_" @@ -629,7 +629,6 @@ def get_disaggregate_logsums( model_settings=model_settings, chunk_size=chunk_size, chunk_tag=trace_label, - trace_hh_id=trace_hh_id, trace_label=trace_label, skip_choice=True, ) @@ -646,6 +645,7 @@ def get_disaggregate_logsums( tours = tours[tours.tour_category == "non_mandatory"] _logsums, _ = tour_destination.run_tour_destination( + whale, tours, persons_merged, want_logsums=True, @@ -654,7 +654,6 @@ def get_disaggregate_logsums( network_los=network_los, estimator=estimator, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, skip_choice=True, ) diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index c9f891186..965ebca4c 100644 --- 
a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -9,20 +9,19 @@ @workflow.step -def free_parking( - whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id -): +def free_parking(whale: workflow.Whale, persons_merged, persons, chunk_size): """ """ trace_label = "free_parking" model_settings_file_name = "free_parking.yaml" + trace_hh_id = whale.settings.trace_hh_id choosers = persons_merged.to_frame() choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("free_parking") + estimator = estimation.manager.begin_estimation(whale, "free_parking") constants = config.get_model_constants(model_settings) @@ -56,6 +55,7 @@ def free_parking( estimator.write_choosers(choosers) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/initialize.py b/activitysim/abm/models/initialize.py index 465a95147..b6e3eaeec 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -180,7 +180,7 @@ def preload_injectables(): inject.add_step("write_data_dictionary", write_data_dictionary) inject.add_step("write_tables", write_tables) - table_list = config.setting("input_table_list") + table_list = whale.settings.input_table_list # default ActivitySim table names and indices if table_list is None: @@ -194,7 +194,7 @@ def preload_injectables(): inject.add_injectable("settings", new_settings) # FIXME undocumented feature - if config.setting("write_raw_tables"): + if whale.settings.write_raw_tables: # write raw input tables as csv (before annotation) csv_dir = config.output_file_path("raw_tables") if not os.path.exists(csv_dir): @@ -207,7 +207,7 @@ def preload_injectables(): t0 = tracing.print_elapsed_time() - if config.setting("benchmarking", False): + if whale.settings.benchmarking: # we don't want to pay for skim_dict inside any model component during # benchmarking, so we'll preload skim_dict here. Preloading is not needed # for regular operation, as activitysim components can load-on-demand. diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index c8607fc27..4e360a756 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -75,11 +75,10 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): @workflow.step -def initialize_tours( - whale: workflow.Whale, network_los, households, persons, trace_hh_id -): +def initialize_tours(whale: workflow.Whale, network_los, households, persons): trace_label = "initialize_tours" + trace_hh_id = whale.settings.trace_hh_id tours = read_input_table(whale, "tours") # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after. 
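The hunks above, and the model files that follow, repeat one conversion: @inject.step() functions become @workflow.step functions whose first argument is whale: workflow.Whale, global settings are read from whale.settings instead of config.setting(...), and modified tables are written back with whale.add_table(...). A sketch of a component written under those conventions; the component name and the added column are made up for illustration and this is not a registered ActivitySim model:

from activitysim.core import workflow


@workflow.step
def example_component(whale: workflow.Whale, persons, chunk_size):
    """Illustrative component showing the converted calling conventions."""
    trace_label = "example_component"

    # settings are read from the shared state object, not module-level config
    trace_hh_id = whale.settings.trace_hh_id

    # ... model logic would go here; a trivial stand-in column:
    persons["example_flag"] = True

    # replace_table() is gone; add_table() registers the updated table
    whale.add_table("persons", persons)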
@@ -110,7 +109,7 @@ def initialize_tours( assert tours.index.name == "tour_id" # replace table function with dataframe - inject.add_table("tours", tours) + whale.add_table("tours", tours) whale.get_rn_generator().add_channel("tours", tours) diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index 681d9e217..0db8a099b 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -19,7 +19,7 @@ def add_null_results(whale, trace_label, tours): @workflow.step def joint_tour_composition( - whale: workflow.Whale, tours, households, persons, chunk_size, trace_hh_id + whale: workflow.Whale, tours, households, persons, chunk_size ): """ This model predicts the makeup of the travel party (adults, children, or mixed). @@ -27,7 +27,6 @@ def joint_tour_composition( trace_label = "joint_tour_composition" model_settings_file_name = "joint_tour_composition.yaml" - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours @@ -36,7 +35,7 @@ def joint_tour_composition( return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("joint_tour_composition") + estimator = estimation.manager.begin_estimation(whale, "joint_tour_composition") # - only interested in households with joint_tours households = households.to_frame() @@ -86,6 +85,7 @@ def joint_tour_composition( estimator.write_choosers(joint_tours_merged) choices = simulate.simple_simulate( + whale, choosers=joint_tours_merged, spec=model_spec, nest_spec=nest_spec, @@ -116,7 +116,7 @@ def joint_tour_composition( "joint_tour_composition", joint_tours.composition, value_counts=True ) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df( joint_tours, label="joint_tour_composition.joint_tours", diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 5c5c2927e..11c474703 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -20,7 +20,6 @@ def joint_tour_destination( households_merged, network_los, chunk_size, - trace_hh_id, ): """ Given the tour generation from the above, each tour needs to have a @@ -31,14 +30,14 @@ def joint_tour_destination( trace_label = "joint_tour_destination" model_settings_file_name = "joint_tour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) + trace_hh_id = whale.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) # choosers are tours - in a sense tours are choosing their destination @@ -52,7 +51,7 @@ def joint_tour_destination( tracing.no_results("joint_tour_destination") return - estimator = estimation.manager.begin_estimation("joint_tour_destination") + estimator = estimation.manager.begin_estimation(whale, "joint_tour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') @@ -67,6 +66,7 @@ def joint_tour_destination( estimator.write_model_settings(model_settings, model_settings_file_name) 
choices_df, save_sample_df = tour_destination.run_tour_destination( + whale, tours, persons_merged, want_logsums, @@ -75,7 +75,6 @@ def joint_tour_destination( network_los, estimator, chunk_size, - trace_hh_id, trace_label, ) diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index f8cdd98a6..e8500538e 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -14,17 +14,16 @@ @workflow.step -def joint_tour_frequency( - whale: workflow.Whale, households, persons, chunk_size, trace_hh_id -): +def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size): """ This model predicts the frequency of making fully joint trips (see the alternatives above). """ trace_label = "joint_tour_frequency" model_settings_file_name = "joint_tour_frequency.yaml" + trace_hh_id = whale.settings.trace_hh_id - estimator = estimation.manager.begin_estimation("joint_tour_frequency") + estimator = estimation.manager.begin_estimation(whale, "joint_tour_frequency") model_settings = config.read_model_settings(model_settings_file_name) @@ -79,6 +78,7 @@ def joint_tour_frequency( estimator.write_choosers(multi_person_households) choices = simulate.simple_simulate( + whale, choosers=multi_person_households, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index d9806a076..3df221ad1 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -187,6 +187,7 @@ def participants_chooser(probs, choosers, spec, trace_label): diagnostic_cols = ["tour_id", "household_id", "composition", "adult"] unsatisfied_candidates = candidates[diagnostic_cols].join(probs) tracing.write_csv( + whale, unsatisfied_candidates, file_name="%s.UNSATISFIED" % trace_label, transpose=False, @@ -265,15 +266,14 @@ def add_null_results(whale, model_settings, trace_label): @workflow.step -def joint_tour_participation( - whale: workflow.Whale, tours, persons_merged, chunk_size, trace_hh_id -): +def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk_size): """ Predicts for each eligible person to participate or not participate in each joint tour. 
""" trace_label = "joint_tour_participation" model_settings_file_name = "joint_tour_participation.yaml" model_settings = config.read_model_settings(model_settings_file_name) + trace_hh_id = whale.settings.trace_hh_id tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] @@ -313,7 +313,7 @@ def joint_tour_participation( # - simple_simulate - estimator = estimation.manager.begin_estimation("joint_tour_participation") + estimator = estimation.manager.begin_estimation(whale, "joint_tour_participation") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) @@ -339,6 +339,7 @@ def joint_tour_participation( candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id) choices = simulate.simple_simulate_by_chunk_id( + whale, choosers=candidates, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index f2fb41796..6350f64fd 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -16,7 +16,7 @@ @workflow.step def joint_tour_scheduling( - whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size ): """ This model predicts the departure time and duration of each joint tour @@ -26,7 +26,7 @@ def joint_tour_scheduling( model_settings_file_name = "joint_tour_scheduling.yaml" model_settings = config.read_model_settings(model_settings_file_name) - tours = tours.to_frame() + trace_hh_id = whale.settings.trace_hh_id joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours @@ -71,7 +71,7 @@ def joint_tour_scheduling( timetable = inject.get_injectable("timetable") - estimator = estimation.manager.begin_estimation("joint_tour_scheduling") + estimator = estimation.manager.begin_estimation(whale, "joint_tour_scheduling") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip", False) diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index fa28b1e7f..6742de5b4 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -541,7 +541,7 @@ def run_location_logsums( def run_location_simulate( - whale, + whale: workflow.Whale, segment_name, persons_merged, location_sample_df, @@ -611,12 +611,17 @@ def run_location_simulate( estimator.write_interaction_sample_alternatives(alternatives) spec = simulate.spec_for_segment( - model_settings, spec_id="SPEC", segment_name=segment_name, estimator=estimator + whale, + model_settings, + spec_id="SPEC", + segment_name=segment_name, + estimator=estimator, ) log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample_simulate( + whale, choosers, alternatives, spec=spec, @@ -644,7 +649,7 @@ def run_location_simulate( def run_location_choice( - whale, + whale: workflow.Whale, persons_merged_df, network_los, shadow_price_calculator, @@ -654,7 +659,6 @@ def run_location_choice( model_settings, chunk_size, chunk_tag, - trace_hh_id, trace_label, skip_choice=False, ): @@ -675,7 +679,6 @@ def run_location_choice( estimator: Estimator object model_settings : dict chunk_size : int - trace_hh_id : int trace_label : str Returns @@ -761,7 +764,7 @@ def run_location_choice( ) if estimator: - if trace_hh_id: + if whale.settings.trace_hh_id: 
estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.modeled_choices" ) @@ -803,7 +806,7 @@ def run_location_choice( f"{trace_label} segment {segment_name} estimation: override logsums" ) - if trace_hh_id: + if whale.settings.trace_hh_id: estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.survey_choices" ) @@ -838,7 +841,7 @@ def run_location_choice( def iterate_location_choice( - whale, + whale: workflow.Whale, model_settings, persons_merged, persons, @@ -846,7 +849,6 @@ def iterate_location_choice( network_los, estimator, chunk_size, - trace_hh_id, locutor, trace_label, ): @@ -863,7 +865,6 @@ def iterate_location_choice( persons : injected table network_los : los.Network_LOS chunk_size : int - trace_hh_id : int locutor : bool whether this process is the privileged logger of shadow_pricing when multiprocessing trace_label : str @@ -936,7 +937,6 @@ def iterate_location_choice( model_settings=model_settings, chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "i%s" % iteration), ) @@ -975,7 +975,7 @@ def iterate_location_choice( ) if locutor: - spc.write_trace_files(iteration) + spc.write_trace_files(whale, iteration) if spc.use_shadow_pricing and spc.check_fit(iteration): logging.info( @@ -990,11 +990,11 @@ def iterate_location_choice( # - shadow price table if locutor: if spc.use_shadow_pricing and "SHADOW_PRICE_TABLE" in model_settings: - inject.add_table(model_settings["SHADOW_PRICE_TABLE"], spc.shadow_prices) + whale.add_table(model_settings["SHADOW_PRICE_TABLE"], spc.shadow_prices) if "MODELED_SIZE_TABLE" in model_settings: - inject.add_table(model_settings["MODELED_SIZE_TABLE"], spc.modeled_size) + whale.add_table(model_settings["MODELED_SIZE_TABLE"], spc.modeled_size) - persons_df = persons.to_frame() + persons_df = persons # add the choice values to the dest_choice_column in persons dataframe # We only chose school locations for the subset of persons who go to school @@ -1032,12 +1032,12 @@ def iterate_location_choice( whale.add_table("persons", persons_df) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if "annotate_households" in model_settings: - households_df = households.to_frame() + households_df = households expressions.assign_columns( whale, df=households_df, @@ -1046,7 +1046,7 @@ def iterate_location_choice( ) whale.add_table("households", households_df) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) if logsum_column_name: @@ -1065,7 +1065,6 @@ def workplace_location( households, network_los, chunk_size, - trace_hh_id, locutor, ): """ @@ -1077,9 +1076,11 @@ def workplace_location( trace_label = "workplace_location" model_settings = whale.filesystem.read_model_settings("workplace_location.yaml") - estimator = estimation.manager.begin_estimation("workplace_location") + estimator = estimation.manager.begin_estimation(whale, whale, "workplace_location") if estimator: - write_estimation_specs(estimator, model_settings, "workplace_location.yaml") + write_estimation_specs( + whale, estimator, model_settings, "workplace_location.yaml" + ) # FIXME - debugging code to test multiprocessing failure handling # process_name = multiprocessing.current_process().name @@ -1091,6 +1092,7 @@ def workplace_location( locutor = False iterate_location_choice( + whale, 
model_settings, persons_merged, persons, @@ -1098,7 +1100,6 @@ def workplace_location( network_los, estimator, chunk_size, - trace_hh_id, locutor, trace_label, ) @@ -1126,9 +1127,9 @@ def school_location( trace_label = "school_location" model_settings = whale.filesystem.read_model_settings("school_location.yaml") - estimator = estimation.manager.begin_estimation(whale, "school_location") + estimator = estimation.manager.begin_estimation(whale, whale, "school_location") if estimator: - write_estimation_specs(estimator, model_settings, "school_location.yaml") + write_estimation_specs(whale, estimator, model_settings, "school_location.yaml") # disable locutor for benchmarking if whale.settings.benchmarking: @@ -1143,7 +1144,6 @@ def school_location( network_los, estimator, chunk_size, - whale.settings.trace_hh_id, locutor, trace_label, ) diff --git a/activitysim/abm/models/mandatory_scheduling.py b/activitysim/abm/models/mandatory_scheduling.py index fbed6b27d..0a88d10bf 100644 --- a/activitysim/abm/models/mandatory_scheduling.py +++ b/activitysim/abm/models/mandatory_scheduling.py @@ -14,7 +14,7 @@ @workflow.step def mandatory_tour_scheduling( - whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size ): """ This model predicts the departure time and duration of each activity for mandatory tours @@ -22,10 +22,8 @@ def mandatory_tour_scheduling( model_name = "mandatory_tour_scheduling" trace_label = model_name + trace_hh_id = whale.settings.trace_hh_id - persons_merged = persons_merged.to_frame() - - tours = tours.to_frame() mandatory_tours = tours[tours.tour_category == "mandatory"] # - if no mandatory_tours @@ -50,13 +48,13 @@ def mandatory_tour_scheduling( ) choices = run_tour_scheduling( + whale, model_name, mandatory_tours, persons_merged, tdd_alts, tour_segment_col, chunk_size, - trace_hh_id, ) assign_in_place(tours, choices) diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index 3b072f369..201b8c751 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -35,15 +35,14 @@ def add_null_results(whale, trace_label, mandatory_tour_frequency_settings): @workflow.step -def mandatory_tour_frequency( - whale: workflow.Whale, persons_merged, chunk_size, trace_hh_id -): +def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): """ This model predicts the frequency of making mandatory trips (see the alternatives above) - these trips include work and school in some combination. 
""" trace_label = "mandatory_tour_frequency" model_settings_file_name = "mandatory_tour_frequency.yaml" + trace_hh_id = whale.settings.trace_hh_id model_settings = config.read_model_settings(model_settings_file_name) @@ -70,7 +69,7 @@ def mandatory_tour_frequency( trace_label=trace_label, ) - estimator = estimation.manager.begin_estimation("mandatory_tour_frequency") + estimator = estimation.manager.begin_estimation(whale, "mandatory_tour_frequency") model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) @@ -88,6 +87,7 @@ def mandatory_tour_frequency( estimator.write_choosers(choosers) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index 7f400e63e..2852eae44 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -14,7 +14,7 @@ @workflow.step def non_mandatory_tour_destination( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, network_los, chunk_size ): """ Given the tour generation from the above, each tour needs to have a @@ -25,14 +25,14 @@ def non_mandatory_tour_destination( trace_label = "non_mandatory_tour_destination" model_settings_file_name = "non_mandatory_tour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) + trace_hh_id = whale.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) tours = tours.to_frame() @@ -58,7 +58,9 @@ def non_mandatory_tour_destination( tracing.no_results(trace_label) return - estimator = estimation.manager.begin_estimation("non_mandatory_tour_destination") + estimator = estimation.manager.begin_estimation( + whale, "non_mandatory_tour_destination" + ) if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') @@ -73,6 +75,7 @@ def non_mandatory_tour_destination( estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( + whale, non_mandatory_tours, persons_merged, want_logsums, @@ -81,7 +84,6 @@ def non_mandatory_tour_destination( network_los, estimator, chunk_size, - trace_hh_id, trace_label, ) diff --git a/activitysim/abm/models/non_mandatory_scheduling.py b/activitysim/abm/models/non_mandatory_scheduling.py index 7ad7f547e..84a999225 100644 --- a/activitysim/abm/models/non_mandatory_scheduling.py +++ b/activitysim/abm/models/non_mandatory_scheduling.py @@ -13,7 +13,7 @@ @workflow.step def non_mandatory_tour_scheduling( - whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size, trace_hh_id + whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size ): """ This model predicts the departure time and duration of each activity for non-mandatory tours @@ -21,10 +21,7 @@ def non_mandatory_tour_scheduling( model_name = "non_mandatory_tour_scheduling" trace_label = model_name - - persons_merged = 
persons_merged.to_frame() - - tours = tours.to_frame() + trace_hh_id = whale.settings.trace_hh_id non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] # - if no mandatory_tours @@ -35,13 +32,13 @@ def non_mandatory_tour_scheduling( tour_segment_col = None choices = run_tour_scheduling( + whale, model_name, non_mandatory_tours, persons_merged, tdd_alts, tour_segment_col, chunk_size, - trace_hh_id, ) assign_in_place(tours, choices) diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 0b83563b7..97f41d257 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -31,7 +31,7 @@ def extension_probs(whale: workflow.Whale): def extend_tour_counts( - whale: workflow.Whale, persons, tour_counts, alternatives, trace_hh_id, trace_label + whale: workflow.Whale, persons, tour_counts, alternatives, trace_label ): """ extend tour counts based on a probability table @@ -67,12 +67,13 @@ def extend_tour_counts( """ assert tour_counts.index.name == persons.index.name + trace_hh_id = whale.settings.trace_hh_id PROBABILITY_COLUMNS = ["0_tours", "1_tours", "2_tours"] JOIN_COLUMNS = ["ptype", "has_mandatory_tour", "has_joint_tour"] TOUR_TYPE_COL = "nonmandatory_tour_type" - probs_spec = extension_probs() + probs_spec = extension_probs(whale) persons = persons[JOIN_COLUMNS] # only extend if there are 1 - 4 non_mandatory tours to start with @@ -201,7 +202,7 @@ def non_mandatory_tour_frequency( continue estimator = estimation.manager.begin_estimation( - model_name=segment_name, bundle_name="non_mandatory_tour_frequency" + whale, model_name=segment_name, bundle_name="non_mandatory_tour_frequency" ) coefficients_df = simulate.read_model_coefficients(segment_settings) @@ -233,7 +234,7 @@ def non_mandatory_tour_frequency( estimator.set_chooser_id(chooser_segment.index.name) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_simulate( whale, diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 79b4ba8e3..cb46c95f5 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -127,6 +127,7 @@ def parking_destination_simulate( locals_dict["PARKING"] = skims["op_skims"].dest_key parking_locations = interaction_sample_simulate( + whale, choosers=trips, alternatives=destination_sample, spec=spec, @@ -214,8 +215,7 @@ def run_parking_destination( parking_location_column_name = model_settings["ALT_DEST_COL_NAME"] sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) choosers = trips[trips[chooser_filter_column]] @@ -283,7 +283,6 @@ def parking_location( land_use, network_los, chunk_size, - trace_hh_id, ): """ Given a set of trips, each trip needs to have a parking location if @@ -292,6 +291,7 @@ def parking_location( trace_label = "parking_location" model_settings = config.read_model_settings("parking_location_choice.yaml") + trace_hh_id = whale.settings.trace_hh_id alt_destination_col_name = model_settings["ALT_DEST_COL_NAME"] preprocessor_settings = model_settings.get("PREPROCESSOR", None) diff --git a/activitysim/abm/models/school_escorting.py 
b/activitysim/abm/models/school_escorting.py index aef7d84ac..78bd260a7 100644 --- a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -333,7 +333,6 @@ def school_escorting( persons, tours, chunk_size, - trace_hh_id, ): """ school escorting model @@ -361,11 +360,7 @@ def school_escorting( trace_label = "school_escorting_simulate" model_settings_file_name = "school_escorting.yaml" model_settings = config.read_model_settings(model_settings_file_name) - - persons = persons.to_frame() - households = households.to_frame() - households_merged = households_merged.to_frame() - tours = tours.to_frame() + trace_hh_id = whale.settings.trace_hh_id alts = simulate.read_model_alts(whale, model_settings["ALTS"], set_index="Alt") @@ -384,7 +379,9 @@ def school_escorting( choices = None for stage_num, stage in enumerate(school_escorting_stages): stage_trace_label = trace_label + "_" + stage - estimator = estimation.manager.begin_estimation("school_escorting_" + stage) + estimator = estimation.manager.begin_estimation( + whale, "school_escorting_" + stage + ) model_spec_raw = simulate.read_model_spec( file_name=model_settings[stage.upper() + "_SPEC"] @@ -444,7 +441,7 @@ def school_escorting( estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_simulate( whale, diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 0d89c7677..172331dbb 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -19,7 +19,6 @@ def stop_frequency( stop_frequency_alts, network_los, chunk_size, - trace_hh_id, ): """ stop frequency model @@ -49,6 +48,7 @@ def stop_frequency( trace_label = "stop_frequency" model_settings_file_name = "stop_frequency.yaml" + trace_hh_id = whale.settings.trace_hh_id model_settings = config.read_model_settings(model_settings_file_name) @@ -118,7 +118,7 @@ def stop_frequency( ) estimator = estimation.manager.begin_estimation( - model_name=segment_name, bundle_name="stop_frequency" + whale, model_name=segment_name, bundle_name="stop_frequency" ) segment_spec = simulate.read_model_spec(file_name=segment_settings["SPEC"]) @@ -145,6 +145,7 @@ def stop_frequency( estimator.set_chooser_id(chooser_segment.index.name) choices = simulate.simple_simulate( + whale, choosers=chooser_segment, spec=segment_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index 5e52c4b37..213f7c3dc 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -31,7 +31,7 @@ def telecommute_frequency( logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("telecommute_frequency") + estimator = estimation.manager.begin_estimation(whale, "telecommute_frequency") constants = config.get_model_constants(model_settings) @@ -65,6 +65,7 @@ def telecommute_frequency( estimator.write_choosers(choosers) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index b3b91fc79..c8c841cd2 100644 --- 
a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -276,7 +276,7 @@ def tour_mode_choice_simulate( if whale.get_rn_generator().step_name != "tour_mode_choice_simulate": estimator = None else: - estimator = estimation.manager.begin_estimation("tour_mode_choice") + estimator = estimation.manager.begin_estimation(whale, "tour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index 6dd9fb9a4..a3cc9d9fc 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -19,7 +19,6 @@ def tour_od_choice( land_use, network_los, chunk_size, - trace_hh_id, ): """Simulates joint origin/destination choice for all tours. @@ -46,8 +45,6 @@ def tour_od_choice( lazy-loaded activitysim.los.Network_LOS object chunk_size simulation chunk size, set in main settings.yaml - trace_hh_id : int - households to trace, set in main settings.yaml """ trace_label = "tour_od_choice" @@ -56,11 +53,11 @@ def tour_od_choice( origin_col_name = model_settings["ORIG_COL_NAME"] dest_col_name = model_settings["DEST_COL_NAME"] alt_id_col = tour_od.get_od_id_col(origin_col_name, dest_col_name) + trace_hh_id = whale.settings.trace_hh_id sample_table_name = model_settings.get("OD_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) logsum_column_name = model_settings.get("OD_CHOICE_LOGSUM_COLUMN_NAME", None) @@ -71,7 +68,7 @@ def tour_od_choice( # interaction_sample_simulate insists choosers appear in same order as alts tours = tours.sort_index() - estimator = estimation.manager.begin_estimation("tour_od_choice") + estimator = estimation.manager.begin_estimation(whale, "tour_od_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_spec(model_settings, tag="SAMPLE_SPEC") diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index 3de0357fe..609897ddc 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -106,7 +106,9 @@ def tour_scheduling_probabilistic( # trip_scheduling is a probabilistic model ane we don't support estimation, # but we do need to override choices in estimation mode - estimator = estimation.manager.begin_estimation("tour_scheduling_probabilistic") + estimator = estimation.manager.begin_estimation( + whale, "tour_scheduling_probabilistic" + ) if estimator: estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index c9728b8eb..76624e3cd 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -25,7 +25,7 @@ def transit_pass_ownership( logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("transit_pass_ownership") + estimator = estimation.manager.begin_estimation(whale, "transit_pass_ownership") 
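Taken together, the hunks above all apply the same conversion: the injected trace_hh_id step argument goes away in favour of whale.settings.trace_hh_id, and the whale state object becomes the explicit first argument to estimation.manager.begin_estimation and simulate.simple_simulate. A minimal sketch of a model step after this conversion is shown below; the step name, settings file, and choosers are illustrative placeholders (not from the patch), and keyword arguments not shown here are assumed unchanged.

    from activitysim.abm.models.util import estimation
    from activitysim.core import config, simulate, tracing, workflow


    @workflow.step
    def example_frequency(whale: workflow.Whale, persons_merged, chunk_size):
        # hypothetical step, used only to illustrate the conversion pattern
        trace_label = "example_frequency"
        model_settings = config.read_model_settings("example_frequency.yaml")
        trace_hh_id = whale.settings.trace_hh_id  # was an injected step argument

        # the estimation manager now receives the state object explicitly
        estimator = estimation.manager.begin_estimation(whale, "example_frequency")

        model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"])
        nest_spec = config.get_logit_model_settings(model_settings)

        choices = simulate.simple_simulate(
            whale,  # state object is the new first positional argument
            choosers=persons_merged,
            spec=model_spec,
            nest_spec=nest_spec,
            trace_label=trace_label,
        )

        if trace_hh_id:
            tracing.trace_df(choices.to_frame("choice"), label=trace_label)
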
constants = config.get_model_constants(model_settings) @@ -59,6 +59,7 @@ def transit_pass_ownership( estimator.write_choosers(choosers) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index 4a07a803a..b7ab2ae71 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -25,7 +25,7 @@ def transit_pass_subsidy( logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("transit_pass_subsidy") + estimator = estimation.manager.begin_estimation(whale, "transit_pass_subsidy") constants = config.get_model_constants(model_settings) @@ -59,6 +59,7 @@ def transit_pass_subsidy( estimator.write_choosers(choosers) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 38cb58e78..fdb8c47f0 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -425,6 +425,7 @@ def apply_stage_two_model(whale, omnibus_spec, trips, chunk_size, trace_label): i, chooser_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_by_chunk_id( side_trips, chunk_size, trace_label ): diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 6a69eb025..7361a3bdd 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -477,7 +477,6 @@ def destination_presample( network_los, estimator, chunk_size, - trace_hh_id, trace_label, ): trace_label = tracing.extend_trace_label(trace_label, "presample") @@ -548,7 +547,6 @@ def trip_destination_sample( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label, ): """ @@ -589,6 +587,7 @@ def trip_destination_sample( ) choices = destination_presample( + whale, primary_purpose, trips, alternatives, @@ -598,12 +597,12 @@ def trip_destination_sample( network_los, estimator, chunk_size, - trace_hh_id, trace_label, ) else: choices = destination_sample( + whale, primary_purpose, trips, alternatives, @@ -815,7 +814,6 @@ def trip_destination_simulate( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label, ): """ @@ -864,6 +862,7 @@ def trip_destination_simulate( log_alt_losers = whale.settings.log_alt_losers destinations = interaction_sample_simulate( + whale, choosers=trips, alternatives=destination_sample, spec=spec, @@ -917,7 +916,6 @@ def choose_trip_destination( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label, ): logger.info("choose_trip_destination %s with %d trips", trace_label, trips.shape[0]) @@ -935,7 +933,6 @@ def choose_trip_destination( skim_hotel=skim_hotel, estimator=estimator, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, ) @@ -980,7 +977,6 @@ def choose_trip_destination( skim_hotel=skim_hotel, estimator=estimator, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, ) @@ -1136,7 +1132,6 @@ def run_trip_destination( tours_merged, estimator, chunk_size, - trace_hh_id, trace_label, fail_some_trips_for_testing=False, ): @@ -1315,7 +1310,6 @@ def run_trip_destination( skim_hotel, estimator, chunk_size, - trace_hh_id, 
trace_label=tracing.extend_trace_label( nth_trace_label, primary_purpose ), @@ -1428,7 +1422,7 @@ def trip_destination( trips_df, school_escort_trips ) - estimator = estimation.manager.begin_estimation("trip_destination") + estimator = estimation.manager.begin_estimation(whale, "trip_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) @@ -1451,16 +1445,12 @@ def trip_destination( tours_merged_df, estimator=estimator, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, fail_some_trips_for_testing=fail_some_trips_for_testing, ) # testing feature t0 make sure at least one trip fails so trip_purpose_and_destination model is run - if ( - config.setting("testing_fail_trip_destination", False) - and not trips_df.failed.any() - ): + if whale.settings.testing_fail_trip_destination and not trips_df.failed.any(): if (trips_df.trip_num < trips_df.trip_count).sum() == 0: raise RuntimeError( "can't honor 'testing_fail_trip_destination' setting because no intermediate trips" @@ -1479,7 +1469,7 @@ def trip_destination( file_name = f"{trace_label}_failed_trips" logger.info("writing failed trips to %s", file_name) tracing.write_csv( - trips_df[trips_df.failed], file_name=file_name, transpose=False + whale, trips_df[trips_df.failed], file_name=file_name, transpose=False ) if estimator: diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index edcaf4695..dc710f64a 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -154,7 +154,7 @@ def trip_mode_choice( if whale.current_model_name != "trip_mode_choice": estimator = None else: - estimator = estimation.manager.begin_estimation("trip_mode_choice") + estimator = estimation.manager.begin_estimation(whale, "trip_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 0d8dbde23..1ece7f8d0 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -130,7 +130,9 @@ def choose_intermediate_trip_purpose( file_name, ) ) - tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) + tracing.write_csv( + whale, unmatched_choosers, file_name=file_name, transpose=False + ) raise RuntimeError( "Some trips could not be matched to probs based on join columns %s." 
% probs_join_cols @@ -277,7 +279,7 @@ def trip_purpose(whale: workflow.Whale, trips, chunk_size, trace_hh_id): trips_df, school_escort_trips ) - estimator = estimation.manager.begin_estimation("trip_purpose") + estimator = estimation.manager.begin_estimation(whale, "trip_purpose") if estimator: chooser_cols_for_estimation = [ "person_id", diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index 5e6130d88..072baf689 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -23,10 +23,10 @@ def run_trip_purpose_and_destination( trips_df, tours_merged_df, chunk_size, - trace_hh_id, trace_label, ): assert not trips_df.empty + trace_hh_id = whale.settings.trace_hh_id choices = run_trip_purpose( whale, @@ -45,7 +45,6 @@ def run_trip_purpose_and_destination( tours_merged_df, estimator=None, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "destination"), ) @@ -118,8 +117,8 @@ def trip_purpose_and_destination( # if we didn't, but it is enabled, it is probably a configuration error # if we just estimated trip_purpose, it isn't clear what they are trying to do , nor how to handle it assert not ( - estimation.manager.begin_estimation("trip_purpose") - or estimation.manager.begin_estimation("trip_destination") + estimation.manager.begin_estimation(whale, "trip_purpose") + or estimation.manager.begin_estimation(whale, "trip_destination") ) processed_trips = [] @@ -168,7 +167,7 @@ def trip_purpose_and_destination( file_name = "%s_i%s_failed_trips" % (trace_label, i) logger.info("writing failed trips to %s" % file_name) tracing.write_csv( - trips_df[trips_df.failed], file_name=file_name, transpose=False + whale, trips_df[trips_df.failed], file_name=file_name, transpose=False ) # if max iterations reached, add remaining trips to processed_trips and give up diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 6e7f73d71..080da2c01 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -423,7 +423,7 @@ def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id # trip_scheduling is a probabilistic model ane we don't support estimation, # but we do need to override choices in estimation mode - estimator = estimation.manager.begin_estimation("trip_scheduling") + estimator = estimation.manager.begin_estimation(whale, "trip_scheduling") if estimator: estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) @@ -466,6 +466,7 @@ def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id chunk_i, trips_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_by_chunk_id( trips_df, chunk_size, trace_label, trace_label ): diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index 7a8edb2c7..d7fbb986f 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -258,7 +258,12 @@ def run_trip_scheduling_choice( if len(indirect_tours) > 0: # Iterate through the chunks result_list = [] - for i, choosers, chunk_trace_label in chunk.adaptive_chunked_choosers( + for ( + i, + choosers, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( whale, 
indirect_tours, chunk_size, trace_label ): # Sort the choosers and get the schedule alternatives @@ -271,6 +276,7 @@ def run_trip_scheduling_choice( # Run the simulation choices = _interaction_sample_simulate( + whale, choosers=choosers, alternatives=schedules, spec=spec, @@ -284,6 +290,7 @@ def run_trip_scheduling_choice( trace_label=chunk_trace_label, trace_choice_name="trip_schedule_stage_1", estimator=None, + chunk_sizer=chunk_sizer, ) assert len(choices.index) == len(choosers.index) diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index c4993dcd0..d10945091 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -168,7 +168,14 @@ def assign_cdap_rank( def individual_utilities( - persons, cdap_indiv_spec, locals_d, trace_hh_id=None, trace_label=None + whale: workflow.Whale, + persons, + cdap_indiv_spec, + locals_d, + trace_hh_id=None, + trace_label=None, + *, + chunk_sizer, ): """ Calculate CDAP utilities for all individuals. @@ -190,7 +197,12 @@ def individual_utilities( # calculate single person utilities indiv_utils = simulate.eval_utilities( - whale, cdap_indiv_spec, persons, locals_d, trace_label=trace_label + whale, + cdap_indiv_spec, + persons, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, ) # add columns from persons to facilitate building household interactions @@ -626,7 +638,14 @@ def hh_choosers(indiv_utils, hhsize): def household_activity_choices( - indiv_utils, interaction_coefficients, hhsize, trace_hh_id=None, trace_label=None + whale: workflow.Whale, + indiv_utils, + interaction_coefficients, + hhsize, + trace_hh_id=None, + trace_label=None, + *, + chunk_sizer, ): """ Calculate household utilities for each activity pattern alternative for households of hhsize @@ -673,7 +692,9 @@ def household_activity_choices( trace_label=trace_label, ) - utils = simulate.eval_utilities(whale, spec, choosers, trace_label=trace_label) + utils = simulate.eval_utilities( + whale, spec, choosers, trace_label=trace_label, chunk_sizer=chunk_sizer + ) if len(utils.index) == 0: return pd.Series(dtype="float64") @@ -861,6 +882,7 @@ def extra_hh_member_choices( def _run_cdap( + whale: workflow.Whale, persons, person_type_map, cdap_indiv_spec, @@ -869,6 +891,8 @@ def _run_cdap( locals_d, trace_hh_id, trace_label, + *, + chunk_sizer, ): """ Implements core run_cdap functionality on persons df (or chunked subset thereof) @@ -886,17 +910,19 @@ def _run_cdap( # persons with cdap_rank 1..MAX_HHSIZE will be have their activities chose by CDAP model # extra household members, will have activities assigned by in fixed proportions assign_cdap_rank(whale, persons, person_type_map, trace_hh_id, trace_label) - chunk.log_df(trace_label, "persons", persons) + chunk_sizer.log_df(trace_label, "persons", persons) # Calculate CDAP utilities for each individual, ignoring interactions # ind_utils has index of 'person_id' and a column for each alternative # i.e. 
three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home) indiv_utils = individual_utilities( + whale, persons[persons.cdap_rank <= MAX_HHSIZE], cdap_indiv_spec, locals_d, trace_hh_id, trace_label, + chunk_sizer=chunk_sizer, ) chunk.log_df(trace_label, "indiv_utils", indiv_utils) @@ -916,11 +942,11 @@ def _run_cdap( hh_choices_list.append(choices) del indiv_utils - chunk.log_df(trace_label, "indiv_utils", None) + chunk_sizer.log_df(trace_label, "indiv_utils", None) # concat all the household choices into a single series indexed on _hh_index_ hh_activity_choices = pd.concat(hh_choices_list) - chunk.log_df(trace_label, "hh_activity_choices", hh_activity_choices) + chunk_sizer.log_df(trace_label, "hh_activity_choices", hh_activity_choices) # unpack the household activity choice list into choices for each (non-extra) household member # resulting series contains one activity per individual hh member, indexed on _persons_index_ @@ -940,7 +966,7 @@ def _run_cdap( person_choices = pd.concat([cdap_person_choices, extra_person_choices]) persons["cdap_activity"] = person_choices - chunk.log_df(trace_label, "persons", persons) + chunk_sizer.log_df(trace_label, "persons", persons) # if DUMP: # tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, @@ -951,12 +977,13 @@ def _run_cdap( result = persons[["cdap_rank", "cdap_activity"]] del persons - chunk.log_df(trace_label, "persons", None) + chunk_sizer.log_df(trace_label, "persons", None) return result def run_cdap( + whale: workflow.Whale, persons, person_type_map, cdap_indiv_spec, @@ -1011,9 +1038,11 @@ def run_cdap( i, persons_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): cdap_results = _run_cdap( + whale, persons_chunk, person_type_map, cdap_indiv_spec, @@ -1022,6 +1051,7 @@ def run_cdap( locals_d, trace_hh_id, chunk_trace_label, + chunk_sizer=chunk_sizer, ) result_list.append(cdap_results) diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/abm/models/util/estimation.py index 5a3ffd06c..7d02457b3 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/abm/models/util/estimation.py @@ -514,7 +514,9 @@ def initialize_settings(self, whale): self.settings_initialized = True - def begin_estimation(self, whale, model_name, bundle_name=None) -> Estimator: + def begin_estimation( + self, whale: workflow.Whale, model_name: str, bundle_name=None + ) -> Estimator: """ begin estimating of model_name is specified as model to estimate, otherwise return False diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index f050cbeef..f1455f0ae 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -53,6 +53,7 @@ def mode_choice_simulate( want_logsums = logsum_column_name is not None choices = simulate.simple_simulate( + whale, choosers=choosers, spec=spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/util/probabilistic_scheduling.py b/activitysim/abm/models/util/probabilistic_scheduling.py index 4de9a5686..9ecea3ce3 100644 --- a/activitysim/abm/models/util/probabilistic_scheduling.py +++ b/activitysim/abm/models/util/probabilistic_scheduling.py @@ -84,7 +84,7 @@ def _report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=N filename = "%s.%s" % (trace_label, filename) logger.info("dumping %s" % filename) - tracing.write_csv(df, file_name=filename, transpose=False) + tracing.write_csv(whale, df, file_name=filename, 
transpose=False) # log the indexes of the first MAX_PRINT offending rows MAX_PRINT = 0 diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index 55b3ad23c..fd195d8f9 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -85,10 +85,14 @@ def test_individual_utilities(people, model_settings): with chunk.chunk_log( "test_individual_utilities", base=True, settings=whale.settings - ): + ) as chunk_sizer: cdap.assign_cdap_rank(whale, people, person_type_map) individual_utils = cdap.individual_utilities( - people, cdap_indiv_and_hhsize1, locals_d=None + whale, + people, + cdap_indiv_and_hhsize1, + locals_d=None, + chunk_sizer=chunk_sizer, ) individual_utils = individual_utils[["M", "N", "H"]] @@ -143,10 +147,14 @@ def test_build_cdap_spec_hhsize2(whale: workflow.Whale, people, model_settings): with chunk.chunk_log( "test_build_cdap_spec_hhsize2", base=True, settings=whale.settings - ): + ) as chunk_sizer: cdap.assign_cdap_rank(whale, people, person_type_map) indiv_utils = cdap.individual_utilities( - people, cdap_indiv_and_hhsize1, locals_d=None + whale, + people, + cdap_indiv_and_hhsize1, + locals_d=None, + chunk_sizer=chunk_sizer, ) choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) diff --git a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py index 4e4325b05..96fc4ee4c 100644 --- a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py @@ -63,7 +63,7 @@ def test_vts(): persons = pd.DataFrame({"income": [20, 30, 25]}, index=[1, 2, 3]) - inject.add_table("persons", persons) + whale.add_table("persons", persons) spec = pd.DataFrame({"Coefficient": [1.2]}, index=["income"]) spec.index.name = "Expression" diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index b22f2e241..8b3f8b931 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -64,6 +64,7 @@ def dest_size_terms_df(self, segment_name, trace_label): def _destination_sample( + whale: workflow.Whale, spec_segment_name, choosers, destination_size_terms, @@ -86,7 +87,7 @@ def _destination_sample( logger.info("running %s with %d tours", trace_label, len(choosers)) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if whale.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -106,7 +107,7 @@ def _destination_sample( if constants is not None: locals_d.update(constants) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample( whale, @@ -135,6 +136,7 @@ def _destination_sample( def destination_sample( + whale: workflow.Whale, spec_segment_name, choosers, model_settings, @@ -161,6 +163,7 @@ def destination_sample( alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( + whale, spec_segment_name, choosers, destination_size_terms, @@ -456,6 +459,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ def destination_presample( + whale: workflow.Whale, spec_segment_name, choosers, model_settings, @@ -489,6 
+493,7 @@ def destination_presample( skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ) taz_sample = _destination_sample( + whale, spec_segment_name, choosers, TAZ_size_terms, @@ -512,6 +517,7 @@ def destination_presample( def run_destination_sample( + whale, spec_segment_name, tours, persons_merged, @@ -547,7 +553,7 @@ def run_destination_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not whale.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -560,6 +566,7 @@ def run_destination_sample( ) choices = destination_presample( + whale, spec_segment_name, choosers, model_settings, @@ -572,6 +579,7 @@ def run_destination_sample( else: choices = destination_sample( + whale, spec_segment_name, choosers, model_settings, @@ -590,6 +598,7 @@ def run_destination_sample( def run_destination_logsums( + whale: workflow.Whale, tour_purpose, persons_merged, destination_sample, @@ -662,6 +671,7 @@ def run_destination_logsums( def run_destination_simulate( + whale: workflow.Whale, spec_segment_name, tours, persons_merged, @@ -746,9 +756,10 @@ def run_destination_simulate( tracing.dump_df(DUMP, choosers, trace_label, "choosers") - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample_simulate( + whale, choosers, destination_sample, spec=model_spec, @@ -774,6 +785,7 @@ def run_destination_simulate( def run_tour_destination( + whale: workflow.Whale, tours, persons_merged, want_logsums, @@ -782,7 +794,6 @@ def run_tour_destination( network_los, estimator, chunk_size, - trace_hh_id, trace_label, skip_choice=False, ): @@ -821,6 +832,7 @@ def run_tour_destination( # - destination_sample spec_segment_name = segment_name # spec_segment_name is segment_name location_sample_df = run_destination_sample( + whale, spec_segment_name, choosers, persons_merged, @@ -835,6 +847,7 @@ def run_tour_destination( # - destination_logsums tour_purpose = segment_name # tour_purpose is segment_name location_sample_df = run_destination_logsums( + whale, tour_purpose, persons_merged, location_sample_df, @@ -847,6 +860,7 @@ def run_tour_destination( # - destination_simulate spec_segment_name = segment_name # spec_segment_name is segment_name choices = run_destination_simulate( + whale, spec_segment_name, choosers, persons_merged, diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index e774eef20..41cf80f6b 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -142,7 +142,7 @@ def _od_sample( logger.info("running %s with %d tours", trace_label, len(choosers)) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if whale.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives @@ -717,7 +717,7 @@ def run_od_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not 
whale.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -924,6 +924,7 @@ def run_od_logsums( def run_od_simulate( + whale: workflow.Whale, spec_segment_name, tours, od_sample, @@ -1009,6 +1010,7 @@ def run_od_simulate( tracing.dump_df(DUMP, choosers, trace_label, "choosers") choices = interaction_sample_simulate( + whale, choosers, od_sample, spec=model_spec, diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index ed6f89105..11ab76671 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -19,7 +19,6 @@ def run_tour_scheduling( tdd_alts, tour_segment_col, chunk_size, - trace_hh_id, ): trace_label = model_name model_settings_file_name = f"{model_name}.yaml" @@ -67,7 +66,7 @@ def run_tour_scheduling( # estimator for this tour_segment estimator = estimation.manager.begin_estimation( - model_name=bundle_name, bundle_name=bundle_name + whale, model_name=bundle_name, bundle_name=bundle_name ) spec_file_name = spec_settings["SPEC"] @@ -108,7 +107,7 @@ def run_tour_scheduling( assert "TOUR_SPEC_SEGMENTS" not in model_settings assert tour_segment_col is None - estimator = estimation.manager.begin_estimation(model_name) + estimator = estimation.manager.begin_estimation(whale, model_name) spec_file_name = model_settings["SPEC"] model_spec = simulate.read_model_spec(file_name=spec_file_name) diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index b198f61a0..23ad3224a 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -757,9 +757,10 @@ def _schedule_tours( estimator.set_alt_id(choice_column) estimator.write_interaction_sample_alternatives(alt_tdd) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = whale.settings.log_alt_losers choices = interaction_sample_simulate( + whale, tours, alt_tdd, spec, diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index 227b78c79..73e88e3eb 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -114,7 +114,7 @@ def vehicle_allocation( logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") - estimator = estimation.manager.begin_estimation("vehicle_allocation") + estimator = estimation.manager.begin_estimation(whale, "vehicle_allocation") model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) @@ -195,6 +195,7 @@ def vehicle_allocation( locals_dict.update({"occup": occup}) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index d31f3278c..f1378d8f5 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -406,6 +406,7 @@ def iterate_vehicle_type_choice( # each alternative as a distinct column in the .csv elif simulation_type == "simple_simulate": choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, log_alt_losers=log_alt_losers, @@ -517,7 +518,7 @@ def vehicle_type_choice( model_settings_file_name = 
"vehicle_type_choice.yaml" model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("vehicle_type") + estimator = estimation.manager.begin_estimation(whale, "vehicle_type") model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index ddd28d209..33d0f8044 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -11,9 +11,7 @@ @workflow.step -def work_from_home( - whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id -): +def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): """ This model predicts whether a person (worker) works from home. The output from this model is TRUE (if works from home) or FALSE (works away from home). @@ -32,7 +30,7 @@ def work_from_home( choosers = choosers[choosers[chooser_filter_column_name]] logger.info("Running %s with %d persons", trace_label, len(choosers)) - estimator = estimation.manager.begin_estimation("work_from_home") + estimator = estimation.manager.begin_estimation(whale, "work_from_home") constants = config.get_model_constants(model_settings) work_from_home_alt = model_settings["WORK_FROM_HOME_ALT"] @@ -93,6 +91,7 @@ def work_from_home( ) choices = simulate.simple_simulate( + whale, choosers=choosers, spec=model_spec, nest_spec=nest_spec, @@ -176,5 +175,5 @@ def work_from_home( tracing.print_summary("work_from_home", persons.work_from_home, value_counts=True) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index d16eaee0f..83b87c13b 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -97,7 +97,7 @@ def maz_centroids(whale: workflow.Whale): logger.info("loaded maz_centroids %s" % (df.shape,)) # replace table function with dataframe - inject.add_table("maz_centroids", df) + whale.add_table("maz_centroids", df) return df @@ -266,7 +266,7 @@ def disaggregate_accessibility(whale: workflow.Whale): assert any(merge_df[accessibility_cols].isnull()) # Inject merged accessibilities so that it can be included in persons_merged function - inject.add_table("disaggregate_accessibility", merge_df[accessibility_cols]) + whale.add_table("disaggregate_accessibility", merge_df[accessibility_cols]) return merge_df[accessibility_cols] diff --git a/activitysim/abm/tables/shadow_pricing.py b/activitysim/abm/tables/shadow_pricing.py index 1316e4cca..862eaa785 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -634,6 +634,7 @@ def check_fit(self, iteration): if write_choices: tracing.write_csv( + whale, self.choices_by_iteration, "%s_choices_by_shadow_price_iteration" % self.model_selector, transpose=False, @@ -884,7 +885,7 @@ def dest_size_terms(self, segment): return size_terms - def write_trace_files(self, iteration): + def write_trace_files(self, whale: workflow.Whale, iteration): """ Write trace files for this iteration Writes desired_size, modeled_size, and shadow_prices tables @@ -901,12 +902,14 @@ def write_trace_files(self, iteration): if iteration == 1: # write desired_size only on first iteration, as it doesn't 
change tracing.write_csv( + whale, self.desired_size, "shadow_price_%s_desired_size" % self.model_selector, transpose=False, ) tracing.write_csv( + whale, self.modeled_size, "shadow_price_%s_modeled_size_%s" % (self.model_selector, iteration), transpose=False, @@ -914,6 +917,7 @@ def write_trace_files(self, iteration): if self.use_shadow_pricing: tracing.write_csv( + whale, self.shadow_prices, "shadow_price_%s_shadow_prices_%s" % (self.model_selector, iteration), transpose=False, diff --git a/activitysim/abm/tables/skims.py b/activitysim/abm/tables/skims.py index 07022ee11..797d5f219 100644 --- a/activitysim/abm/tables/skims.py +++ b/activitysim/abm/tables/skims.py @@ -31,8 +31,9 @@ def network_los( @workflow.cached_object -def skim_dict(whale: workflow.Whale, network_los): - return network_los.get_default_skim_dict() +def skim_dict(whale: workflow.Whale, network_los: los.Network_LOS): + result = network_los.get_default_skim_dict() + return result @workflow.cached_object diff --git a/activitysim/abm/tables/table_dict.py b/activitysim/abm/tables/table_dict.py index 2b1cb9086..e46f75fca 100644 --- a/activitysim/abm/tables/table_dict.py +++ b/activitysim/abm/tables/table_dict.py @@ -4,7 +4,7 @@ from collections import OrderedDict from activitysim.abm.models.util import canonical_ids as cid -from activitysim.core import inject +from activitysim.core import inject, workflow logger = logging.getLogger(__name__) @@ -14,14 +14,14 @@ """ -@inject.injectable() -def rng_channels(): +@workflow.cached_object +def rng_channels(whale: workflow.Whale): return cid.RANDOM_CHANNELS -@inject.injectable() -def traceable_tables(): +@workflow.cached_object +def traceable_tables(whale: workflow.Whale): # names of all traceable tables ordered by dependency on household_id # e.g. 'persons' has to be registered AFTER 'households' @@ -29,20 +29,20 @@ def traceable_tables(): return cid.TRACEABLE_TABLES -@inject.injectable() -def traceable_table_indexes(): +@workflow.cached_object +def traceable_table_indexes(whale: workflow.Whale): # traceable_table_indexes is OrderedDict {: } # so we can find first registered table to slice by ref_col return OrderedDict() -@inject.injectable() -def traceable_table_ids(): +@workflow.cached_object +def traceable_table_ids(whale: workflow.Whale): # traceable_table_ids is dict {: [, ]} return dict() -@inject.injectable() -def canonical_table_index_names(): +@workflow.cached_object +def canonical_table_index_names(whale: workflow.Whale): # traceable_table_ids is dict {: [, ]} return cid.CANONICAL_TABLE_INDEX_NAMES diff --git a/activitysim/abm/tables/vehicles.py b/activitysim/abm/tables/vehicles.py index 498ce3faf..384b3dc70 100644 --- a/activitysim/abm/tables/vehicles.py +++ b/activitysim/abm/tables/vehicles.py @@ -37,7 +37,7 @@ def vehicles(whale: workflow.Whale, households): vehicles.set_index("vehicle_id", inplace=True) # replace table function with dataframe - inject.add_table("vehicles", vehicles) + whale.add_table("vehicles", vehicles) whale.get_rn_generator().add_channel("vehicles", vehicles) tracing.register_traceable_table("vehicles", vehicles) diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index 0ae04807b..cfba48651 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -103,7 +103,7 @@ def setup_component( # Extract the resume_after argument based on the model immediately # prior to the component being benchmarked. 
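The table and injectable hunks above show the other half of the migration: module-level @inject.injectable() providers become @workflow.cached_object functions that take the whale state, tables are registered with whale.add_table instead of inject.add_table, and global config.setting(...) look-ups become attribute access on whale.settings. A rough sketch of a converted provider, with illustrative names and return values that are not part of the patch:

    import pandas as pd

    from activitysim.core import workflow


    @workflow.cached_object
    def example_channels(whale: workflow.Whale):
        # was: a zero-argument @inject.injectable() returning the same list
        return ["households", "persons", "example_table"]


    def register_example_table(whale: workflow.Whale, df: pd.DataFrame):
        # was: inject.add_table("example_table", df)
        whale.add_table("example_table", df)

        # settings come from the state object, not config.setting("multiprocess", False)
        multiprocess = whale.settings.multiprocess
        return multiprocess
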
- models = config.setting("models") + models = whale.settings.models try: component_index = models.index(component_name) except ValueError: @@ -115,7 +115,7 @@ def setup_component( else: resume_after = None - if config.setting("multiprocess", False): + if whale.settings.multiprocess: raise NotImplementedError( "multiprocess component benchmarking is not yet implemented" ) @@ -157,7 +157,7 @@ def setup_component( def run_component(whale, component_name): logger.info("run_component: %s", component_name) try: - if config.setting("multiprocess", False): + if whale.settings.multiprocess: raise NotImplementedError( "multiprocess component benchmarking is not yet implemented" ) @@ -186,7 +186,7 @@ def teardown_component(whale, component_name): logger.info("dropping table %s", table_name) whale.drop_table(table_name) - if config.setting("multiprocess", False): + if whale.settings.multiprocess: raise NotImplementedError("multiprocess benchmarking is not yet implemented") else: whale.close_pipeline() @@ -298,11 +298,11 @@ def pre_run( logger.info(f"MODELS: {config.setting('models')}") - if config.setting("multiprocess", False): + if whale.settings.multiprocess: logger.info("run multi-process complete simulation") else: logger.info("run single process simulation") - whale.run(models=config.setting("models")) + whale.run(models=whale.settings.models) whale.close_pipeline() tracing.print_elapsed_time("prerun required models for checkpointing", t0) @@ -327,7 +327,7 @@ def run_multiprocess(): # assert not pipeline.is_open() # - # if config.setting("cleanup_pipeline_after_run", False): + # if whale.settings.cleanup_pipeline_after_run: # pipeline.cleanup_pipeline() diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index f8d601c92..3a822998b 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -307,7 +307,7 @@ def run(args): memory_sidecar_process = None # legacy support for run_list setting nested 'models' and 'resume_after' settings - # if config.setting("run_list"): + # if whale.settings.run_list: # warnings.warn( # "Support for 'run_list' settings group will be removed.\n" # "The run_list.steps setting is renamed 'models'.\n" @@ -315,7 +315,7 @@ def run(args): # "Specify both 'models' and 'resume_after' directly in settings config file.", # FutureWarning, # ) - # run_list = config.setting("run_list") + # run_list = whale.settings.run_list # if "steps" in run_list: # assert not config.setting( # "models" @@ -336,7 +336,7 @@ def run(args): # cleanup if not resuming if not resume_after: cleanup_output_files(whale) - elif config.setting("cleanup_trace_files_on_resume", False): + elif whale.settings.cleanup_trace_files_on_resume: tracing.delete_trace_files(whale) tracing.config_logger( diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index c96d76ca4..3fba39817 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -15,8 +15,8 @@ import pandas as pd import xarray as xr -from . import config, mem, tracing, util -from .util import GB +from activitysim.core import config, mem, tracing, util, workflow +from activitysim.core.util import GB logger = logging.getLogger(__name__) @@ -131,9 +131,7 @@ def chunk_method(): method = SETTINGS.get("chunk_method") if method is None: - method = SETTINGS.setdefault( - "chunk_method", config.setting("chunk_method", DEFAULT_CHUNK_METHOD) - ) + method = SETTINGS.setdefault("chunk_method", whale.settings.chunk_method) assert ( method in CHUNK_METHODS ), f"chunk_method setting '{method}' not recognized. 
Should be one of: {CHUNK_METHODS}" @@ -148,7 +146,7 @@ def chunk_metric(): def chunk_training_mode(): training_mode = SETTINGS.setdefault( - "chunk_training_mode", config.setting("chunk_training_mode", MODE_ADAPTIVE) + "chunk_training_mode", whale.settings.chunk_training_mode ) if not training_mode: training_mode = MODE_CHUNKLESS @@ -173,7 +171,7 @@ def default_initial_rows_per_chunk(): def min_available_chunk_ratio(): return SETTINGS.setdefault( - "min_available_chunk_ratio", config.setting("min_available_chunk_ratio", 0) + "min_available_chunk_ratio", whale.settings.min_available_chunk_ratio ) @@ -181,9 +179,7 @@ def keep_chunk_logs(): # if we are overwriting MEM_LOG_FILE then presumably we want to delete any subprocess files default = LOG_FILE_NAME == OMNIBUS_LOG_FILE_NAME - return SETTINGS.setdefault( - "keep_chunk_logs", config.setting("keep_chunk_logs", default) - ) + return SETTINGS.setdefault("keep_chunk_logs", whale.settings.keep_chunk_logs) def trace_label_for_chunk(trace_label, chunk_size, i): @@ -217,7 +213,6 @@ def overhead_for_chunk_method(overhead, method=None): """ def hybrid(xss, bytes): - # this avoids pessimistic underchunking on second chunk without pre-existing cache # but it tends to overshoot on a trained runs # hybrid_overhead = np.maximum(bytes, (xss + bytes) / 2) @@ -242,7 +237,6 @@ def hybrid(xss, bytes): def consolidate_logs(): - glob_file_name = config.log_file_path(f"*{LOG_FILE_NAME}", prefix=False) glob_files = glob.glob(glob_file_name) @@ -323,8 +317,7 @@ def consolidate_logs(): omnibus_df.to_csv(log_dir_output_path, mode="w", index=False) if (chunk_training_mode() == MODE_RETRAIN) or not _HISTORIAN.have_cached_history: - - if config.setting("resume_after"): + if whale.settings.resume_after: # FIXME logger.warning( f"Not updating chunk_log cache directory because resume_after" @@ -345,13 +338,11 @@ class ChunkHistorian(object): """ def __init__(self): - self.chunk_log_path = None self.have_cached_history = None self.cached_history_df = None def load_cached_history(self): - if chunk_training_mode() == MODE_RETRAIN: # don't need cached history if retraining return @@ -398,18 +389,15 @@ def load_cached_history(self): ) def cached_history_for_chunk_tag(self, chunk_tag): - history = {} self.load_cached_history() if self.have_cached_history: - try: df = self.cached_history_df[ self.cached_history_df[C_CHUNK_TAG] == chunk_tag ] if len(df) > 0: - if len(df) > 1: # don't expect this, but not fatal logger.warning( @@ -429,7 +417,6 @@ def cached_history_for_chunk_tag(self, chunk_tag): return history def cached_row_size(self, chunk_tag): - row_size = 0 cached_history = self.cached_history_for_chunk_tag(chunk_tag) @@ -443,7 +430,6 @@ def cached_row_size(self, chunk_tag): return row_size def write_history(self, history, chunk_tag): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) history_df = pd.DataFrame.from_dict(history) @@ -488,7 +474,6 @@ def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom self.total_bytes = 0 def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) MAX_OVERDRAFT = 0.2 @@ -596,7 +581,6 @@ def size_it(df): self.total_bytes = sum(self.tables.values()) def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) from_rss_monitor = total_bytes is None @@ -650,7 +634,6 @@ def get_hwm_bytes(self): def log_rss(trace_label, force=False): - if 
chunk_training_mode() == MODE_CHUNKLESS: # no memory tracing at all in chunkless mode return @@ -674,7 +657,6 @@ def log_rss(trace_label, force=False): def log_df(trace_label, table_name, df): - if chunk_training_mode() in (MODE_PRODUCTION, MODE_CHUNKLESS): return @@ -721,7 +703,6 @@ def __init__( chunk_size=0, chunk_training_mode="disabled", ): - self.depth = len(CHUNK_SIZERS) + 1 self.chunk_training_mode = chunk_training_mode @@ -795,7 +776,6 @@ def __init__( ) def close(self): - if self.chunk_training_mode == MODE_CHUNKLESS: return @@ -808,12 +788,10 @@ def close(self): assert _chunk_sizer == self def available_headroom(self, xss): - headroom = self.base_chunk_size - xss # adjust deficient headroom to min_chunk_size if headroom < self.min_chunk_size: - if self.base_chunk_size > 0: logger.warning( f"Not enough memory for minimum chunk_size without exceeding specified chunk_size. " @@ -827,7 +805,6 @@ def available_headroom(self, xss): return headroom def initial_rows_per_chunk(self): - # whatever the TRAINING_MODE, use cache to determine initial_row_size # (presumably preferable to default_initial_rows_per_chunk) self.initial_row_size = _HISTORIAN.cached_row_size(self.chunk_tag) @@ -837,7 +814,6 @@ def initial_rows_per_chunk(self): estimated_number_of_chunks = 1 self.initial_row_size = 0 else: - # we should be a base chunker assert len(CHUNK_LEDGERS) == 0, f"len(CHUNK_LEDGERS): {len(CHUNK_LEDGERS)}" @@ -895,7 +871,6 @@ def adaptive_rows_per_chunk(self, i): prev_uss = self.uss if self.chunk_training_mode != MODE_PRODUCTION: - if chunk_metric() == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) else: @@ -915,7 +890,6 @@ def adaptive_rows_per_chunk(self, i): observed_row_size = self.initial_row_size overhead = self.cum_overhead.copy() else: - # calculate overhead for this chunk iteration overhead = {} overhead[BYTES] = self.chunk_ledger.get_hwm_bytes() @@ -991,7 +965,6 @@ def adaptive_rows_per_chunk(self, i): @contextmanager def ledger(self): - # don't do anything in chunkless mode if self.chunk_training_mode == MODE_CHUNKLESS: yield @@ -1031,9 +1004,7 @@ def ledger(self): ) # make sure we get at least one reading finally: - if mem_monitor is not None: - if not mem_monitor.is_alive(): logger.error(f"mem_monitor for {self.trace_label} died!") bug # bug @@ -1051,7 +1022,6 @@ def ledger(self): self.chunk_ledger = None def log_rss(self, trace_label, force=False): - if self.chunk_training_mode == MODE_CHUNKLESS: # no memory tracing at all in chunkless mode return @@ -1074,7 +1044,6 @@ def log_rss(self, trace_label, force=False): c.check_local_hwm(hwm_trace_label, rss, uss, total_bytes=None) def log_df(self, trace_label, table_name, df): - if self.chunk_training_mode in (MODE_PRODUCTION, MODE_CHUNKLESS): return @@ -1100,7 +1069,6 @@ def log_df(self, trace_label, table_name, df): @contextmanager def chunk_log(trace_label, chunk_tag=None, base=False, settings=None): - # With `base=True` this method can be used to instantiate # a ChunkSizer class object without actually chunking. This # avoids breaking the assertion below. 
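As the cdap and test_cdap hunks above use it, chunk_log yields the ChunkSizer it builds, and the dataframe logging that previously went through module-level chunk.log_df moves onto that object, which is then threaded into helpers as an explicit chunk_sizer keyword. A small usage sketch, assuming a whale state object is available; the step name and dataframe are placeholders:

    import pandas as pd

    from activitysim.core import chunk, workflow


    def log_example_tables(whale: workflow.Whale) -> None:
        # base=True instantiates a ChunkSizer without actually chunking,
        # which is how the updated cdap tests obtain one
        with chunk.chunk_log(
            "example_step", base=True, settings=whale.settings
        ) as chunk_sizer:
            df = pd.DataFrame({"x": [1, 2, 3]})
            chunk_sizer.log_df("example_step", "df", df)    # was chunk.log_df(...)
            chunk_sizer.log_df("example_step", "df", None)  # release the reference
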
@@ -1129,7 +1097,6 @@ def chunk_log(trace_label, chunk_tag=None, base=False, settings=None): chunk_sizer.initial_rows_per_chunk() with chunk_sizer.ledger(): - yield chunk_sizer if _chunk_training_mode != MODE_CHUNKLESS: @@ -1140,14 +1107,12 @@ def chunk_log(trace_label, chunk_tag=None, base=False, settings=None): @contextmanager def chunk_log_skip(): - yield None def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_tag=None): - # generator to iterate over choosers if whale.settings.chunk_training_mode == MODE_CHUNKLESS: @@ -1176,14 +1141,12 @@ def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_ta i = offset = 0 while offset < num_choosers: - i += 1 assert offset + rows_per_chunk <= num_choosers chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) with chunk_sizer.ledger(): - # grab the next chunk based on current rows_per_chunk chooser_chunk = choosers[offset : offset + rows_per_chunk] @@ -1206,7 +1169,12 @@ def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_ta def adaptive_chunked_choosers_and_alts( - choosers, alternatives, chunk_size, trace_label, chunk_tag=None + whale: workflow.Whale, + choosers, + alternatives, + chunk_size, + trace_label, + chunk_tag=None, ): """ generator to iterate over choosers and alternatives in chunk_size chunks @@ -1241,12 +1209,15 @@ def adaptive_chunked_choosers_and_alts( chunk of alternatives for chooser chunk """ - if chunk_training_mode() == MODE_CHUNKLESS: + if whale.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, alternatives, trace_label + chunk_sizer = ChunkSizer( + "chunkless", trace_label, 0, 0, whale.settings.chunk_training_mode + ) + yield 0, choosers, alternatives, trace_label, chunk_sizer return check_assertions = False @@ -1302,7 +1273,6 @@ def adaptive_chunked_choosers_and_alts( chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) with chunk_sizer.ledger(): - chooser_chunk = choosers[offset : offset + rows_per_chunk] alt_end = alt_chunk_ends[offset + rows_per_chunk] @@ -1321,7 +1291,7 @@ def adaptive_chunked_choosers_and_alts( f"with {len(chooser_chunk)} of {num_choosers} choosers" ) - yield i, chooser_chunk, alternative_chunk, chunk_trace_label + yield i, chooser_chunk, alternative_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk alt_offset = alt_end @@ -1336,7 +1306,7 @@ def adaptive_chunked_choosers_and_alts( def adaptive_chunked_choosers_by_chunk_id( - choosers, chunk_size, trace_label, chunk_tag=None + whale: workflow.Whale, choosers, chunk_size, trace_label, chunk_tag=None ): # generator to iterate over choosers in chunk_size chunks # like chunked_choosers but based on chunk_id field rather than dataframe length @@ -1344,12 +1314,15 @@ def adaptive_chunked_choosers_by_chunk_id( # all have to be included in the same chunk) # FIXME - we pathologically know name of chunk_id col in households table - if chunk_training_mode() == MODE_CHUNKLESS: + if whale.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. 
logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, trace_label + chunk_sizer = ChunkSizer( + "chunkless", trace_label, 0, 0, whale.settings.chunk_training_mode + ) + yield 0, choosers, trace_label, chunk_sizer return chunk_tag = chunk_tag or trace_label @@ -1363,14 +1336,12 @@ def adaptive_chunked_choosers_by_chunk_id( i = offset = 0 while offset < num_choosers: - i += 1 assert offset + rows_per_chunk <= num_choosers chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) with chunk_sizer.ledger(): - chooser_chunk = choosers[ choosers["chunk_id"].between(offset, offset + rows_per_chunk - 1) ] @@ -1380,7 +1351,7 @@ def adaptive_chunked_choosers_by_chunk_id( f"with {rows_per_chunk} of {num_choosers} choosers" ) - yield i, chooser_chunk, chunk_trace_label + yield i, chooser_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk diff --git a/activitysim/core/configuration/filesystem.py b/activitysim/core/configuration/filesystem.py index 2468fcae6..53fdbe472 100644 --- a/activitysim/core/configuration/filesystem.py +++ b/activitysim/core/configuration/filesystem.py @@ -1,6 +1,7 @@ import glob import logging import os +import struct import time from pathlib import Path @@ -191,6 +192,39 @@ def get_log_file_path(self, file_name) -> Path: return Path(file_path) + def get_trace_file_path(self, file_name): + """ + Get the complete path to a trace file. + + Parameters + ---------- + file_name : str + Base name of the trace file. + + Returns + ------- + Path + """ + + output_dir = self.get_output_dir() + + # - check for trace subfolder, create it if missing + trace_dir = output_dir.joinpath("trace") + if not trace_dir.exists(): + trace_dir.mkdir(parents=True) + + # construct a unique tail string from the time + # this is a convenience for opening multiple similarly named trace files + tail = hex(struct.unpack(" Path: """ Get the cache directory, creating it if needed. diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index 9055f96c1..79201a625 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -101,9 +101,12 @@ def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None) # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES? if whale.settings.sharrow: - _locals_dict["skim_dict"] = whale.get("skim_dataset_dict", None) + from activitysim.core.flow import skim_dataset_dict + from activitysim.core.skim_dataset import skim_dataset + + _locals_dict["skim_dict"] = whale.get_injectable("skim_dataset_dict") else: - _locals_dict["skim_dict"] = whale.get("skim_dict", None) + _locals_dict["skim_dict"] = whale.get_injectable("skim_dict") results, trace_results, trace_assigned_locals = assign.assign_variables( whale, @@ -117,7 +120,9 @@ def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None) tracing.trace_df(trace_results, label=trace_label, slicer="NONE") if trace_assigned_locals: - tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label) + tracing.write_csv( + whale, trace_assigned_locals, file_name="%s_locals" % trace_label + ) return results diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 53852bb8b..264b85e07 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -5,14 +5,14 @@ import numpy as np import pandas as pd -from . 
import chunk, interaction_simulate, logit, tracing -from .simulate import set_skim_wrapper_targets +from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow +from activitysim.core.simulate import set_skim_wrapper_targets logger = logging.getLogger(__name__) def _interaction_sample_simulate( - whale, + whale: workflow.Whale, choosers, alternatives, spec, @@ -27,8 +27,9 @@ def _interaction_sample_simulate( trace_choice_name, estimator, skip_choice=False, + *, + chunk_sizer: chunk.ChunkSizer, ): - """ Run a MNL simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -136,7 +137,7 @@ def _interaction_sample_simulate( interaction_simulate.ALT_CHOOSER_ID ] = interaction_df.index.values - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) if have_trace_targets: trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, choosers) @@ -170,10 +171,10 @@ def _interaction_sample_simulate( estimator=estimator, log_alt_losers=log_alt_losers, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if have_trace_targets: tracing.trace_interaction_eval_results( @@ -197,7 +198,7 @@ def _interaction_sample_simulate( sample_counts = ( interaction_utilities.groupby(interaction_utilities.index).size().values ) - chunk.log_df(trace_label, "sample_counts", sample_counts) + chunk_sizer.log_df(trace_label, "sample_counts", sample_counts) # max number of alternatvies for any chooser max_sample_count = sample_counts.max() @@ -212,25 +213,25 @@ def _interaction_sample_simulate( inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts) del sample_counts - chunk.log_df(trace_label, "sample_counts", None) + chunk_sizer.log_df(trace_label, "sample_counts", None) # insert the zero-prob utilities to pad each alternative set to same size padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999) - chunk.log_df(trace_label, "padded_utilities", padded_utilities) + chunk_sizer.log_df(trace_label, "padded_utilities", padded_utilities) del inserts del interaction_utilities - chunk.log_df(trace_label, "interaction_utilities", None) + chunk_sizer.log_df(trace_label, "interaction_utilities", None) # reshape to array with one row per chooser, one column per alternative padded_utilities = padded_utilities.reshape(-1, max_sample_count) # convert to a dataframe with one row per chooser and one column per alternative utilities_df = pd.DataFrame(padded_utilities, index=choosers.index) - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) del padded_utilities - chunk.log_df(trace_label, "padded_utilities", None) + chunk_sizer.log_df(trace_label, "padded_utilities", None) if have_trace_targets: tracing.trace_df( @@ -247,16 +248,16 @@ def _interaction_sample_simulate( trace_label=trace_label, trace_choosers=choosers, ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) if want_logsums: logsums = logit.utils_to_logsums( utilities_df, allow_zero_probs=allow_zero_probs ) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del utilities_df - 
chunk.log_df(trace_label, "utilities_df", None) + chunk_sizer.log_df(trace_label, "utilities_df", None) if have_trace_targets: tracing.trace_df( @@ -282,11 +283,11 @@ def _interaction_sample_simulate( whale, probs, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "positions", positions) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "positions", positions) + chunk_sizer.log_df(trace_label, "rands", rands) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) # shouldn't have chosen any of the dummy pad utilities assert positions.max() < max_sample_count @@ -301,7 +302,7 @@ def _interaction_sample_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) if allow_zero_probs and zero_probs.any() and zero_prob_choice_val is not None: # FIXME this is kind of gnarly, patch choice for zero_probs @@ -329,15 +330,16 @@ def _interaction_sample_simulate( choices = choices.to_frame("choice") choices["logsum"] = logsums - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) # handing this off to our caller - chunk.log_df(trace_label, "choices", None) + chunk_sizer.log_df(trace_label, "choices", None) return choices def interaction_sample_simulate( + whale: workflow.Whale, choosers, alternatives, spec, @@ -355,7 +357,6 @@ def interaction_sample_simulate( estimator=None, skip_choice=False, ): - """ Run a simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -423,11 +424,12 @@ def interaction_sample_simulate( chooser_chunk, alternative_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_and_alts( - choosers, alternatives, chunk_size, trace_label, chunk_tag + whale, choosers, alternatives, chunk_size, trace_label, chunk_tag ): - choices = _interaction_sample_simulate( + whale, chooser_chunk, alternative_chunk, spec, @@ -442,11 +444,12 @@ def interaction_sample_simulate( trace_choice_name, estimator, skip_choice, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index dbbe7a981..d6ea27e4c 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -213,7 +213,7 @@ def to_series(x): else: trace_eval_results = None - check_for_variability = config.setting("check_for_variability") + check_for_variability = whale.settings.check_for_variability # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously @@ -678,7 +678,7 @@ def _interaction_simulate( alt_index_id = estimator.get_alt_id() if estimator else None chooser_index_id = ALT_CHOOSER_ID if log_alt_losers else None - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow interaction_utilities = None if locals_d is not None and locals_d.get("_sharrow_skip", False): diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index de32fd593..7860d63d5 100644 --- 
a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -59,7 +59,7 @@ def report_bad_choices( if trace_label: logger.info("dumping %s" % trace_label) - tracing.write_csv(df[:MAX_DUMP], file_name=trace_label, transpose=False) + tracing.write_csv(whale, df[:MAX_DUMP], file_name=trace_label, transpose=False) # log the indexes of the first MAX_DUMP offending rows for idx in df.index[:MAX_PRINT].values: diff --git a/activitysim/core/los.py b/activitysim/core/los.py index cd6a5345d..807a34868 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -376,6 +376,7 @@ def load_data(self): if TRACE_TRIMMED_MAZ_TO_TAP_TABLES: tracing.write_csv( + whale, df, file_name=f"trimmed_{maz_to_tap_settings['table']}", transpose=False, @@ -572,7 +573,7 @@ def multiprocess(self): ------- bool """ - is_multiprocess = config.setting("multiprocess", False) + is_multiprocess = whale.settings.multiprocess return is_multiprocess def load_shared_data(self, shared_data_buffers): @@ -600,7 +601,7 @@ def load_shared_data(self, shared_data_buffers): if self.zone_system == THREE_ZONE: assert self.tvpb is not None - if self.rebuild_tvpb_cache and not config.setting("resume_after", None): + if self.rebuild_tvpb_cache and not whale.settings.resume_after: # delete old cache at start of new run so that stale cache is not loaded by load_data_to_buffer # when singleprocess, this call is made (later in program flow) in the initialize_los step self.tvpb.tap_cache.cleanup() diff --git a/activitysim/core/mem.py b/activitysim/core/mem.py index 5e4d723e3..badb72299 100644 --- a/activitysim/core/mem.py +++ b/activitysim/core/mem.py @@ -49,14 +49,14 @@ def consolidate_logs(): Consolidate and aggregate subprocess mem logs """ - if not config.setting("multiprocess", False): + if not whale.settings.multiprocess: return - delete_originals = not config.setting("keep_mem_logs", False) + delete_originals = not whale.settings.keep_mem_logs omnibus_df = [] # for each multiprocess step - multiprocess_steps = config.setting("multiprocess_steps", []) + multiprocess_steps = whale.settings.multiprocess_steps for step in multiprocess_steps: step_name = step.get("name", None) diff --git a/activitysim/core/mp_tasks.py b/activitysim/core/mp_tasks.py index 1ee1485c4..0df5efe70 100644 --- a/activitysim/core/mp_tasks.py +++ b/activitysim/core/mp_tasks.py @@ -1429,7 +1429,7 @@ def skip_phase(phase): def find_breadcrumb(crumb, default=None): return old_breadcrumbs.get(step_name, {}).get(crumb, default) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = whale.settings.sharrow # - allocate shared data shared_data_buffers = {} diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index 3ca78cc18..add9f6301 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -51,7 +51,7 @@ def compute_utilities( """ trace_label = tracing.extend_trace_label(trace_label, "compute_utils") - with chunk.chunk_log(trace_label, settings=whale.settings): + with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer: logger.debug( f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers" ) @@ -87,6 +87,7 @@ def compute_utilities( trace_all_rows=trace, trace_label=trace_label, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) return utilities diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index e94a3fd51..b9e7d5c70 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -32,7 +32,6 @@ def 
random_rows(whale: workflow.Whale, df, n): - # only sample if df has more than n rows if len(df.index) > n: prng = whale.get_rn_generator().get_global_rng() @@ -43,7 +42,6 @@ def random_rows(whale: workflow.Whale, df, n): def uniquify_spec_index(spec): - # uniquify spec index inplace # ensure uniqueness of spec index by appending comment with dupe count # this allows us to use pandas dot to compute_utilities @@ -370,7 +368,6 @@ def get_segment_coefficients(whale: workflow.Whale, model_settings, segment_name def eval_nest_coefficients(nest_spec, coefficients, trace_label): def replace_coefficients(nest): if isinstance(nest, dict): - assert "coefficient" in nest coefficient_name = nest["coefficient"] if isinstance(coefficient_name, str): @@ -401,7 +398,6 @@ def eval_coefficients( coefficients: dict | pd.DataFrame, estimator, ): - spec = spec.copy() # don't clobber input spec if isinstance(coefficients, pd.DataFrame): @@ -450,6 +446,8 @@ def eval_utilities( log_alt_losers=False, zone_layer=None, spec_sh=None, + *, + chunk_sizer, ): """ Evaluate a utility function as defined in a spec file. @@ -537,7 +535,6 @@ def eval_utilities( # fixme - restore tracing and _check_for_variability if utilities is None or estimator or sharrow_enabled == "test": - trace_label = tracing.extend_trace_label(trace_label, "eval_utils") # avoid altering caller's passed-in locals_d parameter (they may be looping) @@ -557,11 +554,10 @@ def eval_utilities( exprs = spec.index expression_values = np.empty((spec.shape[0], choosers.shape[0])) - chunk.log_df(trace_label, "expression_values", expression_values) + chunk_sizer.log_df(trace_label, "expression_values", expression_values) i = 0 for expr, coefficients in zip(exprs, spec.values): - try: with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. @@ -598,7 +594,7 @@ def eval_utilities( expression_values[i] = expression_value i += 1 - chunk.log_df(trace_label, "expression_values", expression_values) + chunk_sizer.log_df(trace_label, "expression_values", expression_values) if estimator: df = pd.DataFrame( @@ -619,13 +615,12 @@ def eval_utilities( timelogger.mark("simple flow", False) utilities = pd.DataFrame(data=utilities, index=choosers.index, columns=spec.columns) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) timelogger.mark("assemble utilities") # sometimes tvpb will drop rows on the fly and we wind up with an empty # table of choosers. this will just bypass tracing in that case. if (trace_all_rows or have_trace_targets) and (len(choosers) > 0): - if trace_all_rows: trace_targets = pd.Series(True, index=choosers.index) else: @@ -684,7 +679,6 @@ def eval_utilities( ) if len(spec.columns) > 1: - for c in spec.columns: name = f"expression_value_{c}" @@ -739,10 +733,10 @@ def eval_utilities( timelogger.mark("sharrow test", True, logger, trace_label) del expression_values - chunk.log_df(trace_label, "expression_values", None) + chunk_sizer.log_df(trace_label, "expression_values", None) # no longer our problem - but our caller should re-log this... 
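[Editor's note: a sketch of how a caller might supply the new keyword-only `chunk_sizer` argument to eval_utilities when it is not already inside an adaptive-chunking loop, following the `chunk.chunk_log(...) as chunk_sizer` pattern used in pathbuilder.compute_utilities earlier in this patch. `whale`, `spec`, `choosers`, `locals_d`, `trace_label` and `have_trace_targets` are assumed to be in scope.]

    from activitysim.core import chunk, simulate

    with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer:
        utilities = simulate.eval_utilities(
            whale,
            spec,
            choosers,
            locals_d,
            trace_label,
            have_trace_targets,
            chunk_sizer=chunk_sizer,  # keyword-only in the refactored signature
        )
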
- chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) end_time = time.time() logger.info( @@ -795,7 +789,6 @@ def eval_variables(exprs, df, locals_d=None): locals_dict["df"] = df def to_array(x): - if x is None or np.isscalar(x): a = np.asanyarray([x] * len(df.index)) elif isinstance(x, pd.Series): @@ -965,7 +958,6 @@ def compute_nested_exp_utilities(raw_utilities, nest_spec): nested_utilities = pd.DataFrame(index=raw_utilities.index) for nest in logit.each_nest(nest_spec, post_order=True): - name = nest.name if nest.is_leaf: @@ -1014,7 +1006,6 @@ def compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label): nested_probabilities = pd.DataFrame(index=nested_exp_utilities.index) for nest in logit.each_nest(nest_spec, type="node", post_order=False): - probs = logit.utils_to_probs( nested_exp_utilities[nest.alternatives], trace_label=trace_label, @@ -1051,7 +1042,6 @@ def compute_base_probabilities(nested_probabilities, nests, spec): base_probabilities = pd.DataFrame(index=nested_probabilities.index) for nest in logit.each_nest(nests, type="leaf", post_order=False): - # skip root: it has a prob of 1 but we didn't compute a nested probability column for it ancestors = nest.ancestors[1:] @@ -1066,6 +1056,7 @@ def compute_base_probabilities(nested_probabilities, nests, spec): def eval_mnl( + whale: workflow.Whale, choosers, spec, locals_d, @@ -1076,6 +1067,8 @@ def eval_mnl( trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer, ): """ Run a simulation for when the model spec does not involve alternative @@ -1138,8 +1131,9 @@ def eval_mnl( have_trace_targets=have_trace_targets, estimator=estimator, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: tracing.trace_df( @@ -1151,10 +1145,10 @@ def eval_mnl( probs = logit.utils_to_probs( utilities, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: # report these now in case make_choices throws error on bad_choices @@ -1172,7 +1166,7 @@ def eval_mnl( choices, rands = logit.make_choices(whale, probs, trace_label=trace_label) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) if have_trace_targets: tracing.trace_df( @@ -1184,6 +1178,7 @@ def eval_mnl( def eval_nl( + whale: workflow.Whale, choosers, spec, nest_spec, @@ -1255,8 +1250,9 @@ def eval_nl( estimator=estimator, trace_column_names=trace_column_names, spec_sh=spec_sh, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "raw_utilities", raw_utilities) + chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: tracing.trace_df( @@ -1267,10 +1263,10 @@ def eval_nl( # exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) - chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities - chunk.log_df(trace_label, "raw_utilities", None) + chunk_sizer.log_df(trace_label, "raw_utilities", None) if have_trace_targets: tracing.trace_df( @@ -1283,15 +1279,15 @@ def eval_nl( 
nested_probabilities = compute_nested_probabilities( nested_exp_utilities, nest_spec, trace_label=trace_label ) - chunk.log_df(trace_label, "nested_probabilities", nested_probabilities) + chunk_sizer.log_df(trace_label, "nested_probabilities", nested_probabilities) if want_logsums: # logsum of nest root logsums = pd.Series(np.log(nested_exp_utilities.root), index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del nested_exp_utilities - chunk.log_df(trace_label, "nested_exp_utilities", None) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", None) if have_trace_targets: tracing.trace_df( @@ -1304,10 +1300,10 @@ def eval_nl( base_probabilities = compute_base_probabilities( nested_probabilities, nest_spec, spec ) - chunk.log_df(trace_label, "base_probabilities", base_probabilities) + chunk_sizer.log_df(trace_label, "base_probabilities", base_probabilities) del nested_probabilities - chunk.log_df(trace_label, "nested_probabilities", None) + chunk_sizer.log_df(trace_label, "nested_probabilities", None) if have_trace_targets: tracing.trace_df( @@ -1322,7 +1318,6 @@ def eval_nl( no_choices = (base_probabilities.sum(axis=1) - 1).abs() > BAD_PROB_THRESHOLD if no_choices.any(): - logit.report_bad_choices( no_choices, base_probabilities, @@ -1344,7 +1339,7 @@ def eval_nl( ) del base_probabilities - chunk.log_df(trace_label, "base_probabilities", None) + chunk_sizer.log_df(trace_label, "base_probabilities", None) if have_trace_targets: tracing.trace_df( @@ -1363,7 +1358,9 @@ def eval_nl( return choices +@workflow.func def _simple_simulate( + whale: workflow.Whale, choosers, spec, nest_spec, @@ -1376,6 +1373,8 @@ def _simple_simulate( trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer, ): """ Run an MNL or NL simulation for when the model spec does not involve alternative @@ -1427,6 +1426,7 @@ def _simple_simulate( if nest_spec is None: choices = eval_mnl( + whale, choosers, spec, locals_d, @@ -1437,9 +1437,11 @@ def _simple_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) else: choices = eval_nl( + whale, choosers, spec, nest_spec, @@ -1451,6 +1453,7 @@ def _simple_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) return choices @@ -1476,6 +1479,7 @@ def list_of_skims(skims): def simple_simulate( + whale: workflow.Whale, choosers, spec, nest_spec, @@ -1502,11 +1506,14 @@ def simple_simulate( result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - whale, choosers, chunk_size, trace_label - ): - + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label): choices = _simple_simulate( + whale, chooser_chunk, spec, nest_spec, @@ -1519,11 +1526,12 @@ def simple_simulate( trace_label=chunk_trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1534,6 +1542,7 @@ def simple_simulate( def simple_simulate_by_chunk_id( + whale: workflow.Whale, choosers, spec, 
nest_spec, @@ -1556,9 +1565,12 @@ def simple_simulate_by_chunk_id( i, chooser_chunk, chunk_trace_label, - ) in chunk.adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label): - + chunk_sizer, + ) in chunk.adaptive_chunked_choosers_by_chunk_id( + whale, choosers, chunk_size, trace_label + ): choices = _simple_simulate( + whale, chooser_chunk, spec, nest_spec, @@ -1582,7 +1594,9 @@ def simple_simulate_by_chunk_id( return choices -def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): +def eval_mnl_logsums( + whale: workflow.Whale, choosers, spec, locals_d, trace_label=None, *, chunk_sizer +): """ like eval_nl except return logsums instead of making choices @@ -1604,9 +1618,15 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): tracing.trace_df(choosers, "%s.choosers" % trace_label) utilities = eval_utilities( - whale, spec, choosers, locals_d, trace_label, have_trace_targets + whale, + spec, + choosers, + locals_d, + trace_label, + have_trace_targets, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: tracing.trace_df( @@ -1619,7 +1639,7 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # logsum is log of exponentiated utilities summed across columns of each chooser row logsums = np.log(np.exp(utilities.values).sum(axis=1)) logsums = pd.Series(logsums, index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) # trace utilities if have_trace_targets: @@ -1710,7 +1730,14 @@ def _replace_in_level(multiindex, level_name, *args, **kwargs): def eval_nl_logsums( - whale: workflow.Whale, choosers, spec, nest_spec, locals_d, trace_label=None + whale: workflow.Whale, + choosers, + spec, + nest_spec, + locals_d, + trace_label=None, + *, + chunk_sizer, ): """ like eval_nl except return logsums instead of making choices @@ -1740,8 +1767,9 @@ def eval_nl_logsums( trace_label=trace_label, have_trace_targets=have_trace_targets, spec_sh=spec_sh, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "raw_utilities", raw_utilities) + chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: tracing.trace_df( @@ -1752,15 +1780,15 @@ def eval_nl_logsums( # - exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) - chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities # done with raw_utilities - chunk.log_df(trace_label, "raw_utilities", None) + chunk_sizer.log_df(trace_label, "raw_utilities", None) # - logsums logsums = np.log(nested_exp_utilities.root) logsums = pd.Series(logsums, index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) if have_trace_targets: # add logsum to nested_exp_utilities for tracing @@ -1775,7 +1803,7 @@ def eval_nl_logsums( ) del nested_exp_utilities # done with nested_exp_utilities - chunk.log_df(trace_label, "nested_exp_utilities", None) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", None) return logsums @@ -1788,6 +1816,8 @@ def _simple_simulate_logsums( skims=None, locals_d=None, trace_label=None, + *, + chunk_sizer, ): """ like simple_simulate except return logsums instead of making choices @@ -1802,10 +1832,23 @@ def _simple_simulate_logsums( 
set_skim_wrapper_targets(choosers, skims) if nest_spec is None: - logsums = eval_mnl_logsums(choosers, spec, locals_d, trace_label=trace_label) + logsums = eval_mnl_logsums( + whale, + choosers, + spec, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, + ) else: logsums = eval_nl_logsums( - whale, choosers, spec, nest_spec, locals_d, trace_label=trace_label + whale, + choosers, + spec, + nest_spec, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, ) return logsums @@ -1845,9 +1888,15 @@ def simple_simulate_logsums( ) in chunk.adaptive_chunked_choosers( whale, choosers, chunk_size, trace_label, chunk_tag ): - logsums = _simple_simulate_logsums( - whale, chooser_chunk, spec, nest_spec, skims, locals_d, chunk_trace_label + whale, + chooser_chunk, + spec, + nest_spec, + skims, + locals_d, + chunk_trace_label, + chunk_sizer=chunk_sizer, ) result_list.append(logsums) diff --git a/activitysim/core/skim_dataset.py b/activitysim/core/skim_dataset.py index 0cae4062b..dc01c764d 100644 --- a/activitysim/core/skim_dataset.py +++ b/activitysim/core/skim_dataset.py @@ -665,10 +665,10 @@ def load_skim_dataset_to_shared_memory(whale, skim_tag="taz") -> xr.Dataset: ------- xarray.Dataset """ - from ..core.los import ONE_ZONE + from activitysim.core.los import ONE_ZONE # TODO:SHARROW: taz and maz are the same - network_los_preload = whale.get_injectable("network_los_preload", None) + network_los_preload = whale.get_injectable("network_los_preload") if network_los_preload is None: raise ValueError("missing network_los_preload") diff --git a/activitysim/core/steps/output.py b/activitysim/core/steps/output.py index 04de513b1..f7663d9cf 100644 --- a/activitysim/core/steps/output.py +++ b/activitysim/core/steps/output.py @@ -318,7 +318,7 @@ def write_tables(whale, output_dir): ) df = df.sort_index() - if config.setting("recode_pipeline_columns", True): + if whale.settings.recode_pipeline_columns: for colname, decode_instruction in table_decode_cols.items(): if "|" in decode_instruction: decode_filter, decode_instruction = decode_instruction.split("|") diff --git a/activitysim/core/test/extensions/steps.py b/activitysim/core/test/extensions/steps.py index 05eaa79fe..c02cd7f98 100644 --- a/activitysim/core/test/extensions/steps.py +++ b/activitysim/core/test/extensions/steps.py @@ -7,21 +7,21 @@ def step1(whale: workflow.Whale): table1 = pd.DataFrame({"c": [1, 2, 3]}) - inject.add_table("table1", table1) + whale.add_table("table1", table1) @workflow.step def step2(whale: workflow.Whale): table1 = pd.DataFrame({"c": [2, 4, 6]}) - inject.add_table("table2", table1) + whale.add_table("table2", table1) @workflow.step def step3(whale: workflow.Whale): table1 = pd.DataFrame({"c": [3, 6, 9]}) - inject.add_table("table3", table1) + whale.add_table("table3", table1) @workflow.step @@ -57,7 +57,7 @@ def step_forget_tab(whale: workflow.Whale): def create_households(whale: workflow.Whale, trace_hh_id): df = pd.DataFrame({"household_id": [1, 2, 3], "home_zone_id": {100, 100, 101}}) - inject.add_table("households", df) + whale.add_table("households", df) pipeline.get_rn_generator().add_channel("households", df) diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index ab100f6a9..09618dee2 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -73,7 +73,7 @@ def test_simple_simulate(data, spec): inject.add_injectable("settings", {"check_for_variability": False}) - choices = simulate.simple_simulate(choosers=data, 
spec=spec, nest_spec=None) + choices = simulate.simple_simulate(whale, choosers=data, spec=spec, nest_spec=None) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) @@ -83,7 +83,7 @@ def test_simple_simulate_chunked(data, spec): inject.add_injectable("settings", {"check_for_variability": False}) choices = simulate.simple_simulate( - choosers=data, spec=spec, nest_spec=None, chunk_size=2 + whale, choosers=data, spec=spec, nest_spec=None, chunk_size=2 ) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) diff --git a/activitysim/core/test/test_tracing.py b/activitysim/core/test/test_tracing.py index df88e0d36..82a40656c 100644 --- a/activitysim/core/test/test_tracing.py +++ b/activitysim/core/test/test_tracing.py @@ -179,7 +179,7 @@ def test_write_csv(capsys): tracing.config_logger() # should complain if df not a DataFrame or Series - tracing.write_csv(df="not a df or series", file_name="baddie") + tracing.write_csv(whale, df="not a df or series", file_name="baddie") out, err = capsys.readouterr() diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index 1aa416380..f677cae37 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -120,7 +120,6 @@ def delete_output_files(whale, file_type, ignore=None, subdir=None): directories = subdir or ["", "log", "trace"] for subdir in directories: - dir = output_dir.joinpath(output_dir, subdir) if subdir else output_dir if not dir.exists(): @@ -259,7 +258,6 @@ def print_summary(label, df, describe=False, value_counts=False): @workflow.step def initialize_traceable_tables(whale: workflow.Whale): - whale.set("traceable_table_ids", {}) @@ -331,7 +329,6 @@ def register_traceable_table(whale, table_name, df): ) new_traced_ids = [trace_hh_id] else: - # find first already registered ref_col we can use to slice this table ref_col = next((c for c in traceable_table_indexes if c in df.columns), None) @@ -378,7 +375,6 @@ def register_traceable_table(whale, table_name, df): def write_df_csv( df, file_path, index_label=None, columns=None, column_labels=None, transpose=True ): - need_header = not os.path.isfile(file_path) if columns: @@ -395,7 +391,6 @@ def write_df_csv( df_t.index.name = index_label if need_header: - if column_labels is None: column_labels = [None, None] if column_labels[0] is None: @@ -426,7 +421,6 @@ def write_df_csv( def write_series_csv( series, file_path, index_label=None, columns=None, column_labels=None ): - if isinstance(columns, str): series = series.rename(columns) elif isinstance(columns, list): @@ -441,7 +435,13 @@ def write_series_csv( def write_csv( - df, file_name, index_label=None, columns=None, column_labels=None, transpose=True + whale: workflow.Whale, + df, + file_name, + index_label=None, + columns=None, + column_labels=None, + transpose=True, ): """ Print write_csv @@ -468,7 +468,7 @@ def write_csv( if not file_name.endswith(".%s" % CSV_FILE_TYPE): file_name = "%s.%s" % (file_name, CSV_FILE_TYPE) - file_path = config.trace_file_path(file_name) + file_path = whale.filesystem.get_trace_file_path(file_name) if os.name == "nt": abs_path = os.path.abspath(file_path) @@ -592,13 +592,11 @@ def get_trace_target(whale, df, slicer, column=None): @workflow.func def trace_targets(whale: workflow.Whale, df, slicer=None, column=None): - target_ids, column = get_trace_target(whale, df, slicer, column) if target_ids is None: targets = None else: - if column is None: targets = 
df.index.isin(target_ids) else: @@ -610,13 +608,11 @@ def trace_targets(whale: workflow.Whale, df, slicer=None, column=None): @workflow.func def has_trace_targets(whale: workflow.Whale, df, slicer=None, column=None): - target_ids, column = get_trace_target(whale, df, slicer, column) if target_ids is None: found = False else: - if column is None: found = df.index.isin(target_ids).any() else: @@ -736,6 +732,7 @@ def trace_df( if df.shape[0] > 0: write_csv( + whale, df, file_name=label, index_label=(index_label or slicer), @@ -820,7 +817,6 @@ def interaction_trace_rows(interaction_df, choosers, sample_size=None): trace_ids = interaction_df[trace_rows].index.values else: - if slicer_column_name == choosers.index.name: trace_rows = np.in1d(choosers.index, targets) trace_ids = np.asanyarray(choosers[trace_rows].index) @@ -885,7 +881,6 @@ def trace_interaction_eval_results(trace_results, trace_ids, label): # if there are multiple targets, we want them in separate tables for readability for target in targets: - df_target = trace_results[trace_results[slicer_column_name] == target] # we want the transposed columns in predictable order diff --git a/other_resources/scripts/simulation.py b/other_resources/scripts/simulation.py index b00ee62f8..c42447808 100644 --- a/other_resources/scripts/simulation.py +++ b/other_resources/scripts/simulation.py @@ -73,7 +73,7 @@ def log_settings(injectables): t0 = tracing.print_elapsed_time() # cleanup if not resuming - if not config.setting("resume_after", False): + if not whale.settings.resume_after: cleanup_output_files() run_list = mp_tasks.get_run_list() diff --git a/other_resources/verification/simulation.py b/other_resources/verification/simulation.py index 890418332..6be9765fe 100644 --- a/other_resources/verification/simulation.py +++ b/other_resources/verification/simulation.py @@ -84,7 +84,7 @@ def log_settings(injectables): t0 = tracing.print_elapsed_time() # cleanup if not resuming - if not config.setting("resume_after", False): + if not whale.settings.resume_after: cleanup_output_files() run_list = mp_tasks.get_run_list() From 2d8767402248d1fdcbc8f5e0762bb4065851cbe3 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Sun, 12 Feb 2023 08:52:59 -0600 Subject: [PATCH 009/419] refactoring --- .../abm/models/atwork_subtour_destination.py | 8 +- .../abm/models/atwork_subtour_frequency.py | 6 +- .../abm/models/atwork_subtour_mode_choice.py | 4 +- .../abm/models/atwork_subtour_scheduling.py | 6 +- activitysim/abm/models/auto_ownership.py | 13 +- activitysim/abm/models/cdap.py | 30 ++-- .../abm/models/disaggregate_accessibility.py | 14 +- activitysim/abm/models/free_parking.py | 18 ++- activitysim/abm/models/initialize.py | 21 +-- activitysim/abm/models/initialize_los.py | 7 +- activitysim/abm/models/initialize_tours.py | 18 ++- .../abm/models/joint_tour_composition.py | 6 +- .../abm/models/joint_tour_destination.py | 8 +- .../abm/models/joint_tour_frequency.py | 6 +- .../abm/models/joint_tour_participation.py | 14 +- .../abm/models/joint_tour_scheduling.py | 10 +- .../abm/models/mandatory_tour_frequency.py | 16 +-- .../abm/models/non_mandatory_destination.py | 8 +- .../models/non_mandatory_tour_frequency.py | 8 +- .../abm/models/parking_location_choice.py | 10 +- activitysim/abm/models/school_escorting.py | 8 +- activitysim/abm/models/stop_frequency.py | 8 +- activitysim/abm/models/summarize.py | 14 +- .../abm/models/telecommute_frequency.py | 6 +- activitysim/abm/models/tour_mode_choice.py | 30 ++-- activitysim/abm/models/tour_od_choice.py | 8 +- 
.../models/tour_scheduling_probabilistic.py | 9 +- .../abm/models/transit_pass_ownership.py | 6 +- .../abm/models/transit_pass_subsidy.py | 6 +- .../abm/models/trip_departure_choice.py | 4 +- activitysim/abm/models/trip_destination.py | 34 +++-- activitysim/abm/models/trip_matrices.py | 8 +- activitysim/abm/models/trip_mode_choice.py | 6 +- activitysim/abm/models/trip_purpose.py | 13 +- .../models/trip_purpose_and_destination.py | 6 +- activitysim/abm/models/trip_scheduling.py | 4 +- .../abm/models/trip_scheduling_choice.py | 4 +- activitysim/abm/models/util/canonical_ids.py | 30 ++-- activitysim/abm/models/util/cdap.py | 64 +++++---- activitysim/abm/models/util/estimation.py | 4 +- activitysim/abm/models/util/logsums.py | 6 +- activitysim/abm/models/util/mode.py | 6 +- activitysim/abm/models/util/overlap.py | 4 +- .../models/util/school_escort_tours_trips.py | 8 +- activitysim/abm/models/util/test/test_cdap.py | 8 +- .../test/test_mandatory_tour_frequency.py | 4 +- .../test/test_vectorize_tour_scheduling.py | 7 +- .../abm/models/util/tour_destination.py | 8 +- activitysim/abm/models/util/tour_frequency.py | 13 +- activitysim/abm/models/util/tour_od.py | 24 ++-- .../abm/models/util/tour_scheduling.py | 17 ++- .../models/util/vectorize_tour_scheduling.py | 129 ++++++++++-------- activitysim/abm/models/vehicle_allocation.py | 6 +- activitysim/abm/models/vehicle_type_choice.py | 18 +-- activitysim/abm/models/work_from_home.py | 8 +- .../abm/tables/disaggregate_accessibility.py | 8 +- activitysim/abm/tables/households.py | 36 ++++- activitysim/abm/tables/persons.py | 25 ++-- activitysim/abm/tables/time_windows.py | 2 +- activitysim/abm/tables/vehicles.py | 2 +- activitysim/abm/test/run_multi_zone_mp.py | 2 +- .../abm/test/test_misc/test_summarize.py | 10 +- .../abm/test/test_pipeline/test_pipeline.py | 6 +- activitysim/benchmarking/componentwise.py | 10 +- activitysim/cli/run.py | 4 +- activitysim/core/chunk.py | 14 ++ activitysim/core/configuration/filesystem.py | 35 ++++- activitysim/core/expressions.py | 4 +- activitysim/core/input.py | 4 +- activitysim/core/pathbuilder.py | 8 +- activitysim/core/pathbuilder_cache.py | 6 +- activitysim/core/simulate.py | 47 ++++--- activitysim/core/skim_dict_factory.py | 4 +- activitysim/core/steps/output.py | 56 ++++---- activitysim/core/test/extensions/steps.py | 2 +- activitysim/core/test/test_simulate.py | 6 +- activitysim/core/test/test_tracing.py | 12 +- activitysim/core/tracing.py | 6 +- activitysim/core/workflow/__init__.py | 1 + activitysim/core/workflow/state.py | 28 ++-- activitysim/core/workflow/steps.py | 35 ++++- .../example_estimation/scripts/infer.py | 20 +-- 82 files changed, 696 insertions(+), 466 deletions(-) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 98bb2ba41..7d423faa4 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -16,7 +16,7 @@ def atwork_subtour_destination( ): trace_label = "atwork_subtour_destination" model_settings_file_name = "atwork_subtour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) future_settings = { "SIZE_TERM_SELECTOR": "atwork", @@ -53,11 +53,9 @@ def atwork_subtour_destination( estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - 
inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index e72c57ca0..3a213bd2e 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -36,11 +36,11 @@ def atwork_subtour_frequency(whale: workflow.Whale, tours, persons_merged, chunk add_null_results(whale, trace_label, tours) return - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index 05faa174d..ccc1fba03 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -26,7 +26,7 @@ def atwork_subtour_mode_choice( trace_hh_id = whale.settings.trace_hh_id model_settings_file_name = "tour_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "tour_mode" @@ -187,7 +187,7 @@ def atwork_subtour_mode_choice( # - annotate tours table if model_settings.get("annotate_tours"): - tours = inject.get_table("tours").to_frame() + tours = whale.get_dataframe("tours") expressions.assign_columns( whale, df=tours, diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index 7cbe03b28..5a7bde235 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -42,12 +42,12 @@ def atwork_subtour_scheduling( tracing.no_results(trace_label) return - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "atwork_subtour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip") - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py 
index 646bc5b7d..79d4c293b 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -2,6 +2,8 @@ # See full license in LICENSE.txt. import logging +import pandas as pd + from activitysim.abm.models.util import estimation from activitysim.core import config, simulate, tracing, workflow @@ -18,13 +20,12 @@ def auto_ownership_simulate( """ trace_label = "auto_ownership_simulate" model_settings_file_name = "auto_ownership.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id estimator = estimation.manager.begin_estimation(whale, "auto_ownership") - - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) @@ -32,7 +33,7 @@ def auto_ownership_simulate( nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) - choosers = households_merged.to_frame() + choosers = households_merged logger.info("Running %s with %d households", trace_label, len(choosers)) @@ -63,8 +64,6 @@ def auto_ownership_simulate( estimator.write_override_choices(choices) estimator.end_estimation() - households = households.to_frame() - # no need to reindex as we used all households households["auto_ownership"] = choices diff --git a/activitysim/abm/models/cdap.py b/activitysim/abm/models/cdap.py index f7eb40020..a0a18ee58 100644 --- a/activitysim/abm/models/cdap.py +++ b/activitysim/abm/models/cdap.py @@ -13,7 +13,11 @@ @workflow.step def cdap_simulate( - whale: workflow.Whale, persons_merged, persons, households, chunk_size + whale: workflow.Whale, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, + chunk_size, ): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of @@ -26,7 +30,7 @@ def cdap_simulate( """ trace_label = "cdap" - model_settings = config.read_model_settings("cdap.yaml") + model_settings = whale.filesystem.read_model_settings("cdap.yaml") trace_hh_id = whale.settings.trace_hh_id person_type_map = model_settings.get("PERSON_TYPE_MAP", None) assert ( @@ -34,11 +38,11 @@ def cdap_simulate( ), f"Expected to find PERSON_TYPE_MAP setting in cdap.yaml" estimator = estimation.manager.begin_estimation(whale, "cdap") - cdap_indiv_spec = simulate.read_model_spec( + cdap_indiv_spec = whale.filesystem.read_model_spec( file_name=model_settings["INDIV_AND_HHSIZE1_SPEC"] ) - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) cdap_indiv_spec = simulate.eval_coefficients( whale, cdap_indiv_spec, coefficients_df, estimator ) @@ -77,12 +81,10 @@ def cdap_simulate( EXCEPT that the values computed are relative proportions, not utilities (i.e. 
values are not exponentiated before being normalized to probabilities summing to 1.0) """ - cdap_fixed_relative_proportions = simulate.read_model_spec( + cdap_fixed_relative_proportions = whale.filesystem.read_model_spec( file_name=model_settings["FIXED_RELATIVE_PROPORTIONS_SPEC"] ) - persons_merged = persons_merged.to_frame() - # add tour-based chunk_id so we can chunk all trips in tour together assert "chunk_id" not in persons_merged.columns unique_household_ids = persons_merged.household_id.unique() @@ -104,10 +106,12 @@ def cdap_simulate( # (also when multiprocessing locutor might not see all household sizes) logger.info("Pre-building cdap specs") for hhsize in range(2, cdap.MAX_HHSIZE + 1): - spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True) - if inject.get_injectable("locutor", False): + spec = cdap.build_cdap_spec( + whale, cdap_interaction_coefficients, hhsize, cache=True + ) + if whale.get_injectable("locutor", False): spec.to_csv( - config.output_file_path("cdap_spec_%s.csv" % hhsize), index=True + whale.get_output_file_path("cdap_spec_%s.csv" % hhsize), index=True ) if estimator: @@ -125,7 +129,7 @@ def cdap_simulate( ) estimator.write_choosers(persons_merged) for hhsize in range(2, cdap.MAX_HHSIZE + 1): - spec = cdap.get_cached_spec(hhsize) + spec = cdap.get_cached_spec(whale, hhsize) estimator.write_table(spec, "spec_%s" % hhsize, append=False) logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) @@ -149,9 +153,6 @@ def cdap_simulate( estimator.write_override_choices(choices) estimator.end_estimation() - # - assign results to persons table and annotate - persons = persons.to_frame() - choices = choices.reindex(persons.index) persons["cdap_activity"] = choices @@ -165,7 +166,6 @@ def cdap_simulate( whale.add_table("persons", persons) # - annotate households table - households = households.to_frame() expressions.assign_columns( whale, df=households, diff --git a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index da07a73cf..96954b7ba 100644 --- a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -20,7 +20,7 @@ def read_disaggregate_accessibility_yaml(whale: workflow.Whale, file_name): """ Adds in default table suffixes 'proto_' if not defined in the settings file """ - model_settings = config.read_model_settings(file_name) + model_settings = whale.filesystem.read_model_settings(file_name) if not model_settings.get("suffixes"): model_settings["suffixes"] = { "SUFFIX": "proto_", @@ -521,12 +521,12 @@ def inject_tables(self): # Update canonical tables lists inject.add_injectable( "traceable_tables", - inject.get_injectable("traceable_tables") + list(self.proto_pop.keys()), + whale.get_injectable("traceable_tables") + list(self.proto_pop.keys()), ) for tablename, df in self.proto_pop.items(): whale.add_table(tablename, df) self.whale.get_rn_generator().add_channel(tablename, df) - tracing.register_traceable_table(tablename, df) + tracing.register_traceable_table(whale, tablename, df) def annotate_tables(self, whale: workflow.Whale): # Extract annotations @@ -588,7 +588,7 @@ def get_disaggregate_logsums( ]: trace_label = tracing.extend_trace_label(model_name, "accessibilities") print("Running model {}".format(trace_label)) - model_settings = config.read_model_settings(model_name + ".yaml") + model_settings = whale.filesystem.read_model_settings(model_name + ".yaml") model_settings["SAMPLE_SIZE"] = 
disagg_model_settings.get( "DESTINATION_SAMPLE_SIZE" ) @@ -693,12 +693,12 @@ def compute_disaggregate_accessibility( # Re-Register tables in this step, necessary for multiprocessing for tablename in ["proto_households", "proto_persons", "proto_tours"]: df = inject.get_table(tablename).to_frame() - traceables = inject.get_injectable("traceable_tables") + traceables = whale.get_injectable("traceable_tables") if tablename not in whale.get_rn_generator().channels: whale.get_rn_generator().add_channel(tablename, df) if tablename not in traceables: inject.add_injectable("traceable_tables", traceables + [tablename]) - tracing.register_traceable_table(tablename, df) + tracing.register_traceable_table(whale, tablename, df) del df # Run location choice @@ -752,7 +752,7 @@ def compute_disaggregate_accessibility( # Drop any prematurely added traceables for trace in [ - x for x in inject.get_injectable("traceable_tables") if "proto_" not in x + x for x in whale.get_injectable("traceable_tables") if "proto_" not in x ]: tracing.deregister_traceable_table(whale, trace) diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index 965ebca4c..bba1e9838 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -2,6 +2,8 @@ # See full license in LICENSE.txt. import logging +import pandas as pd + from activitysim.abm.models.util import estimation from activitysim.core import config, expressions, simulate, tracing, workflow @@ -9,18 +11,23 @@ @workflow.step -def free_parking(whale: workflow.Whale, persons_merged, persons, chunk_size): +def free_parking( + whale: workflow.Whale, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + chunk_size, +): """ """ trace_label = "free_parking" model_settings_file_name = "free_parking.yaml" trace_hh_id = whale.settings.trace_hh_id - choosers = persons_merged.to_frame() + choosers = pd.DataFrame(persons_merged) choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "free_parking") constants = config.get_model_constants(model_settings) @@ -40,8 +47,8 @@ def free_parking(whale: workflow.Whale, persons_merged, persons, chunk_size): trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) @@ -77,7 +84,6 @@ def free_parking(whale: workflow.Whale, persons_merged, persons, chunk_size): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["free_parking_at_work"] = ( choices.reindex(persons.index).fillna(0).astype(bool) ) diff --git a/activitysim/abm/models/initialize.py b/activitysim/abm/models/initialize.py index b6e3eaeec..cfea3a6bc 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -168,35 +168,28 @@ def initialize_households(whale: workflow.Whale): chunk_sizer.log_df(trace_label, "person_windows", person_windows) -@inject.injectable(cache=True) -def 
preload_injectables(): +@workflow.cached_object +def preload_injectables(whale: workflow.Whale): """ preload bulky injectables up front - stuff that isn't inserted into the pipeline """ logger.info("preload_injectables") - inject.add_step("track_skim_usage", track_skim_usage) - inject.add_step("write_data_dictionary", write_data_dictionary) - inject.add_step("write_tables", write_tables) + # whale.add_step("track_skim_usage", track_skim_usage) + # inject.add_step("write_data_dictionary", write_data_dictionary) + # inject.add_step("write_tables", write_tables) table_list = whale.settings.input_table_list # default ActivitySim table names and indices if table_list is None: - logger.warning( - "No 'input_table_list' found in settings. This will be a " - "required setting in upcoming versions of ActivitySim." - ) - - new_settings = inject.get_injectable("settings") - new_settings["input_table_list"] = DEFAULT_TABLE_LIST - inject.add_injectable("settings", new_settings) + raise ValueError("No 'input_table_list' found in settings.") # FIXME undocumented feature if whale.settings.write_raw_tables: # write raw input tables as csv (before annotation) - csv_dir = config.output_file_path("raw_tables") + csv_dir = whale.get_output_file_path("raw_tables") if not os.path.exists(csv_dir): os.makedirs(csv_dir) # make directory if needed diff --git a/activitysim/abm/models/initialize_los.py b/activitysim/abm/models/initialize_los.py index 6804453ef..06b018ead 100644 --- a/activitysim/abm/models/initialize_los.py +++ b/activitysim/abm/models/initialize_los.py @@ -135,7 +135,12 @@ def compute_utilities_for_attribute_tuple( chunk_tag = "initialize_tvpb" # all attribute_combinations can use same cached data for row_size calc - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( whale, choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag ): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index 4e360a756..e12785ca0 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -17,11 +17,11 @@ SURVEY_PARTICIPANT_ID = "external_participant_id" ASIM_TOUR_ID = "tour_id" ASIM_PARENT_TOUR_ID = "parent_tour_id" -REQUIRED_TOUR_COLUMNS = set(["person_id", "tour_category", "tour_type"]) +REQUIRED_TOUR_COLUMNS = {"person_id", "tour_category", "tour_type"} -def patch_tour_ids(tours): - def set_tour_index(tours, parent_tour_num_col, is_joint): +def patch_tour_ids(whale: workflow.Whale, tours): + def set_tour_index(whale: workflow.Whale, tours, parent_tour_num_col, is_joint): group_cols = ["person_id", "tour_category", "tour_type"] if "parent_tour_num" in tours: @@ -32,7 +32,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) return tf.set_tour_index( - tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + whale, tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint ) assert REQUIRED_TOUR_COLUMNS.issubset( @@ -48,6 +48,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # mandatory tours mandatory_tours = set_tour_index( + whale, tours[tours.tour_category == "mandatory"], parent_tour_num_col=None, is_joint=False, @@ -60,6 +61,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # non_mandatory tours non_mandatory_tours = 
set_tour_index( + whale, tours[tours.tour_category == "non_mandatory"], parent_tour_num_col=None, is_joint=False, @@ -93,7 +95,9 @@ def initialize_tours(whale: workflow.Whale, network_los, households, persons): tours = tours[tours.person_id.isin(persons.index)] # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above - model_settings = config.read_model_settings("initialize_tours.yaml", mandatory=True) + model_settings = whale.filesystem.read_model_settings( + "initialize_tours.yaml", mandatory=True + ) expressions.assign_columns( whale, df=tours, @@ -105,7 +109,7 @@ def initialize_tours(whale: workflow.Whale, network_los, households, persons): if skip_patch_tour_ids: pass else: - tours = patch_tour_ids(tours) + tours = patch_tour_ids(whale, tours) assert tours.index.name == "tour_id" # replace table function with dataframe @@ -113,7 +117,7 @@ def initialize_tours(whale: workflow.Whale, network_los, households, persons): whale.get_rn_generator().add_channel("tours", tours) - tracing.register_traceable_table("tours", tours) + tracing.register_traceable_table(whale, "tours", tours) logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours") logger.debug(f"{len(households.index.unique())} unique household_ids in households") diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index 0db8a099b..51b8db297 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -34,7 +34,7 @@ def joint_tour_composition( add_null_results(whale, trace_label, tours) return - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "joint_tour_composition") # - only interested in households with joint_tours @@ -69,8 +69,8 @@ def joint_tour_composition( ) # - simple_simulate - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 11c474703..f8b9460a8 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -29,7 +29,7 @@ def joint_tour_destination( trace_label = "joint_tour_destination" model_settings_file_name = "joint_tour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") @@ -58,11 +58,9 @@ def joint_tour_destination( estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) 
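[Editor's note: the same substitution pattern recurs throughout these model steps: settings, specs and coefficients are read through the `whale.filesystem` object rather than the module-level `config`/`simulate` helpers, and tables come from `whale.get_dataframe` rather than `inject.get_table(...).to_frame()`. A condensed sketch of the idiom, assuming `whale` and `estimator` are in scope and using a hypothetical `example_model.yaml` settings file.]

    from activitysim.core import simulate

    model_settings = whale.filesystem.read_model_settings("example_model.yaml")
    model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"])
    coefficients_df = whale.filesystem.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(whale, model_spec, coefficients_df, estimator)
    land_use = whale.get_dataframe("land_use")
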
estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index e8500538e..da75746b1 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -25,7 +25,7 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) estimator = estimation.manager.begin_estimation(whale, "joint_tour_frequency") - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) alternatives = simulate.read_model_alts( whale, "joint_tour_frequency_alternatives.csv", set_index="alt" @@ -62,8 +62,8 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) trace_label=trace_label, ) - model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 3df221ad1..bcb949a9e 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -154,7 +154,9 @@ def participants_chooser(probs, choosers, spec, trace_label): assert probs.index.equals(choosers.index) # choice is boolean (participate or not) - model_settings = config.read_model_settings("joint_tour_participation.yaml") + model_settings = whale.filesystem.read_model_settings( + "joint_tour_participation.yaml" + ) choice_col = model_settings.get("participation_choice", "participate") assert ( @@ -241,7 +243,7 @@ def participants_chooser(probs, choosers, spec, trace_label): def annotate_jtp(model_settings, trace_label): # - annotate persons - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") expressions.assign_columns( whale, df=persons, @@ -272,7 +274,7 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk """ trace_label = "joint_tour_participation" model_settings_file_name = "joint_tour_participation.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id tours = tours.to_frame() @@ -287,7 +289,7 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk # - create joint_tour_participation_candidates table candidates = joint_tour_participation_candidates(joint_tours, persons_merged) - tracing.register_traceable_table("joint_tour_participants", candidates) + tracing.register_traceable_table(whale, "joint_tour_participants", candidates) whale.get_rn_generator().add_channel("joint_tour_participants", candidates) logger.info( @@ -315,8 +317,8 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk estimator = estimation.manager.begin_estimation(whale, "joint_tour_participation") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) 
+ model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index 6350f64fd..7e8bec68b 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -24,7 +24,7 @@ def joint_tour_scheduling( trace_label = "joint_tour_scheduling" model_settings_file_name = "joint_tour_scheduling.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id joint_tours = tours[tours.tour_category == "joint"] @@ -35,7 +35,7 @@ def joint_tour_scheduling( return # use inject.get_table as this won't exist if there are no joint_tours - joint_tour_participants = inject.get_table("joint_tour_participants").to_frame() + joint_tour_participants = whale.get_dataframe("joint_tour_participants") persons_merged = persons_merged.to_frame() @@ -69,13 +69,13 @@ def joint_tour_scheduling( trace_label=trace_label, ) - timetable = inject.get_injectable("timetable") + timetable = whale.get_injectable("timetable") estimator = estimation.manager.begin_estimation(whale, "joint_tour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip", False) - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index 201b8c751..ad168bccb 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -14,7 +14,7 @@ def add_null_results(whale, trace_label, mandatory_tour_frequency_settings): logger.info("Skipping %s: add_null_results", trace_label) - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") persons["mandatory_tour_frequency"] = "" tours = pd.DataFrame() @@ -44,9 +44,9 @@ def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): model_settings_file_name = "mandatory_tour_frequency.yaml" trace_hh_id = whale.settings.trace_hh_id - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) - choosers = persons_merged.to_frame() + choosers = persons_merged # filter based on results of CDAP choosers = choosers[choosers.cdap_activity == "M"] logger.info("Running mandatory_tour_frequency with %d persons", len(choosers)) @@ -71,8 +71,8 @@ def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): estimator = estimation.manager.begin_estimation(whale, "mandatory_tour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) 
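The spec-loading idiom changes the same way in every model above: the utility spec and its named coefficients are read through whale.filesystem and then resolved with simulate.eval_coefficients, which now also takes the whale. A hedged sketch of that idiom, assuming model_settings was read as shown earlier and estimator is either an estimation wrapper or None:

    from activitysim.core import simulate

    # assumes model_settings carries "SPEC" and "COEFFICIENTS" entries, as in the models above
    model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"])
    coefficients_df = whale.filesystem.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(whale, model_spec, coefficients_df, estimator)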
model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) @@ -121,15 +121,15 @@ def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): choosers["mandatory_tour_frequency"] = choices.reindex(choosers.index) mandatory_tours = process_mandatory_tours( - persons=choosers, mandatory_tour_frequency_alts=alternatives + whale, persons=choosers, mandatory_tour_frequency_alts=alternatives ) tours = whale.extend_table("tours", mandatory_tours) - tracing.register_traceable_table("tours", mandatory_tours) + tracing.register_traceable_table(whale, "tours", mandatory_tours) whale.get_rn_generator().add_channel("tours", mandatory_tours) # - annotate persons - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") # need to reindex as we only handled persons with cdap_activity == 'M' persons["mandatory_tour_frequency"] = ( diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index 2852eae44..5641a5dc7 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -24,7 +24,7 @@ def non_mandatory_tour_destination( trace_label = "non_mandatory_tour_destination" model_settings_file_name = "non_mandatory_tour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") @@ -67,11 +67,9 @@ def non_mandatory_tour_destination( estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 97f41d257..a774513fd 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -149,7 +149,7 @@ def non_mandatory_tour_frequency( trace_label = "non_mandatory_tour_frequency" model_settings_file_name = "non_mandatory_tour_frequency.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) # FIXME kind of tacky both that we know to add this here and del it below # 'tot_tours' is used in model_spec expressions @@ -179,7 +179,7 @@ def non_mandatory_tour_frequency( constants = config.get_model_constants(model_settings) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) spec_segments = model_settings.get("SPEC_SEGMENTS", {}) # segment by person type and pick the right spec for each person type @@ -205,7 +205,7 @@ def non_mandatory_tour_frequency( whale, model_name=segment_name, bundle_name="non_mandatory_tour_frequency" ) - coefficients_df = 
simulate.read_model_coefficients(segment_settings) + coefficients_df = whale.filesystem.read_model_coefficients(segment_settings) segment_spec = simulate.eval_coefficients( whale, segment_spec, coefficients_df, estimator ) @@ -375,7 +375,7 @@ def non_mandatory_tour_frequency( whale.extend_table("tours", non_mandatory_tours) - tracing.register_traceable_table("tours", non_mandatory_tours) + tracing.register_traceable_table(whale, "tours", non_mandatory_tours) whale.get_rn_generator().add_channel("tours", non_mandatory_tours) if whale.is_table("school_escort_tours"): diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index cb46c95f5..224a6ebaa 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -48,7 +48,7 @@ def wrap_skims(model_settings): dict containing skims, keyed by canonical names relative to tour orientation """ - network_los = inject.get_injectable("network_los") + network_los = whale.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() origin = model_settings["TRIP_ORIGIN"] @@ -79,7 +79,7 @@ def wrap_skims(model_settings): def get_spec_for_segment(model_settings, spec_name, segment): - omnibus_spec = simulate.read_model_spec(file_name=model_settings[spec_name]) + omnibus_spec = whale.filesystem.read_model_spec(file_name=model_settings[spec_name]) spec = omnibus_spec[[segment]] @@ -114,7 +114,7 @@ def parking_destination_simulate( spec = get_spec_for_segment(model_settings, "SPECIFICATION", segment_name) - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) spec = simulate.eval_coefficients(whale, spec, coefficients_df, None) alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] @@ -290,7 +290,9 @@ def parking_location( """ trace_label = "parking_location" - model_settings = config.read_model_settings("parking_location_choice.yaml") + model_settings = whale.filesystem.read_model_settings( + "parking_location_choice.yaml" + ) trace_hh_id = whale.settings.trace_hh_id alt_destination_col_name = model_settings["ALT_DEST_COL_NAME"] diff --git a/activitysim/abm/models/school_escorting.py b/activitysim/abm/models/school_escorting.py index 78bd260a7..7254bd2c7 100644 --- a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -359,7 +359,7 @@ def school_escorting( """ trace_label = "school_escorting_simulate" model_settings_file_name = "school_escorting.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id alts = simulate.read_model_alts(whale, model_settings["ALTS"], set_index="Alt") @@ -383,10 +383,10 @@ def school_escorting( whale, "school_escorting_" + stage ) - model_spec_raw = simulate.read_model_spec( + model_spec_raw = whale.filesystem.read_model_spec( file_name=model_settings[stage.upper() + "_SPEC"] ) - coefficients_df = simulate.read_model_coefficients( + coefficients_df = whale.filesystem.read_model_coefficients( file_name=model_settings[stage.upper() + "_COEFFICIENTS"] ) model_spec = simulate.eval_coefficients( @@ -528,7 +528,7 @@ def school_escorting( whale.add_table("school_escort_trips", school_escort_trips) # updating timetable object with pure escort tours so joint tours do not schedule ontop - timetable = inject.get_injectable("timetable") 
+ timetable = whale.get_injectable("timetable") # Need to do this such that only one person is in nth_tours # thus, looping through tour_category and tour_num diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 172331dbb..236bcb9f9 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -50,7 +50,7 @@ def stop_frequency( model_settings_file_name = "stop_frequency.yaml" trace_hh_id = whale.settings.trace_hh_id - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) tours = tours.to_frame() tours_merged = tours_merged.to_frame() @@ -121,13 +121,15 @@ def stop_frequency( whale, model_name=segment_name, bundle_name="stop_frequency" ) - segment_spec = simulate.read_model_spec(file_name=segment_settings["SPEC"]) + segment_spec = whale.filesystem.read_model_spec( + file_name=segment_settings["SPEC"] + ) assert segment_spec is not None, ( "spec for segment_type %s not found" % segment_name ) coefficients_file_name = segment_settings["COEFFICIENTS"] - coefficients_df = simulate.read_model_coefficients( + coefficients_df = whale.filesystem.read_model_coefficients( file_name=coefficients_file_name ) segment_spec = simulate.eval_coefficients( diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index 56a790c3a..9ab09dcf8 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -225,12 +225,12 @@ def summarize( """ trace_label = "summarize" model_settings_file_name = "summarize.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) output_location = ( model_settings["OUTPUT"] if "OUTPUT" in model_settings else "summaries" ) - os.makedirs(config.output_file_path(output_location), exist_ok=True) + os.makedirs(whale.get_output_file_path(output_location), exist_ok=True) spec = pd.read_csv( whale.filesystem.get_config_file_path(model_settings["SPECIFICATION"]), @@ -319,10 +319,12 @@ def summarize( # Output pipeline tables for expression development if model_settings["EXPORT_PIPELINE_TABLES"] is True: pipeline_table_dir = os.path.join(output_location, "pipeline_tables") - os.makedirs(config.output_file_path(pipeline_table_dir), exist_ok=True) + os.makedirs(whale.get_output_file_path(pipeline_table_dir), exist_ok=True) for name, df in locals_d.items(): df.to_csv( - config.output_file_path(os.path.join(pipeline_table_dir, f"{name}.csv")) + whale.get_output_file_path( + os.path.join(pipeline_table_dir, f"{name}.csv") + ) ) # Add classification functions to locals @@ -350,6 +352,8 @@ def summarize( resultset = eval(expr, globals(), locals_d) resultset.to_csv( - config.output_file_path(os.path.join(output_location, f"{out_file}.csv")), + whale.get_output_file_path( + os.path.join(output_location, f"{out_file}.csv") + ), index=False, ) diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index 213f7c3dc..33b402019 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -30,7 +30,7 @@ def telecommute_frequency( logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) 
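In the summarize hunks just above, output paths also move onto the whale: config.output_file_path(...) becomes whale.get_output_file_path(...). A small sketch of writing a CSV into a subdirectory of the output folder, assuming a DataFrame some_df produced earlier (the directory and file names are made up for illustration):

    import os

    output_location = "summaries"  # default OUTPUT location used by summarize above
    os.makedirs(whale.get_output_file_path(output_location), exist_ok=True)

    some_df.to_csv(
        whale.get_output_file_path(os.path.join(output_location, "example.csv")),
        index=False,
    )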
estimator = estimation.manager.begin_estimation(whale, "telecommute_frequency") constants = config.get_model_constants(model_settings) @@ -50,8 +50,8 @@ def telecommute_frequency( trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index c8c841cd2..e7f5f19cc 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -24,7 +24,9 @@ """ -def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_label): +def get_alts_from_segmented_nested_logit( + whale: workflow.Whale, model_settings, segment_name, trace_label +): """Infer alts from logit spec Parameters @@ -39,7 +41,9 @@ def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_lab """ nest_spec = config.get_logit_model_settings(model_settings) - coefficients = simulate.get_segment_coefficients(model_settings, segment_name) + coefficients = whale.filesystem.get_segment_coefficients( + model_settings, segment_name + ) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) tour_mode_alts = [] for nest in logit.each_nest(nest_spec): @@ -49,7 +53,9 @@ def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_lab return tour_mode_alts -def create_logsum_trips(tours, segment_column_name, model_settings, trace_label): +def create_logsum_trips( + whale: workflow.Whale, tours, segment_column_name, model_settings, trace_label +): """ Construct table of trips from half-tours (1 inbound, 1 outbound) for each tour-mode. @@ -66,7 +72,7 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) pandas.DataFrame Table of trips: 2 per tour, with O/D and purpose inherited from tour """ - stop_frequency_alts = inject.get_injectable("stop_frequency_alts") + stop_frequency_alts = whale.get_injectable("stop_frequency_alts") stop_freq = "0out_0in" # no intermediate stops tours["stop_frequency"] = stop_freq tours["primary_purpose"] = tours["tour_purpose"] @@ -80,7 +86,7 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) # to get a set of coefficients from the spec segment_name = tours.iloc[0][segment_column_name] tour_mode_alts = get_alts_from_segmented_nested_logit( - model_settings, segment_name, trace_label + whale, model_settings, segment_name, trace_label ) # repeat rows from the trips table iterating over tour mode @@ -94,7 +100,7 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) return logsum_trips -def append_tour_leg_trip_mode_choice_logsums(tours): +def append_tour_leg_trip_mode_choice_logsums(whale: workflow.Whale, tours): """Creates trip mode choice logsum column in tours table for each tour mode and leg Parameters @@ -106,7 +112,7 @@ def append_tour_leg_trip_mode_choice_logsums(tours): tours : pd.DataFrame Adds two * n_modes logsum columns to each tour row, e.g. 
"logsum_DRIVE_outbound" """ - trips = inject.get_table("trips").to_frame() + trips = whale.get_dataframe("trips") trip_dir_mode_logsums = trips.pivot( index="tour_id", columns=["tour_mode", "outbound"], @@ -145,12 +151,12 @@ def get_trip_mc_logsums_for_all_modes( # create pseudo-trips from tours for all tour modes logsum_trips = create_logsum_trips( - tours, segment_column_name, model_settings, trace_label + whale, tours, segment_column_name, model_settings, trace_label ) # temporarily register trips in the pipeline whale.add_table("trips", logsum_trips) - tracing.register_traceable_table("trips", logsum_trips) + tracing.register_traceable_table(whale, "trips", logsum_trips) whale.get_rn_generator().add_channel("trips", logsum_trips) # run trip mode choice on pseudo-trips. use orca instead of pipeline to @@ -158,11 +164,11 @@ def get_trip_mc_logsums_for_all_modes( orca.run(["trip_mode_choice"]) # add trip mode choice logsums as new cols in tours - tours = append_tour_leg_trip_mode_choice_logsums(tours) + tours = append_tour_leg_trip_mode_choice_logsums(whale, tours) # de-register logsum trips table whale.get_rn_generator().drop_channel("trips") - tracing.deregister_traceable_table("trips") + tracing.deregister_traceable_table(whale, "trips") return tours @@ -176,7 +182,7 @@ def tour_mode_choice_simulate( """ trace_label = "tour_mode_choice" model_settings_file_name = "tour_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "tour_mode" diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index a3cc9d9fc..b1722b3e3 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -49,7 +49,7 @@ def tour_od_choice( trace_label = "tour_od_choice" model_settings_file_name = "tour_od_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) origin_col_name = model_settings["ORIG_COL_NAME"] dest_col_name = model_settings["DEST_COL_NAME"] alt_id_col = tour_od.get_od_id_col(origin_col_name, dest_col_name) @@ -75,11 +75,9 @@ def tour_od_choice( estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(alt_id_col) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_od.run_tour_od( diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index 609897ddc..dd0cc9ff5 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -48,7 +48,12 @@ def run_tour_scheduling_probabilistic( series of chosen alternative indices for each chooser """ result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( whale, tours_df, chunk_size, 
trace_label, trace_label ): choices = ps.make_scheduling_choices( @@ -95,7 +100,7 @@ def tour_scheduling_probabilistic( trace_label = "tour_scheduling_probabilistic" model_settings_file_name = "tour_scheduling_probabilistic.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) depart_alt_base = model_settings.get("depart_alt_base", 0) scheduling_probs_filepath = whale.filesystem.get_config_file_path( model_settings["PROBS_SPEC"] diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index 76624e3cd..9720c63f7 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -24,7 +24,7 @@ def transit_pass_ownership( choosers = persons_merged.to_frame() logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "transit_pass_ownership") constants = config.get_model_constants(model_settings) @@ -44,8 +44,8 @@ def transit_pass_ownership( trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index b7ab2ae71..08dca03e4 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -24,7 +24,7 @@ def transit_pass_subsidy( choosers = persons_merged.to_frame() logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "transit_pass_subsidy") constants = config.get_model_constants(model_settings) @@ -44,8 +44,8 @@ def transit_pass_subsidy( trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index fdb8c47f0..09a42527b 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -470,9 +470,9 @@ def trip_departure_choice( whale: workflow.Whale, trips, trips_merged, skim_dict, chunk_size, trace_hh_id ): trace_label = "trip_departure_choice" - model_settings = config.read_model_settings("trip_departure_choice.yaml") + model_settings = whale.filesystem.read_model_settings("trip_departure_choice.yaml") - spec = simulate.read_model_spec(file_name=model_settings["SPECIFICATION"]) + spec = 
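As seen in tour_scheduling_probabilistic above (and again in trip_purpose below), chunk.adaptive_chunked_choosers now yields a fourth element, the chunk sizer, so chunked loops unpack it explicitly and log intermediate frames through it. A sketch of the updated loop shape, assuming a choosers DataFrame and the chunk arguments used in those hunks; make_choices is a hypothetical stand-in for the per-chunk work:

    from activitysim.core import chunk

    result_list = []
    for i, chooser_chunk, chunk_trace_label, chunk_sizer in chunk.adaptive_chunked_choosers(
        whale, choosers_df, chunk_size, chunk_tag, trace_label
    ):
        choices = make_choices(chooser_chunk)  # hypothetical per-chunk computation
        result_list.append(choices)

        # intermediate frames are logged through the yielded chunk_sizer
        # rather than the module-level chunk.log_df
        chunk_sizer.log_df(trace_label, "result_list", result_list)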
whale.filesystem.read_model_spec(file_name=model_settings["SPECIFICATION"]) trips_merged_df = trips_merged.to_frame() # add tour-based chunk_id so we can chunk all trips in tour together diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 7361a3bdd..ef9f69e93 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -247,7 +247,7 @@ def choose_MAZ_for_TAZ( taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True) - trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = whale.settings.trace_hh_id have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -571,7 +571,7 @@ def trip_destination_sample( assert len(alternatives) > 0 # by default, enable presampling for multizone systems, unless they disable it in settings file - network_los = inject.get_injectable("network_los") + network_los = whale.get_injectable("network_los") pre_sample_taz = network_los.zone_system != los.ONE_ZONE if pre_sample_taz and not whale.settings.want_dest_choice_presampling: pre_sample_taz = False @@ -699,7 +699,7 @@ def compute_logsums( chunk_tag = "trip_destination.compute_logsums" # FIXME should pass this in? - network_los = inject.get_injectable("network_los") + network_los = whale.get_injectable("network_los") # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( @@ -719,13 +719,17 @@ def compute_logsums( ).set_index("trip_id") assert choosers.index.equals(destination_sample.index) - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) - coefficients = simulate.get_segment_coefficients(logsum_settings, primary_purpose) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) + coefficients = whale.filesystem.get_segment_coefficients( + logsum_settings, primary_purpose + ) nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) - logsum_spec = simulate.read_model_spec(whale, file_name=logsum_settings["SPEC"]) + logsum_spec = whale.filesystem.read_model_spec(file_name=logsum_settings["SPEC"]) logsum_spec = simulate.eval_coefficients( whale, logsum_spec, coefficients, estimator=None ) @@ -1159,9 +1163,11 @@ def run_trip_destination( """ model_settings_file_name = "trip_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) preprocessor_settings = model_settings.get("preprocessor", None) - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None @@ -1172,8 +1178,8 @@ def run_trip_destination( ) land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") - network_los = inject.get_injectable("network_los") + size_terms = whale.get_injectable("size_terms") + network_los = whale.get_injectable("network_los") trips = trips.sort_index() trips["next_trip_id"] = np.roll(trips.index, -1) trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0) @@ -1405,7 +1411,7 @@ def trip_destination( 
trace_label = "trip_destination" model_settings_file_name = "trip_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) CLEANUP = model_settings.get("CLEANUP", True) fail_some_trips_for_testing = model_settings.get( @@ -1430,11 +1436,9 @@ def trip_destination( estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + whale.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(whale.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) diff --git a/activitysim/abm/models/trip_matrices.py b/activitysim/abm/models/trip_matrices.py index 683c3e108..5e02e01fd 100644 --- a/activitysim/abm/models/trip_matrices.py +++ b/activitysim/abm/models/trip_matrices.py @@ -42,14 +42,16 @@ def write_trip_matrices(whale: workflow.Whale, network_los): ) return - model_settings = config.read_model_settings("write_trip_matrices.yaml") + model_settings = whale.filesystem.read_model_settings("write_trip_matrices.yaml") trips_df = annotate_trips(whale, trips, network_los, model_settings) if bool(model_settings.get("SAVE_TRIPS_TABLE")): whale.add_table("trips", trips_df) if "parking_location" in whale.settings.models: - parking_settings = config.read_model_settings("parking_location_choice.yaml") + parking_settings = whale.filesystem.read_model_settings( + "parking_location_choice.yaml" + ) parking_taz_col_name = parking_settings["ALT_DEST_COL_NAME"] if parking_taz_col_name in trips_df: # TODO make parking zone negative, not zero, if not used @@ -300,7 +302,7 @@ def write_matrices( if matrix_is_tap == is_tap: # only write tap matrices to tap matrix files filename = matrix.get("file_name") - filepath = config.output_file_path(filename) + filepath = whale.get_output_file_path(filename) logger.info("opening %s" % filepath) file = omx.open_file(filepath, "w") # possibly overwrite existing file table_settings = matrix.get("tables") diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index dc710f64a..e9cc2ab4e 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -38,7 +38,7 @@ def trip_mode_choice( trace_label = "trip_mode_choice" model_settings_file_name = "trip_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "trip_mode" @@ -161,7 +161,7 @@ def trip_mode_choice( estimator.write_spec(model_settings) estimator.write_model_settings(model_settings, model_settings_file_name) - model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) nest_spec = config.get_logit_model_settings(model_settings) choices_list = [] @@ -183,7 +183,7 @@ def trip_mode_choice( tvpb_logsum_odt.extend_trace_label(primary_purpose) # tvpb_logsum_dot.extend_trace_label(primary_purpose) - coefficients = 
simulate.get_segment_coefficients( + coefficients = whale.filesystem.get_segment_coefficients( model_settings, primary_purpose ) diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 1ece7f8d0..6fbfa3feb 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -101,7 +101,7 @@ def choose_intermediate_trip_purpose( ] # join to persons for better diagnostics - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") persons_cols = [ "age", "is_worker", @@ -186,7 +186,7 @@ def run_trip_purpose( chunk_tag = "trip_purpose" model_settings_file_name = "trip_purpose.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) probs_join_cols = model_settings.get("probs_join_cols", PROBS_JOIN_COLUMNS) @@ -196,7 +196,7 @@ def run_trip_purpose( ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation - # coefficients_df = simulate.read_model_coefficients(model_settings) + # coefficients_df = whale.filesystem.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) if estimator: @@ -237,7 +237,12 @@ def run_trip_purpose( use_depart_time = model_settings.get("use_depart_time", True) - for i, trips_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + for ( + i, + trips_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( whale, trips_df, chunk_size, chunk_tag, trace_label ): choices = choose_intermediate_trip_purpose( diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index 072baf689..c9ea74060 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -56,10 +56,12 @@ def trip_purpose_and_destination( whale: workflow.Whale, trips, tours_merged, chunk_size, trace_hh_id ): trace_label = "trip_purpose_and_destination" - model_settings = config.read_model_settings("trip_purpose_and_destination.yaml") + model_settings = whale.filesystem.read_model_settings( + "trip_purpose_and_destination.yaml" + ) # for consistency, read sample_table_name setting from trip_destination settings file - trip_destination_model_settings = config.read_model_settings( + trip_destination_model_settings = whale.filesystem.read_model_settings( "trip_destination.yaml" ) sample_table_name = trip_destination_model_settings.get( diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 080da2c01..80c75770f 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -405,7 +405,7 @@ def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id """ trace_label = "trip_scheduling" model_settings_file_name = "trip_scheduling.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trips_df = trips.to_frame() tours = tours.to_frame() @@ -446,7 +446,7 @@ def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it 
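Segmented mode-choice and logsum models follow a common pattern that these hunks re-route through the whale: per-purpose coefficients come from whale.filesystem.get_segment_coefficients(...) and are folded into both the nest spec and the utility spec. A hedged sketch mirroring the compute_logsums and trip_mode_choice changes, assuming model_settings, a primary_purpose string, and a trace_label are in scope:

    from activitysim.core import config, simulate

    coefficients = whale.filesystem.get_segment_coefficients(model_settings, primary_purpose)

    nest_spec = config.get_logit_model_settings(model_settings)
    nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label)

    spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"])
    spec = simulate.eval_coefficients(whale, spec, coefficients, estimator=None)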
isn't clear that named coefficients would be helpful if we had some form of estimation - # coefficients_df = simulate.read_model_coefficients(model_settings) + # coefficients_df = whale.filesystem.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) # add tour-based chunk_id so we can chunk all trips in tour together diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index d7fbb986f..221560b7d 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -196,7 +196,7 @@ def get_spec_for_segment(model_settings, spec_name, segment): :return: array of utility equations """ - omnibus_spec = simulate.read_model_spec(file_name=model_settings[spec_name]) + omnibus_spec = whale.filesystem.read_model_spec(file_name=model_settings[spec_name]) spec = omnibus_spec[[segment]] @@ -327,7 +327,7 @@ def trip_scheduling_choice( whale: workflow.Whale, trips, tours, skim_dict, chunk_size, trace_hh_id ): trace_label = "trip_scheduling_choice" - model_settings = config.read_model_settings("trip_scheduling_choice.yaml") + model_settings = whale.filesystem.read_model_settings("trip_scheduling_choice.yaml") spec = get_spec_for_segment(model_settings, "SPECIFICATION", "stage_one") trips_df = trips.to_frame() diff --git a/activitysim/abm/models/util/canonical_ids.py b/activitysim/abm/models/util/canonical_ids.py index ca95169c4..6bb2c3570 100644 --- a/activitysim/abm/models/util/canonical_ids.py +++ b/activitysim/abm/models/util/canonical_ids.py @@ -218,7 +218,9 @@ def canonical_tours(whale: workflow.Whale): # ---- non_mandatory_channels nm_model_settings_file_name = "non_mandatory_tour_frequency.yaml" - nm_model_settings = config.read_model_settings(nm_model_settings_file_name) + nm_model_settings = whale.filesystem.read_model_settings( + nm_model_settings_file_name + ) nm_alts = read_alts_file(whale, "non_mandatory_tour_frequency_alternatives.csv") # first need to determine max extension @@ -255,7 +257,9 @@ def canonical_tours(whale: workflow.Whale): # ---- mandatory_channels mtf_model_settings_file_name = "mandatory_tour_frequency.yaml" - mtf_model_settings = config.read_model_settings(mtf_model_settings_file_name) + mtf_model_settings = whale.filesystem.read_model_settings( + mtf_model_settings_file_name + ) mtf_spec = mtf_model_settings.get("SPEC", "mandatory_tour_frequency.csv") mtf_model_spec = read_alts_file(whale, file_name=mtf_spec) default_mandatory_tour_flavors = {"work": 2, "school": 2} @@ -269,7 +273,9 @@ def canonical_tours(whale: workflow.Whale): # ---- atwork_subtour_channels atwork_model_settings_file_name = "atwork_subtour_frequency.yaml" - atwork_model_settings = config.read_model_settings(atwork_model_settings_file_name) + atwork_model_settings = whale.filesystem.read_model_settings( + atwork_model_settings_file_name + ) atwork_alts = read_alts_file(whale, "atwork_subtour_frequency_alternatives.csv") provided_atwork_flavors = atwork_model_settings.get("ATWORK_SUBTOUR_FLAVORS", None) @@ -293,7 +299,9 @@ def canonical_tours(whale: workflow.Whale): # ---- joint_tour_channels jtf_model_settings_file_name = "joint_tour_frequency.yaml" - jtf_model_settings = config.read_model_settings(jtf_model_settings_file_name) + jtf_model_settings = whale.filesystem.read_model_settings( + jtf_model_settings_file_name + ) jtf_alts = read_alts_file(whale, "joint_tour_frequency_alternatives.csv") provided_joint_flavors = 
jtf_model_settings.get("JOINT_TOUR_FLAVORS", None) @@ -325,7 +333,9 @@ def canonical_tours(whale: workflow.Whale): "school_escorting" in whale.settings.models ): se_model_settings_file_name = "school_escorting.yaml" - se_model_settings = config.read_model_settings(se_model_settings_file_name) + se_model_settings = whale.filesystem.read_model_settings( + se_model_settings_file_name + ) num_escortees = se_model_settings.get("NUM_ESCORTEES", 3) school_escort_flavors = {"escort": 2 * num_escortees} school_escort_channels = enumerate_tour_types(school_escort_flavors) @@ -340,7 +350,11 @@ def canonical_tours(whale: workflow.Whale): def set_tour_index( - tours, parent_tour_num_col=None, is_joint=False, is_school_escorting=False + whale: workflow.Whale, + tours, + parent_tour_num_col=None, + is_joint=False, + is_school_escorting=False, ): """ The new index values are stable based on the person_id, tour_type, and tour_num. @@ -360,7 +374,7 @@ def set_tour_index( """ tour_num_col = "tour_type_num" - possible_tours = canonical_tours() + possible_tours = canonical_tours(whale) possible_tours_count = len(possible_tours) assert tour_num_col in tours.columns @@ -416,7 +430,7 @@ def set_tour_index( def determine_max_trips_per_leg(whale: workflow.Whale, default_max_trips_per_leg=4): model_settings_file_name = "stop_frequency.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) # first see if flavors given explicitly provided_max_trips_per_leg = model_settings.get("MAX_TRIPS_PER_LEG", None) diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index d10945091..1310fb012 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -29,7 +29,6 @@ def set_hh_index(df): - # index on household_id, not person_id df.set_index(_hh_id_, inplace=True) df.index.name = _hh_index_ @@ -273,11 +272,10 @@ def cached_spec_name(hhsize): return "cdap_spec_%s" % hhsize -def get_cached_spec(hhsize): - +def get_cached_spec(whale: workflow.Whale, hhsize): spec_name = cached_spec_name(hhsize) - spec = inject.get_injectable(spec_name, None) + spec = whale.get_injectable(spec_name, None) if spec is not None: logger.debug("build_cdap_spec returning cached injectable spec %s", spec_name) return spec @@ -287,22 +285,27 @@ def get_cached_spec(hhsize): # cached spec will be available as an injectable to subsequent chunks # # try data dir - # if os.path.exists(config.output_file_path(spec_name)): - # spec_path = config.output_file_path(spec_name) + # if os.path.exists(whale.get_output_file_path(spec_name)): + # spec_path = whale.get_output_file_path(spec_name) # logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path) # return pd.read_csv(spec_path, index_col='Expression') return None -def cache_spec(hhsize, spec): +def cache_spec(whale: workflow.Whale, hhsize, spec): spec_name = cached_spec_name(hhsize) # cache as injectable - inject.add_injectable(spec_name, spec) + whale.add_injectable(spec_name, spec) def build_cdap_spec( - interaction_coefficients, hhsize, trace_spec=False, trace_label=None, cache=True + whale: workflow.Whale, + interaction_coefficients, + hhsize, + trace_spec=False, + trace_label=None, + cache=True, ): """ Build a spec file for computing utilities of alternative household member interaction patterns @@ -359,7 +362,7 @@ def build_cdap_spec( hhsize = min(hhsize, MAX_HHSIZE) if cache: - spec = get_cached_spec(hhsize) + 
spec = get_cached_spec(whale, hhsize) if spec is not None: return spec @@ -383,7 +386,6 @@ def build_cdap_spec( # N_p1 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 for pnum in range(1, hhsize + 1): for activity in ["M", "N", "H"]: - new_row_index = len(spec) spec.loc[new_row_index, expression_name] = add_pn(activity, pnum) @@ -399,10 +401,8 @@ def build_cdap_spec( # for each row in the interaction_coefficients table for row in interaction_coefficients[relevant_rows].itertuples(): - # if it is a wildcard all_people interaction if not row.interaction_ptypes: - # wildcard interactions only apply if the interaction includes all household members # this will be the case if the cardinality of the wildcard equals the hhsize # conveniently, the slug is given the name of the alternative column (e.g. HHHH) @@ -426,7 +426,6 @@ def build_cdap_spec( # possible combination of interacting persons # e.g. for (1, 2), (1,3), (2,3) for a coefficient with cardinality 2 in hhsize 3 for tup in itertools.combinations(list(range(1, hhsize + 1)), row.cardinality): - # determine the name of the chooser column with the ptypes for this interaction if row.cardinality == 1: interaction_column = "ptype_p%d" % tup[0] @@ -489,7 +488,7 @@ def build_cdap_spec( ) if cache: - cache_spec(hhsize, spec) + cache_spec(whale, hhsize, spec) t0 = tracing.print_elapsed_time("build_cdap_spec hh_size %s" % hhsize, t0) @@ -610,7 +609,6 @@ def hh_choosers(indiv_utils, hhsize): # for each of the higher cdap_ranks for pnum in range(2, hhsize + 1): - # df with merge columns for indiv with cdap_rank of pnum rhs = indiv_utils.loc[ include_households & (indiv_utils["cdap_rank"] == pnum), merge_cols @@ -682,10 +680,10 @@ def household_activity_choices( # index on household_id, not person_id set_hh_index(utils) else: - choosers = hh_choosers(indiv_utils, hhsize=hhsize) spec = build_cdap_spec( + whale, interaction_coefficients, hhsize, trace_spec=(trace_hh_id in choosers.index), @@ -709,7 +707,6 @@ def household_activity_choices( choices = pd.Series(utils.columns[idx_choices].values, index=utils.index) if trace_hh_id: - if hhsize > 1: tracing.trace_df( choosers, @@ -789,7 +786,12 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, trace_hh_id, trace_l def extra_hh_member_choices( - persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label + whale: workflow.Whale, + persons, + cdap_fixed_relative_proportions, + locals_d, + trace_hh_id, + trace_label, ): """ Generate the activity choices for the 'extra' household members who weren't handled by cdap @@ -830,7 +832,7 @@ def extra_hh_member_choices( # eval the expression file values = simulate.eval_variables( - cdap_fixed_relative_proportions.index, choosers, locals_d + whale, cdap_fixed_relative_proportions.index, choosers, locals_d ) # cdap_fixed_relative_proportions computes relative proportions by ptype, not utilities @@ -893,7 +895,7 @@ def _run_cdap( trace_label, *, chunk_sizer, -): +) -> pd.DataFrame: """ Implements core run_cdap functionality on persons df (or chunked subset thereof) Aside from chunking of persons df, params are passed through from run_cdap unchanged @@ -924,19 +926,20 @@ def _run_cdap( trace_label, chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "indiv_utils", indiv_utils) + chunk_sizer.log_df(trace_label, "indiv_utils", indiv_utils) # compute interaction utilities, probabilities, and hh activity pattern choices # for each size household separately in turn up to MAX_HHSIZE hh_choices_list = [] for hhsize in range(1, MAX_HHSIZE + 1): - choices = 
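In the cdap utilities above, the cached interaction spec moves off the global injectable registry and onto the whale: get_cached_spec and cache_spec now call whale.get_injectable(...) and whale.add_injectable(...). A minimal sketch of that caching idiom under the same assumptions (the key name below is illustrative only):

    def get_cached_example(whale, key="example_cached_spec"):
        # returns None if nothing has been cached on this whale yet
        return whale.get_injectable(key, None)

    def cache_example(whale, spec, key="example_cached_spec"):
        # later chunks and later model steps can reuse the same object
        whale.add_injectable(key, spec)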
household_activity_choices( + whale, indiv_utils, interaction_coefficients, hhsize=hhsize, trace_hh_id=trace_hh_id, trace_label=trace_label, + chunk_sizer=chunk_sizer, ) hh_choices_list.append(choices) @@ -957,7 +960,12 @@ def _run_cdap( # assign activities to extra household members (with cdap_rank > MAX_HHSIZE) # resulting series contains one activity per individual hh member, indexed on _persons_index_ extra_person_choices = extra_hh_member_choices( - persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label + whale, + persons, + cdap_fixed_relative_proportions, + locals_d, + trace_hh_id, + trace_label, ) # concat cdap and extra persoin choices into a single series @@ -1039,8 +1047,9 @@ def run_cdap( persons_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): - + ) in chunk.adaptive_chunked_choosers_by_chunk_id( + whale, persons, chunk_size, trace_label + ): cdap_results = _run_cdap( whale, persons_chunk, @@ -1056,7 +1065,7 @@ def run_cdap( result_list.append(cdap_results) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -1065,7 +1074,6 @@ def run_cdap( cdap_results = pd.concat(result_list) if trace_hh_id: - tracing.trace_df( cdap_results, label="cdap", diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/abm/models/util/estimation.py index 7d02457b3..91a51142e 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/abm/models/util/estimation.py @@ -280,7 +280,9 @@ def write_coefficients( assert file_name is not None if coefficients_df is None: - coefficients_df = simulate.read_model_coefficients(file_name=file_name) + coefficients_df = whale.filesystem.read_model_coefficients( + file_name=file_name + ) # preserve original config file name base_file_name = os.path.basename(file_name) diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py index e4aff206f..7eaa2b5ac 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -128,9 +128,9 @@ def compute_logsums( else: logger.error("Choosers table already has column 'duration'.") - logsum_spec = simulate.read_model_spec(whale, file_name=logsum_settings["SPEC"]) - coefficients = simulate.get_segment_coefficients( - whale, logsum_settings, tour_purpose + logsum_spec = whale.filesystem.read_model_spec(file_name=logsum_settings["SPEC"]) + coefficients = whale.filesystem.get_segment_coefficients( + logsum_settings, tour_purpose ) logsum_spec = simulate.eval_coefficients( diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index f1455f0ae..c85eb13a6 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -105,8 +105,10 @@ def run_tour_mode_choice_simulate( you want to use in the evaluation of variables. 
""" - spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients = simulate.get_segment_coefficients(model_settings, tour_purpose) + spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients = whale.filesystem.get_segment_coefficients( + model_settings, tour_purpose + ) spec = simulate.eval_coefficients(whale, spec, coefficients, estimator) diff --git a/activitysim/abm/models/util/overlap.py b/activitysim/abm/models/util/overlap.py index 70fadfbd4..a014b7b15 100644 --- a/activitysim/abm/models/util/overlap.py +++ b/activitysim/abm/models/util/overlap.py @@ -103,7 +103,7 @@ def p2p_time_window_overlap(p1_ids, p2_ids): """ - timetable = inject.get_injectable("timetable") + timetable = whale.get_injectable("timetable") assert len(p1_ids) == len(p2_ids) # if series, ought to have same index @@ -223,7 +223,7 @@ def person_time_window_overlap(persons): def person_max_window(persons): - timetable = inject.get_injectable("timetable") + timetable = whale.get_injectable("timetable") # ndarray with one row per person and one column per time period # array value of 1 where free periods and 0 elsewhere diff --git a/activitysim/abm/models/util/school_escort_tours_trips.py b/activitysim/abm/models/util/school_escort_tours_trips.py index 3c72d0175..20acfca61 100644 --- a/activitysim/abm/models/util/school_escort_tours_trips.py +++ b/activitysim/abm/models/util/school_escort_tours_trips.py @@ -376,7 +376,7 @@ def process_tours_after_escorting_model(escort_bundles, tours): tours.loc[bad_end_times, "end"] = tours.loc[bad_end_times, "start"] # updating tdd to match start and end times - tdd_alts = inject.get_injectable("tdd_alts") + tdd_alts = whale.get_injectable("tdd_alts") tdd_alts["tdd"] = tdd_alts.index tours.drop(columns="tdd", inplace=True) @@ -582,7 +582,7 @@ def create_pure_school_escort_tours(bundles): pe_tours["tour_num"] = grouped.cumcount() + 1 pe_tours["tour_count"] = pe_tours["tour_num"] + grouped.cumcount(ascending=False) - pe_tours = canonical_ids.set_tour_index(pe_tours, is_school_escorting=True) + pe_tours = canonical_ids.set_tour_index(whale, pe_tours, is_school_escorting=True) return pe_tours @@ -601,7 +601,7 @@ def force_escortee_tour_modes_to_match_chauffeur(tours): # FIXME: escortee tour can have different chauffeur in outbound vs inbound direction # which tour mode should it be set to? Currently it's whatever comes last. # Does it even matter if trip modes are getting matched later? 
- escort_bundles = inject.get_table("escort_bundles").to_frame() + escort_bundles = whale.get_dataframe("escort_bundles") # grabbing the school tour ids for each school escort bundle se_tours = escort_bundles[["school_tour_ids", "chauf_tour_id"]].copy() @@ -629,7 +629,7 @@ def force_escortee_tour_modes_to_match_chauffeur(tours): def force_escortee_trip_modes_to_match_chauffeur(trips): - school_escort_trips = inject.get_table("school_escort_trips").to_frame() + school_escort_trips = whale.get_dataframe("school_escort_trips") # starting with only trips that are created as part of the school escorting model se_trips = trips[trips.index.isin(school_escort_trips.index)].copy() diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index fd195d8f9..3e2b25eed 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -77,7 +77,7 @@ def test_assign_cdap_rank(people, model_settings): def test_individual_utilities(people, model_settings): - cdap_indiv_and_hhsize1 = simulate.read_model_spec( + cdap_indiv_and_hhsize1 = whale.filesystem.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) @@ -131,7 +131,7 @@ def test_individual_utilities(people, model_settings): def test_build_cdap_spec_hhsize2(whale: workflow.Whale, people, model_settings): hhsize = 2 - cdap_indiv_and_hhsize1 = simulate.read_model_spec( + cdap_indiv_and_hhsize1 = whale.filesystem.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) @@ -160,14 +160,14 @@ def test_build_cdap_spec_hhsize2(whale: workflow.Whale, people, model_settings): choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) spec = cdap.build_cdap_spec( - interaction_coefficients, hhsize=hhsize, cache=False + whale, interaction_coefficients, hhsize=hhsize, cache=False ) # pandas.dot depends on column names of expression_values matching spec index values # expressions should have been uniquified when spec was read assert spec.index.is_unique - vars = simulate.eval_variables(spec.index, choosers) + vars = simulate.eval_variables(whale, spec.index, choosers) assert (spec.index.values == vars.columns.values).all() # spec = spec.astype(np.float64) diff --git a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py index 3f0144608..99570da2d 100644 --- a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py @@ -53,7 +53,9 @@ def test_mtf(): tour_frequency_alternatives = mandatory_tour_frequency_alternatives() - mandatory_tours = process_mandatory_tours(persons, tour_frequency_alternatives) + mandatory_tours = process_mandatory_tours( + whale, persons, tour_frequency_alternatives + ) idx = mandatory_tours.index diff --git a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py index 96fc4ee4c..940562732 100644 --- a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py @@ -8,7 +8,7 @@ import pandas.testing as pdt import pytest -from activitysim.core import inject +from activitysim.core import inject, workflow from ..vectorize_tour_scheduling import ( get_previous_tour_by_tourid, @@ -27,7 +27,7 @@ def setup_function(): def test_vts(): - + whale = workflow.Whale() inject.add_injectable("settings", {}) # note: need 0 duration tour on one end 
of day to guarantee at least one available tour @@ -70,9 +70,10 @@ def test_vts(): inject.add_injectable("check_for_variability", True) - timetable = inject.get_injectable("timetable") + timetable = whale.get_injectable("timetable") tdd_choices = vectorize_tour_scheduling( + whale, tours, persons, alts, diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index 8b3f8b931..0ec399cb8 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -26,7 +26,7 @@ class SizeTermCalculator(object): def __init__(self, size_term_selector): # do this once so they can request size_terms for various segments (tour_type or purpose) land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + size_terms = whale.get_injectable("size_terms") self.destination_size_terms = tour_destination_size_terms( land_use, size_terms, size_term_selector ) @@ -251,7 +251,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ # 542963 53 0.004224 2 13243 # 542963 59 0.008628 1 13243 - trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = whale.settings.trace_hh_id have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -628,7 +628,9 @@ def run_destination_logsums( +-----------+--------------+----------------+------------+----------------+ """ - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) # if special person id is passed chooser_id_column = model_settings.get("CHOOSER_ID_COLUMN", "person_id") diff --git a/activitysim/abm/models/util/tour_frequency.py b/activitysim/abm/models/util/tour_frequency.py index 37108b564..f527bc488 100644 --- a/activitysim/abm/models/util/tour_frequency.py +++ b/activitysim/abm/models/util/tour_frequency.py @@ -6,6 +6,7 @@ import pandas as pd from activitysim.abm.models.util.canonical_ids import set_tour_index +from activitysim.core import workflow from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -167,7 +168,9 @@ def process_tours( return tours -def process_mandatory_tours(persons, mandatory_tour_frequency_alts): +def process_mandatory_tours( + whale: workflow.Whale, persons, mandatory_tour_frequency_alts +): """ This method processes the mandatory_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that represents the @@ -238,7 +241,7 @@ def process_mandatory_tours(persons, mandatory_tour_frequency_alts): tours["household_id"] = tours_merged.household_id # assign stable (predictable) tour_id - set_tour_index(tours) + set_tour_index(whale, tours) """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -291,7 +294,7 @@ def process_non_mandatory_tours(persons, tour_counts): tours["origin"] = reindex(persons.home_zone_id, tours.person_id) # assign stable (predictable) tour_id - set_tour_index(tours) + set_tour_index(whale, tours) """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -377,7 +380,7 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): tours = pd.merge(tours, work_tours, left_on=parent_col, right_index=True) # assign stable (predictable) tour_id - set_tour_index(tours, 
parent_tour_num_col="parent_tour_num") + set_tour_index(whale, tours, parent_tour_num_col="parent_tour_num") """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -442,7 +445,7 @@ def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_p tours["origin"] = reindex(point_persons.home_zone_id, tours.household_id) # assign stable (predictable) tour_id - set_tour_index(tours, is_joint=True) + set_tour_index(whale, tours, is_joint=True) """ household_id tour_type tour_type_count tour_type_num tour_num tour_count diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 41cf80f6b..68434a501 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -342,7 +342,7 @@ def choose_MAZ_for_TAZ( # 542963 53 0.004224 2 13243 # 542963 59 0.008628 1 13243 - trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = whale.settings.trace_hh_id have_trace_targets = trace_hh_id and tracing.has_trace_targets(whale, taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -648,7 +648,7 @@ def __init__(self, size_term_selector): # do this once so they can request size_terms for various segments (tour_type or purpose) land_use = inject.get_table("land_use") self.land_use = land_use - size_terms = inject.get_injectable("size_terms") + size_terms = whale.get_injectable("size_terms") self.destination_size_terms = tour_destination_size_terms( self.land_use, size_terms, size_term_selector ) @@ -757,6 +757,7 @@ def run_od_sample( def run_od_logsums( + whale: workflow.Whale, spec_segment_name, tours_merged_df, od_sample, @@ -774,7 +775,9 @@ def run_od_logsums( (person, OD_id) pair in od_sample, and computing the logsum of all the utilities """ chunk_tag = "tour_od.logsums" - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) origin_id_col = model_settings["ORIG_COL_NAME"] dest_id_col = model_settings["DEST_COL_NAME"] tour_od_id_col = get_od_id_col(origin_id_col, dest_id_col) @@ -797,7 +800,9 @@ def run_od_logsums( # run trip mode choice to compute tour mode choice logsums if logsum_settings.get("COMPUTE_TRIP_MODE_CHOICE_LOGSUMS", False): pseudo_tours = choosers.copy() - trip_mode_choice_settings = config.read_model_settings("trip_mode_choice") + trip_mode_choice_settings = whale.filesystem.read_model_settings( + "trip_mode_choice" + ) # tours_merged table doesn't yet have all the cols it needs to be called (e.g. # home_zone_id), so in order to compute tour mode choice/trip mode choice logsums @@ -826,7 +831,7 @@ def run_od_logsums( # tour dest as separate column in the trips table bc the trip mode choice # preprocessor isn't able to get the tour dest from the tours table bc the # tours don't yet have ODs. 
- stop_frequency_alts = inject.get_injectable("stop_frequency_alts") + stop_frequency_alts = whale.get_injectable("stop_frequency_alts") pseudo_tours["tour_destination"] = pseudo_tours[dest_id_col] trips = trip.initialize_from_tours( pseudo_tours, @@ -843,7 +848,7 @@ def run_od_logsums( nest_spec = config.get_logit_model_settings(logsum_settings) # actual coeffs dont matter here, just need them to load the nest structure - coefficients = simulate.get_segment_coefficients( + coefficients = whale.filesystem.get_segment_coefficients( logsum_settings, pseudo_tours.iloc[0]["tour_purpose"] ) nest_spec = simulate.eval_nest_coefficients( @@ -866,7 +871,7 @@ def run_od_logsums( logsum_trips[col] = reindex(pseudo_tours[col], logsum_trips.unique_id) whale.add_table("trips", logsum_trips) - tracing.register_traceable_table("trips", logsum_trips) + tracing.register_traceable_table(whale, "trips", logsum_trips) whale.get_rn_generator().add_channel("trips", logsum_trips) # run trip mode choice on pseudo-trips. use orca instead of pipeline to @@ -875,7 +880,7 @@ def run_od_logsums( # grab trip mode choice logsums and pivot by tour mode and direction, index # on tour_id to enable merge back to choosers table - trips = inject.get_table("trips").to_frame() + trips = whale.get_dataframe("trips") trip_dir_mode_logsums = trips.pivot( index=["tour_id", tour_od_id_col], columns=["tour_mode", "outbound"], @@ -896,7 +901,7 @@ def run_od_logsums( choosers.set_index(choosers_og_index, inplace=True) whale.get_rn_generator().drop_channel("trips") - tracing.deregister_traceable_table("trips") + tracing.deregister_traceable_table(whale, "trips") assert (od_sample.index == choosers.index).all() for col in new_cols: @@ -1122,6 +1127,7 @@ def run_tour_od( # - destination_logsums od_sample_df = run_od_logsums( + whale, spec_segment_name, choosers, od_sample_df, diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index 11ab76671..f3c0d5b07 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -23,10 +23,12 @@ def run_tour_scheduling( trace_label = model_name model_settings_file_name = f"{model_name}.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) if "LOGSUM_SETTINGS" in model_settings: - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) logsum_columns = logsum_settings.get("LOGSUM_CHOOSER_COLUMNS", []) else: logsum_columns = [] @@ -39,7 +41,7 @@ def run_tour_scheduling( persons_merged = expressions.filter_chooser_columns(persons_merged, chooser_columns) - timetable = inject.get_injectable("timetable") + timetable = whale.get_injectable("timetable") # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("preprocessor", None) @@ -70,8 +72,8 @@ def run_tour_scheduling( ) spec_file_name = spec_settings["SPEC"] - model_spec = simulate.read_model_spec(file_name=spec_file_name) - coefficients_df = simulate.read_model_coefficients(spec_settings) + model_spec = whale.filesystem.read_model_spec(file_name=spec_file_name) + coefficients_df = whale.filesystem.read_model_coefficients(spec_settings) specs[spec_segment_name] = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) @@ -110,9 +112,9 @@ def run_tour_scheduling( estimator = 
estimation.manager.begin_estimation(whale, model_name) spec_file_name = model_settings["SPEC"] - model_spec = simulate.read_model_spec(file_name=spec_file_name) + model_spec = whale.filesystem.read_model_spec(file_name=spec_file_name) sharrow_skip = model_settings.get("sharrow_skip", False) - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) @@ -135,6 +137,7 @@ def run_tour_scheduling( logger.info(f"Running {model_name} with %d tours", len(chooser_tours)) choices = vts.vectorize_tour_scheduling( + whale, chooser_tours, persons_merged, tdd_alts, diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 23ad3224a..d81612bf2 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -19,11 +19,10 @@ RUN_ALTS_PREPROCESSOR_BEFORE_MERGE = True # see FIXME below before changing this -def skims_for_logsums(tour_purpose, model_settings, trace_label): - +def skims_for_logsums(whale: workflow.Whale, tour_purpose, model_settings, trace_label): assert "LOGSUM_SETTINGS" in model_settings - network_los = inject.get_injectable("network_los") + network_los = whale.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() @@ -108,7 +107,9 @@ def _compute_logsums( trace_label = tracing.extend_trace_label(trace_label, "logsums") with chunk.chunk_log(trace_label, settings=whale.settings): - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = whale.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) choosers = alt_tdd.join(tours_merged, how="left", rsuffix="_chooser") logger.info( f"{trace_label} compute_logsums for {choosers.shape[0]} choosers {alt_tdd.shape[0]} alts" @@ -128,7 +129,9 @@ def _compute_logsums( locals_dict.update(skims) # constrained coefficients can appear in expressions - coefficients = simulate.get_segment_coefficients(logsum_settings, tour_purpose) + coefficients = whale.filesystem.get_segment_coefficients( + logsum_settings, tour_purpose + ) locals_dict.update(coefficients) # - run preprocessor to annotate choosers @@ -137,7 +140,6 @@ def _compute_logsums( preprocessor_settings = logsum_settings[preprocessor] if preprocessor_settings: - simulate.set_skim_wrapper_targets(choosers, skims) expressions.assign_columns( @@ -149,7 +151,9 @@ def _compute_logsums( ) # - compute logsums - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) + logsum_spec = whale.filesystem.read_model_spec( + file_name=logsum_settings["SPEC"] + ) logsum_spec = simulate.eval_coefficients( whale, logsum_spec, coefficients, estimator=None ) @@ -174,8 +178,7 @@ def _compute_logsums( def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): - - tdd_segments = inject.get_injectable("tdd_alt_segments", None) + tdd_segments = whale.get_injectable("tdd_alt_segments", None) alt_tdd_periods = None logger.info("tdd_alt_segments specified for representative logsums") @@ -183,15 +186,12 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): with chunk.chunk_log( tracing.extend_trace_label(trace_label, "dedupe_alt_tdd"), settings=whale.settings, - ): - + ) as chunk_sizer: if tdd_segments is not None: - dedupe_columns = ["out_period", "in_period"] # 
tdd_alt_segments is optionally segmented by tour purpose if "tour_purpose" in tdd_segments: - is_tdd_for_tour_purpose = tdd_segments.tour_purpose == tour_purpose if not is_tdd_for_tour_purpose.any(): is_tdd_for_tour_purpose = tdd_segments.tour_purpose.isnull() @@ -215,7 +215,7 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): how="left", on="out_period", ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # left join representative end on in_period alt_tdd_periods = pd.merge( @@ -226,7 +226,7 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): how="left", on=["in_period"], ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) if tdd_segments.start.isnull().any(): missing_periods = tdd_segments.out_period[ @@ -251,13 +251,13 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): alt_tdd_periods = alt_tdd_periods.drop_duplicates().set_index( alt_tdd.index.name ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # representative duration alt_tdd_periods["duration"] = ( alt_tdd_periods["end"] - alt_tdd_periods["start"] ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) logger.debug( f"{trace_label} " @@ -268,7 +268,6 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): # if there is no tdd_alt_segments file, we can at least dedupe on 'out_period', 'in_period', 'duration' if alt_tdd_periods is None: - # FIXME This won't work if they reference start or end in logsum calculations # for MTC only duration is used (to calculate all_day parking cost) dedupe_columns = ["out_period", "in_period", "duration"] @@ -286,7 +285,7 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): .drop_duplicates() .set_index(alt_tdd.index.name) ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) logger.debug( f"{trace_label} " @@ -298,8 +297,16 @@ def dedupe_alt_tdd(whale: workflow.Whale, alt_tdd, tour_purpose, trace_label): return alt_tdd_periods, dedupe_columns -def compute_logsums( - whale, alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label +def compute_tour_scheduling_logsums( + whale: workflow.Whale, + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + skims, + trace_label, + *, + chunk_sizer: chunk.ChunkSizer, ): """ Compute logsums for the tour alt_tdds, which will differ based on their different start, stop @@ -314,7 +321,7 @@ def compute_logsums( """ trace_label = tracing.extend_trace_label(trace_label, "compute_logsums") - network_los = inject.get_injectable("network_los") + network_los = whale.get_injectable("network_los") # - in_period and out_period assert "out_period" not in alt_tdd @@ -329,13 +336,13 @@ def compute_logsums( alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"] # outside chunk_log context because we extend log_df call for alt_tdd made by our only caller _schedule_tours - chunk.log_df(trace_label, "alt_tdd", alt_tdd) - - with chunk.chunk_log(trace_label, settings=whale.settings): + chunk_sizer.log_df(trace_label, "alt_tdd", alt_tdd) + with chunk.chunk_log(trace_label, settings=whale.settings) as 
chunk_sizer: if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS: # compute logsums for all the tour alt_tdds (inefficient) logsums = _compute_logsums( + whale, alt_tdd, tours_merged, tour_purpose, @@ -350,7 +357,7 @@ def compute_logsums( deduped_alt_tdds, redupe_columns = dedupe_alt_tdd( whale, alt_tdd, tour_purpose, trace_label ) - chunk.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) + chunk_sizer.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) logger.info( f"{trace_label} compute_logsums " @@ -363,6 +370,7 @@ def compute_logsums( # - compute logsums for the alt_tdd_periods deduped_alt_tdds["logsums"] = _compute_logsums( + whale, deduped_alt_tdds, tours_merged, tour_purpose, @@ -385,16 +393,17 @@ def compute_logsums( .set_index(index_name) .logsums ) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del deduped_alt_tdds - chunk.log_df(trace_label, "deduped_alt_tdds", None) + chunk_sizer.log_df(trace_label, "deduped_alt_tdds", None) # this is really expensive TRACE = False if TRACE: trace_logsums_df = logsums.to_frame("representative_logsum") trace_logsums_df["brute_force_logsum"] = _compute_logsums( + whale, alt_tdd, tours_merged, tour_purpose, @@ -410,7 +419,7 @@ def compute_logsums( transpose=False, ) - # leave it to our caller to pick up logsums with call to chunk.log_df + # leave it to our caller to pick up logsums with call to chunk_sizer.log_df return logsums @@ -491,13 +500,13 @@ def tdd_interaction_dataset( trace_label = tracing.extend_trace_label(trace_label, "tdd_interaction_dataset") - with chunk.chunk_log(trace_label, settings=whale.settings): + with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer: alts_ids = np.tile(alts.index, len(tours.index)) - chunk.log_df(trace_label, "alts_ids", alts_ids) + chunk_sizer.log_df(trace_label, "alts_ids", alts_ids) tour_ids = np.repeat(tours.index, len(alts.index)) window_row_ids = np.repeat(tours[window_id_col], len(alts.index)) - chunk.log_df(trace_label, "window_row_ids", window_row_ids) + chunk_sizer.log_df(trace_label, "window_row_ids", window_row_ids) alt_tdd = alts.take(alts_ids) @@ -520,20 +529,20 @@ def tdd_interaction_dataset( available = timetable.tour_available(window_row_ids, alts_ids) del window_row_ids - chunk.log_df(trace_label, "window_row_ids", None) + chunk_sizer.log_df(trace_label, "window_row_ids", None) logger.debug( f"tdd_interaction_dataset keeping {available.sum()} of ({len(available)}) available alt_tdds" ) assert available.any() - chunk.log_df( + chunk_sizer.log_df( trace_label, "alt_tdd_", alt_tdd_ ) # catch this before we slice on available alt_tdd = alt_tdd_.isel({dimname: available}).to_dataframe() - chunk.log_df(trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(trace_label, "alt_tdd", alt_tdd) # FIXME - don't need this any more after slicing # del alt_tdd[window_id_col] @@ -585,7 +594,6 @@ def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe preprocessor_settings = None if preprocessor_settings: - logger.debug( f"run_alts_preprocessor calling assign_columns for {segment} preprocessor_settings" ) @@ -618,6 +626,8 @@ def _schedule_tours( estimator, tour_trace_label, sharrow_skip=False, + *, + chunk_sizer: chunk.ChunkSizer, ): """ previous_tour stores values used to add columns that can be used in the spec @@ -676,7 +686,7 @@ def _schedule_tours( right_index=True, suffixes=("", "_y"), ) - chunk.log_df(tour_trace_label, "tours", tours) + chunk_sizer.log_df(tour_trace_label, "tours", tours) # - add 
explicit window_id_col for timetable owner if it is index # if no timetable window_id_col specified, then add index as an explicit column @@ -697,11 +707,11 @@ def _schedule_tours( ) # print(f"tours {tours.shape} alts {alts.shape}") - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - add logsums if logsum_tour_purpose: - logsums = compute_logsums( + logsums = compute_tour_scheduling_logsums( whale, alt_tdd, tours, @@ -709,20 +719,21 @@ def _schedule_tours( model_settings, skims, tour_trace_label, + chunk_sizer=chunk_sizer, ) else: logsums = 0 alt_tdd["mode_choice_logsum"] = logsums del logsums - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - merge in previous tour columns # adds start_previous and end_previous, joins on index tours = tours.join( get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts) ) - chunk.log_df(tour_trace_label, "tours", tours) + chunk_sizer.log_df(tour_trace_label, "tours", tours) # - make choices locals_d = {"tt": timetable} @@ -749,7 +760,7 @@ def _schedule_tours( alt_tdd = run_alts_preprocessor( model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label ) - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) if estimator: # write choosers after annotation @@ -771,7 +782,7 @@ def _schedule_tours( trace_label=tour_trace_label, estimator=estimator, ) - chunk.log_df(tour_trace_label, "choices", choices) + chunk_sizer.log_df(tour_trace_label, "choices", choices) # - update previous_tour and timetable parameters @@ -828,15 +839,21 @@ def schedule_tours( if "LOGSUM_SETTINGS" in model_settings: # we need skims to calculate tvpb skim overhead in 3_ZONE systems for use by calc_rows_per_chunk - skims = skims_for_logsums(logsum_tour_purpose, model_settings, tour_trace_label) + skims = skims_for_logsums( + whale, logsum_tour_purpose, model_settings, tour_trace_label + ) else: skims = None result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( whale, tours, chunk_size, tour_trace_label, tour_chunk_tag ): - choices = _schedule_tours( whale, chooser_chunk, @@ -853,11 +870,12 @@ def schedule_tours( estimator, tour_trace_label=chunk_trace_label, sharrow_skip=sharrow_skip, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(tour_trace_label, "result_list", result_list) + chunk_sizer.log_df(tour_trace_label, "result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -871,6 +889,7 @@ def schedule_tours( def vectorize_tour_scheduling( + whale: workflow.Whale, tours, persons_merged, alts, @@ -938,7 +957,7 @@ def vectorize_tour_scheduling( timetable_window_id_col = "person_id" tour_owner_id_col = "person_id" - compute_logsums = "LOGSUM_SETTINGS" in model_settings + should_compute_logsums = "LOGSUM_SETTINGS" in model_settings assert isinstance(tour_segments, dict) @@ -948,16 +967,13 @@ def vectorize_tour_scheduling( # segregate scheduling by tour_type if multiple specs passed in dict keyed by tour_type for tour_num, nth_tours in tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" ) if 
tour_segment_col is not None: - for tour_segment_name, tour_segment_info in tour_segments.items(): - segment_trace_label = tracing.extend_trace_label( tour_trace_label, tour_segment_name ) @@ -968,7 +984,9 @@ def vectorize_tour_scheduling( # assume segmentation of spec and coefficients are aligned spec_segment_name = tour_segment_info.get("spec_segment_name") # assume logsum segmentation is same as tours - logsum_tour_purpose = tour_segment_name if compute_logsums else None + logsum_tour_purpose = ( + tour_segment_name if should_compute_logsums else None + ) nth_tours_in_segment = nth_tours[ nth_tours[tour_segment_col] == tour_segment_name @@ -1009,12 +1027,11 @@ def vectorize_tour_scheduling( choice_list.append(choices) else: - # MTC non_mandatory_tours are not segmented by tour_purpose and do not require logsums # FIXME should support logsums? assert ( - not compute_logsums + not should_compute_logsums ), "logsums for unsegmented spec not implemented because not currently needed" assert tour_segments.get("spec_segment_name") is None @@ -1128,7 +1145,6 @@ def vectorize_subtour_scheduling( # this ought to have been ensured when tours are created (tour_frequency.process_tours) for tour_num, nth_tours in subtours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" @@ -1176,7 +1192,6 @@ def vectorize_subtour_scheduling( def build_joint_tour_timetables( joint_tours, joint_tour_participants, persons_timetable, alts ): - # timetable with a window for each joint tour joint_tour_windows_df = tt.create_timetable_windows(joint_tours, alts) joint_tour_timetable = tt.TimeTable(joint_tour_windows_df, alts) @@ -1184,7 +1199,6 @@ def build_joint_tour_timetables( for participant_num, nth_participants in joint_tour_participants.groupby( "participant_num", sort=True ): - # nth_participant windows from persons_timetable participant_windows = persons_timetable.slice_windows_by_row_id( nth_participants.person_id @@ -1271,7 +1285,6 @@ def vectorize_joint_tour_scheduling( # persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id) for tour_num, nth_tours in joint_tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index 73e88e3eb..379769345 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -110,14 +110,14 @@ def vehicle_allocation( """ trace_label = "vehicle_allocation" model_settings_file_name = "vehicle_allocation.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") estimator = estimation.manager.begin_estimation(whale, "vehicle_allocation") - model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec_raw = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec_raw, coefficients_df, 
estimator ) diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index f1378d8f5..699b6a7d0 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -103,7 +103,7 @@ def annotate_vehicle_type_choice_households( model_settings : dict trace_label : str """ - households = inject.get_table("households").to_frame() + households = whale.get_dataframe("households") expressions.assign_columns( whale, df=households, @@ -124,7 +124,7 @@ def annotate_vehicle_type_choice_persons( model_settings : dict trace_label : str """ - persons = inject.get_table("persons").to_frame() + persons = whale.get_dataframe("persons") expressions.assign_columns( whale, df=persons, @@ -145,7 +145,7 @@ def annotate_vehicle_type_choice_vehicles( model_settings : dict trace_label : str """ - vehicles = inject.get_table("vehicles").to_frame() + vehicles = whale.get_dataframe("vehicles") expressions.assign_columns( whale, df=vehicles, @@ -182,7 +182,9 @@ def get_combinatorial_vehicle_alternatives(alts_cats_dict): return alts_wide, alts_long -def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_data): +def construct_model_alternatives( + whale: workflow.Whale, model_settings, alts_cats_dict, vehicle_type_data +): """ Construct the table of vehicle type alternatives. @@ -232,7 +234,7 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da alts_wide["age"] = alts_wide["age"].astype(int) # store alts in primary configs dir for inspection - configs_dirs = inject.get_injectable("configs_dir") + configs_dirs = whale.filesystem.get_configs_dir() configs_dirs = configs_dirs if isinstance(configs_dirs, list) else [configs_dirs] if model_settings.get("WRITE_OUT_ALTS_FILE", False): @@ -516,12 +518,12 @@ def vehicle_type_choice( """ trace_label = "vehicle_type_choice" model_settings_file_name = "vehicle_type_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) estimator = estimation.manager.begin_estimation(whale, "vehicle_type") - model_spec_raw = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec_raw = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients( whale, model_spec_raw, coefficients_df, estimator ) diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index 33d0f8044..b353d8210 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -23,7 +23,7 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): model_settings_file_name = "work_from_home.yaml" choosers = persons_merged.to_frame() - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = whale.filesystem.read_model_settings(model_settings_file_name) chooser_filter_column_name = model_settings.get( "CHOOSER_FILTER_COLUMN_NAME", "is_worker" ) @@ -50,8 +50,8 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): trace_label=trace_label, ) - model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = 
whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) nest_spec = config.get_logit_model_settings(model_settings) @@ -85,7 +85,7 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): ) # re-read spec to reset substitution - model_spec = simulate.read_model_spec(whale, file_name=model_settings["SPEC"]) + model_spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) model_spec = simulate.eval_coefficients( whale, model_spec, coefficients_df, estimator ) diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index 83b87c13b..6e57f9b7f 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -47,7 +47,7 @@ def nearest_node(oz, zones_df): if method == "centroids": # Extract and vectorize TAZ centroids - centroids = inject.get_table("maz_centroids").to_frame() + centroids = whale.get_dataframe("maz_centroids") # TODO.NF This is a bit hacky, needs some work for variable zone names if "TAZ" in centroids.columns: @@ -65,7 +65,7 @@ def nearest_node(oz, zones_df): nearest = [nearest_node(Oz, _centroids.XY) for Oz in unmatched_zones] else: - skim_dict = inject.get_injectable("skim_dict") + skim_dict = whale.get_injectable("skim_dict") nearest = [nearest_skim(Oz, accessibility_zones) for Oz in unmatched_zones] # Add the nearest zones to the matched zones @@ -151,7 +151,9 @@ def disaggregate_accessibility(whale: workflow.Whale): ) # Extract model settings - model_settings = config.read_model_settings("disaggregate_accessibility.yaml") + model_settings = whale.filesystem.read_model_settings( + "disaggregate_accessibility.yaml" + ) merging_params = model_settings.get("MERGE_ON") nearest_method = model_settings.get("NEAREST_METHOD", "skims") accessibility_cols = [ diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index f150aa5d1..dd8ac88ef 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -118,11 +118,39 @@ def households(whale: workflow.Whale): # this is a common merge so might as well define it once here and use it -@inject.table() -def households_merged(households, land_use, accessibility): - return inject.merge_tables( - households.name, tables=[households, land_use, accessibility] +@workflow.temp_table +def households_merged( + whale: workflow.Whale, + households: pd.DataFrame, + land_use: pd.DataFrame, + accessibility: pd.DataFrame, +): + # land_use = whale.get_dataframe("land_use") + # households = whale.get_dataframe("households") + # accessibility = whale.get_dataframe("accessibility") + + def join(left, right, left_on): + intersection = set(left.columns).intersection(right.columns) + intersection.discard(left_on) # intersection is ok if it's the join key + right = right.drop(intersection, axis=1) + return pd.merge( + left, + right, + left_on=left_on, + right_index=True, + ) + + households = join( + households, + land_use, + left_on="home_zone_id", + ) + households = join( + households, + accessibility, + left_on="home_zone_id", ) + return households inject.broadcast("households", "persons", cast_index=True, onto_on="household_id") diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index 859b4118a..6ac3e6bb2 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -12,7 +12,6 @@ 
def read_raw_persons(whale, households): - df = read_input_table(whale, "persons") if whale.get_injectable("households_sliced", False): @@ -92,14 +91,20 @@ def persons(whale: workflow.Whale): # return inject.merge_tables(persons.name, tables=tables) -@workflow.table -def persons_merged(whale: workflow.Whale): - - land_use = whale.get_dataframe("land_use") - households = whale.get_dataframe("households") - accessibility = whale.get_dataframe("accessibility") - persons = whale.get_dataframe("persons") - disaggregate_accessibility = whale.get_dataframe("disaggregate_accessibility") +@workflow.temp_table +def persons_merged( + whale: workflow.Whale, + persons: pd.DataFrame, + land_use: pd.DataFrame, + households: pd.DataFrame, + accessibility: pd.DataFrame, + disaggregate_accessibility: pd.DataFrame = None, +): + # land_use = whale.get_dataframe("land_use") + # households = whale.get_dataframe("households") + # accessibility = whale.get_dataframe("accessibility") + # persons = whale.get_dataframe("persons") + # disaggregate_accessibility = whale.get_dataframe("disaggregate_accessibility") def join(left, right, left_on): intersection = set(left.columns).intersection(right.columns) @@ -127,7 +132,7 @@ def join(left, right, left_on): households, left_on="household_id", ) - if not disaggregate_accessibility.empty: + if disaggregate_accessibility is not None and not disaggregate_accessibility.empty: persons = join( persons, disaggregate_accessibility, diff --git a/activitysim/abm/tables/time_windows.py b/activitysim/abm/tables/time_windows.py index c1356ce07..128d996c8 100644 --- a/activitysim/abm/tables/time_windows.py +++ b/activitysim/abm/tables/time_windows.py @@ -69,4 +69,4 @@ def timetable( whale: workflow.Whale, person_windows: pd.DataFrame, tdd_alts: pd.DataFrame ) -> tt.TimeTable: logging.debug("@inject timetable") - return tt.TimeTable(person_windows, tdd_alts, person_windows.name) + return tt.TimeTable(person_windows, tdd_alts, "person_windows") diff --git a/activitysim/abm/tables/vehicles.py b/activitysim/abm/tables/vehicles.py index 384b3dc70..feaf05003 100644 --- a/activitysim/abm/tables/vehicles.py +++ b/activitysim/abm/tables/vehicles.py @@ -40,7 +40,7 @@ def vehicles(whale: workflow.Whale, households): whale.add_table("vehicles", vehicles) whale.get_rn_generator().add_channel("vehicles", vehicles) - tracing.register_traceable_table("vehicles", vehicles) + tracing.register_traceable_table(whale, "vehicles", vehicles) return vehicles diff --git a/activitysim/abm/test/run_multi_zone_mp.py b/activitysim/abm/test/run_multi_zone_mp.py index dacd1b310..ec87258d2 100644 --- a/activitysim/abm/test/run_multi_zone_mp.py +++ b/activitysim/abm/test/run_multi_zone_mp.py @@ -25,7 +25,7 @@ def test_mp_run(): # do this after config.handle_standard_args, as command line args may override injectables injectables = ["data_dir", "configs_dir", "output_dir", "settings_file_name"] - injectables = {k: inject.get_injectable(k) for k in injectables} + injectables = {k: whale.get_injectable(k) for k in injectables} mp_tasks.run_multiprocess(run_list, injectables) pipeline.open_pipeline("_") diff --git a/activitysim/abm/test/test_misc/test_summarize.py b/activitysim/abm/test/test_misc/test_summarize.py index cbbae9807..eb645b05d 100644 --- a/activitysim/abm/test/test_misc/test_summarize.py +++ b/activitysim/abm/test/test_misc/test_summarize.py @@ -54,22 +54,24 @@ def test_summarize(initialize_pipeline: pipeline.Whale, caplog): pipeline.run(models=["summarize"]) # Retrieve output tables to check contents - 
model_settings = config.read_model_settings("summarize.yaml") + model_settings = whale.filesystem.read_model_settings("summarize.yaml") output_location = ( model_settings["OUTPUT"] if "OUTPUT" in model_settings else "summaries" ) - output_dir = config.output_file_path(output_location) + output_dir = whale.get_output_file_path(output_location) # Check that households are counted correctly households_count = pd.read_csv( - config.output_file_path(os.path.join(output_location, f"households_count.csv")) + whale.get_output_file_path( + os.path.join(output_location, f"households_count.csv") + ) ) households = pd.read_csv(config.data_file_path("households.csv")) assert int(households_count.iloc[0]) == len(households) # Check that bike trips are counted correctly trips_by_mode_count = pd.read_csv( - config.output_file_path( + whale.get_output_file_path( os.path.join(output_location, f"trips_by_mode_count.csv") ) ) diff --git a/activitysim/abm/test/test_pipeline/test_pipeline.py b/activitysim/abm/test/test_pipeline/test_pipeline.py index 0bb09a4cc..8b488932a 100644 --- a/activitysim/abm/test/test_pipeline/test_pipeline.py +++ b/activitysim/abm/test/test_pipeline/test_pipeline.py @@ -283,7 +283,7 @@ def test_mini_pipeline_run3(): setup_dirs() inject_settings(hh_ids="override_hh_ids.csv") - households = inject.get_table("households").to_frame() + households = whale.get_dataframe("households") override_hh_ids = pd.read_csv(config.data_file_path("override_hh_ids.csv")) @@ -334,7 +334,7 @@ def full_run( def get_trace_csv(file_name): - file_name = config.output_file_path(file_name) + file_name = whale.get_output_file_path(file_name) df = pd.read_csv(file_name) # label value_1 value_2 value_3 value_4 @@ -456,7 +456,7 @@ def regress(): assert trips_df.shape[0] >= 2 * tours_df.shape[0] # write_trip_matrices - trip_matrices_file = config.output_file_path("trips_md.omx") + trip_matrices_file = whale.get_output_file_path("trips_md.omx") assert os.path.exists(trip_matrices_file) trip_matrices = omx.open_file(trip_matrices_file) assert trip_matrices.shape() == (25, 25) diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index cfba48651..29b72fdd2 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -127,12 +127,12 @@ def setup_component( whale.open_pipeline(resume_after, mode="r") for k in preload_injectables: - if inject.get_injectable(k, None) is not None: + if whale.get_injectable(k, None) is not None: logger.info("pre-loaded %s", k) # Directories Logging for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info(f"DIRECTORY {k}: {inject.get_injectable(k, None)}") + logger.info(f"DIRECTORY {k}: {whale.get_injectable(k, None)}") # Settings Logging log_settings = [ @@ -261,9 +261,9 @@ def pre_run( # directories for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info("SETTING %s: %s" % (k, inject.get_injectable(k, None))) + logger.info("SETTING %s: %s" % (k, whale.get_injectable(k, None))) - log_settings = inject.get_injectable("log_settings", {}) + log_settings = whale.get_injectable("log_settings", {}) for k in log_settings: logger.info("SETTING %s: %s" % (k, config.setting(k))) @@ -322,7 +322,7 @@ def run_multiprocess(): from activitysim.core import mp_tasks - injectables = {k: inject.get_injectable(k) for k in INJECTABLES} + injectables = {k: whale.get_injectable(k) for k in INJECTABLES} mp_tasks.run_multiprocess(injectables) # assert not 
pipeline.is_open() diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index 3a822998b..89d4f549c 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -109,7 +109,7 @@ def add_run_args(parser, multiprocess=True): def validate_injectable(whale: workflow.Whale, name, make_if_missing=False): try: dir_paths = whale.context.get_formatted(name) - # dir_paths = inject.get_injectable(name) + # dir_paths = whale.get_injectable(name) except RuntimeError: # injectable is missing, meaning is hasn't been explicitly set # and defaults cannot be found. @@ -393,7 +393,7 @@ def run(args): from activitysim.core import mp_tasks - injectables = {k: inject.get_injectable(k) for k in INJECTABLES} + injectables = {k: whale.get_injectable(k) for k in INJECTABLES} mp_tasks.run_multiprocess(whale, injectables) assert not whale.is_open diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 3fba39817..13764382f 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -1069,6 +1069,20 @@ def log_df(self, trace_label, table_name, df): @contextmanager def chunk_log(trace_label, chunk_tag=None, base=False, settings=None): + """ + Chunk management. + + Parameters + ---------- + trace_label : str + chunk_tag : str, optional + base + settings + + Yields + ------ + ChunkSizer + """ # With `base=True` this method can be used to instantiate # a ChunkSizer class object without actually chunking. This # avoids breaking the assertion below. diff --git a/activitysim/core/configuration/filesystem.py b/activitysim/core/configuration/filesystem.py index 53fdbe472..cff1f9215 100644 --- a/activitysim/core/configuration/filesystem.py +++ b/activitysim/core/configuration/filesystem.py @@ -8,9 +8,9 @@ import yaml from pydantic import DirectoryPath, validator -from ..exceptions import SettingsFileNotFoundError -from ..util import parse_suffix_args, suffix_tables_in_settings -from .base import PydanticBase +from activitysim.core.configuration.base import PydanticBase +from activitysim.core.exceptions import SettingsFileNotFoundError +from activitysim.core.util import parse_suffix_args, suffix_tables_in_settings logger = logging.getLogger(__name__) @@ -131,6 +131,9 @@ def get_output_dir(self, subdir=None) -> Path: out.mkdir(parents=True) return out + def get_output_file_path(self, file_name) -> Path: + return self.get_output_dir().joinpath(file_name) + def get_pipeline_filepath(self) -> Path: """ Get the complete path to the pipeline file or directory. 
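The chunk_log docstring added above records that the context manager now yields a ChunkSizer, which is the object the rest of this series uses in place of the module-level chunk.log_df calls. A minimal sketch of that usage pattern, assuming a whale workflow state is already in scope and using a throwaway dataframe and trace label for illustration:

    import pandas as pd
    from activitysim.core import chunk

    trace_label = "example.schedule_tours"  # illustrative label only
    with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer:
        df = pd.DataFrame({"x": [1, 2, 3]})          # stand-in for a real chooser table
        chunk_sizer.log_df(trace_label, "df", df)    # register the table with the sizer
        del df
        chunk_sizer.log_df(trace_label, "df", None)  # log None once the table is released

The same object is handed to callees (as in _schedule_tours above) via an explicit chunk_sizer keyword argument instead of being rediscovered from global state.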
@@ -322,7 +325,6 @@ def expand_input_file_list(self, input_files) -> list[Path]: ungroked_files = 0 for file_name in input_files: - file_name = self.get_data_file_path(file_name, allow_glob=True) if file_name.is_file(): @@ -618,4 +620,27 @@ def backfill_settings(settings, backfill): else: return settings - read_model_settings = read_settings_file + def read_model_settings( + self, + file_name, + mandatory=False, + ): + # in the legacy implementation, this function has a default mandatory=False + return self.read_settings_file(file_name, mandatory=mandatory) + + def read_model_spec(self, file_name: str): + from activitysim.core import simulate + + return simulate.read_model_spec(self, file_name) + + def read_model_coefficients(self, model_settings=None, file_name=None): + from activitysim.core import simulate + + return simulate.read_model_coefficients( + self, model_settings=model_settings, file_name=file_name + ) + + def get_segment_coefficients(self, model_settings, segment_name): + from activitysim.core import simulate + + return simulate.get_segment_coefficients(self, model_settings, segment_name) diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index 79201a625..3ab0eb5c5 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -36,7 +36,9 @@ def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None) if isinstance(model_settings, str): model_settings_name = model_settings - model_settings = config.read_model_settings("%s.yaml" % model_settings) + model_settings = whale.filesystem.read_model_settings( + "%s.yaml" % model_settings + ) assert model_settings, "Found no model settings for %s" % model_settings_name else: model_settings_name = "dict" diff --git a/activitysim/core/input.py b/activitysim/core/input.py index ed7bdb454..fd8992096 100644 --- a/activitysim/core/input.py +++ b/activitysim/core/input.py @@ -129,11 +129,11 @@ def read_from_table_info(table_info: InputTable, whale): if create_input_store: raise NotImplementedError("the input store functionality has been disabled") - # h5_filepath = config.output_file_path("input_data.h5") + # h5_filepath = whale.get_output_file_path("input_data.h5") # logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) # df.to_hdf(h5_filepath, key=h5_tablename, mode="a") # - # csv_dir = config.output_file_path("input_data") + # csv_dir = whale.get_output_file_path("input_data") # if not os.path.exists(csv_dir): # os.makedirs(csv_dir) # make directory if needed # df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index add9f6301..6f283ef7d 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -60,7 +60,7 @@ def compute_utilities( locals_dict.update(model_constants) # we don't grok coefficients, but allow them to use constants in spec alt columns - spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + spec = whale.filesystem.read_model_spec(file_name=model_settings["SPEC"]) for c in spec.columns: if c != simulate.SPEC_LABEL_NAME: spec[c] = spec[c].map(lambda s: model_constants.get(s, s)).astype(float) @@ -964,7 +964,7 @@ def build_virtual_path( np.nansum(np.exp(utilities_df.values), axis=1) == 0 ] zero_utilities_df.to_csv( - config.output_file_path("warning_utilities_df.csv"), + whale.get_output_file_path("warning_utilities_df.csv"), index=True, ) @@ -1081,7 +1081,7 @@ def get_tvpb_logsum( trace_label=trace_label, ) - 
trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = whale.settings.trace_hh_id if (all(logsum_df["logsum"] == UNAVAILABLE)) or (len(logsum_df) == 0): trace_hh_id = False @@ -1125,7 +1125,7 @@ def get_tvpb_best_transit_time(self, orig, dest, tod): trace_label=trace_label, ) - trace_od = inject.get_injectable("trace_od", None) + trace_od = whale.get_injectable("trace_od", None) if trace_od: filter_targets = (orig == trace_od[0]) & (dest == trace_od[1]) if filter_targets.any(): diff --git a/activitysim/core/pathbuilder_cache.py b/activitysim/core/pathbuilder_cache.py index 1bd219693..5ddda9ff1 100644 --- a/activitysim/core/pathbuilder_cache.py +++ b/activitysim/core/pathbuilder_cache.py @@ -306,7 +306,7 @@ def get_data_and_lock_from_buffers(self): ------- either multiprocessing.Array and lock or multiprocessing.RawArray and None according to RAWARRAY """ - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = whale.get_injectable("data_buffers", None) assert self.cache_tag in data_buffers # internal error logger.debug(f"TVPBCache.get_data_and_lock_from_buffers") data_buffer = data_buffers[self.cache_tag] @@ -361,7 +361,9 @@ def __init__(self, network_los): spec_name = self.network_los.setting( f"TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.SPEC" ) - self.set_names = list(simulate.read_model_spec(file_name=spec_name).columns) + self.set_names = list( + whale.filesystem.read_model_spec(file_name=spec_name).columns + ) @property def fully_populated_shape(self): diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index b9e7d5c70..ebff82726 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -15,6 +15,7 @@ assign, chunk, config, + configuration, logit, pathbuilder, tracing, @@ -64,7 +65,7 @@ def read_model_alts(whale: workflow.Whale, file_name, set_index=None): return df -def read_model_spec(whale: workflow.Whale, file_name: str): +def read_model_spec(filesystem: configuration.FileSystem, file_name: str): """ Read a CSV model specification into a Pandas DataFrame or Series. 
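As the simulate.py hunk above (and the FileSystem wrappers added earlier in this patch) indicate, spec and coefficient reading now routes through the FileSystem object rather than the whole workflow state. A hedged sketch of the two call forms this refactor assumes, reusing a spec file name from the cdap tests and assuming model_settings and estimator are already in scope:

    from activitysim.core import simulate

    # call the refactored helper directly with the filesystem...
    spec = simulate.read_model_spec(whale.filesystem, file_name="cdap_indiv_and_hhsize1.csv")
    # ...or use the thin convenience wrapper added on FileSystem itself
    spec = whale.filesystem.read_model_spec(file_name="cdap_indiv_and_hhsize1.csv")

    # coefficients are then read and substituted much as before
    coefficients_df = whale.filesystem.read_model_coefficients(model_settings)
    spec = simulate.eval_coefficients(whale, spec, coefficients_df, estimator)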
@@ -100,7 +101,7 @@ def read_model_spec(whale: workflow.Whale, file_name: str): if not file_name.lower().endswith(".csv"): file_name = "%s.csv" % (file_name,) - file_path = whale.filesystem.get_config_file_path(file_name) + file_path = filesystem.get_config_file_path(file_name) try: spec = pd.read_csv(file_path, comment="#") @@ -128,10 +129,13 @@ def read_model_spec(whale: workflow.Whale, file_name: str): return spec -def read_model_coefficients(whale, model_settings=None, file_name=None): +def read_model_coefficients( + filesystem: configuration.FileSystem, model_settings=None, file_name=None +): """ Read the coefficient file specified by COEFFICIENTS model setting """ + assert isinstance(filesystem, configuration.FileSystem) if model_settings is None: assert file_name is not None @@ -145,7 +149,7 @@ def read_model_coefficients(whale, model_settings=None, file_name=None): file_name = model_settings["COEFFICIENTS"] logger.debug(f"read_model_coefficients file_name {file_name}") - file_path = whale.filesystem.get_config_file_path(file_name) + file_path = filesystem.get_config_file_path(file_name) try: coefficients = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -189,7 +193,7 @@ def spec_for_segment( """ spec_file_name = model_settings[spec_id] - spec = read_model_spec(whale, file_name=spec_file_name) + spec = read_model_spec(whale.filesystem, file_name=spec_file_name) if len(spec.columns) > 1: # if spec is segmented @@ -213,14 +217,16 @@ def spec_for_segment( return spec - coefficients = read_model_coefficients(whale, model_settings) + coefficients = whale.filesystem.read_model_coefficients(model_settings) spec = eval_coefficients(whale, spec, coefficients, estimator) return spec -def read_model_coefficient_template(whale: workflow.Whale, model_settings): +def read_model_coefficient_template( + filesystem: configuration.FileSystem, model_settings +): """ Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting """ @@ -233,7 +239,7 @@ def read_model_coefficient_template(whale: workflow.Whale, model_settings): coefficients_file_name = model_settings["COEFFICIENT_TEMPLATE"] - file_path = whale.filesystem.get_config_file_path(coefficients_file_name) + file_path = filesystem.get_config_file_path(coefficients_file_name) try: template = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -265,25 +271,26 @@ def dump_mapped_coefficients(whale: workflow.Whale, model_settings): dump template_df with coefficient values """ - coefficients_df = read_model_coefficients(whale, model_settings) - template_df = read_model_coefficient_template(whale, model_settings) + coefficients_df = whale.filesystem.read_model_coefficients(model_settings) + template_df = read_model_coefficient_template(whale.filesystem, model_settings) for c in template_df.columns: template_df[c] = template_df[c].map(coefficients_df.value) coefficients_template_file_name = model_settings["COEFFICIENT_TEMPLATE"] - file_path = config.output_file_path(coefficients_template_file_name) + file_path = whale.get_output_file_path(coefficients_template_file_name) template_df.to_csv(file_path, index=True) logger.info(f"wrote mapped coefficient template to {file_path}") coefficients_file_name = model_settings["COEFFICIENTS"] - file_path = config.output_file_path(coefficients_file_name) + file_path = whale.get_output_file_path(coefficients_file_name) coefficients_df.to_csv(file_path, index=True) logger.info(f"wrote raw coefficients to {file_path}") 
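dump_mapped_coefficients above now resolves its output locations through the workflow state rather than the module-level config helper. A minimal sketch of the pattern, assuming whale.get_output_file_path delegates to the FileSystem.get_output_file_path method added earlier in this patch (the file name is illustrative):

    # write a dataframe into the configured output directory
    out_path = whale.get_output_file_path("coefficients_dump.csv")
    coefficients_df.to_csv(out_path, index=True)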
-@workflow.func -def get_segment_coefficients(whale: workflow.Whale, model_settings, segment_name): +def get_segment_coefficients( + filesystem: configuration.FileSystem, model_settings, segment_name +): """ Return a dict mapping generic coefficient names to segment-specific coefficient values @@ -336,7 +343,7 @@ def get_segment_coefficients(whale: workflow.Whale, model_settings, segment_name if legacy: constants = config.get_model_constants(model_settings) - legacy_coeffs_file_path = whale.filesystem.get_config_file_path( + legacy_coeffs_file_path = filesystem.get_config_file_path( model_settings[legacy] ) omnibus_coefficients = pd.read_csv( @@ -346,8 +353,8 @@ def get_segment_coefficients(whale: workflow.Whale, model_settings, segment_name omnibus_coefficients[segment_name], constants=constants ) else: - coefficients_df = read_model_coefficients(whale, model_settings) - template_df = read_model_coefficient_template(whale, model_settings) + coefficients_df = filesystem.read_model_coefficients(model_settings) + template_df = read_model_coefficient_template(filesystem, model_settings) coefficients_col = ( template_df[segment_name].map(coefficients_df.value).astype(float) ) @@ -538,7 +545,7 @@ def eval_utilities( trace_label = tracing.extend_trace_label(trace_label, "eval_utils") # avoid altering caller's passed-in locals_d parameter (they may be looping) - locals_dict = assign.local_utilities() + locals_dict = assign.local_utilities(whale) if locals_d is not None: locals_dict.update(locals_d) @@ -746,7 +753,7 @@ def eval_utilities( return utilities -def eval_variables(exprs, df, locals_d=None): +def eval_variables(whale: workflow.Whale, exprs, df, locals_d=None): """ Evaluate a set of variable expressions from a spec in the context of a given data table. 
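get_segment_coefficients, refactored above to take a FileSystem, builds a dict of segment-specific coefficient values by mapping a template column onto the coefficients file. A sketch of that mapping under the new signature, with the "work" segment name as an illustrative assumption and model_settings assumed in scope:

    coefficients_df = whale.filesystem.read_model_coefficients(model_settings)
    template_df = simulate.read_model_coefficient_template(whale.filesystem, model_settings)

    # each template cell names a coefficient; look up its numeric value for one segment
    work_coefficients = (
        template_df["work"].map(coefficients_df.value).astype(float).to_dict()
    )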
@@ -781,7 +788,7 @@ def eval_variables(exprs, df, locals_d=None): """ # avoid altering caller's passed-in locals_d parameter (they may be looping) - locals_dict = assign.local_utilities() + locals_dict = assign.local_utilities(whale) if locals_d is not None: locals_dict.update(locals_d) globals_dict = {} diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py index e1638fc6a..48da308f5 100644 --- a/activitysim/core/skim_dict_factory.py +++ b/activitysim/core/skim_dict_factory.py @@ -503,7 +503,7 @@ def get_skim_data(self, skim_tag, skim_info): SkimData """ - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = whale.get_injectable("data_buffers", None) if data_buffers: # we assume any existing skim buffers will already have skim data loaded into them logger.info( @@ -594,7 +594,7 @@ def get_skim_data(self, skim_tag, skim_info): """ # don't expect legacy shared memory buffers - assert not inject.get_injectable("data_buffers", {}).get(skim_tag) + assert not whale.get_injectable("data_buffers", {}).get(skim_tag) skim_cache_path = self._memmap_skim_data_path(skim_tag) if not os.path.isfile(skim_cache_path): diff --git a/activitysim/core/steps/output.py b/activitysim/core/steps/output.py index f7663d9cf..d0b84d5ca 100644 --- a/activitysim/core/steps/output.py +++ b/activitysim/core/steps/output.py @@ -11,7 +11,8 @@ logger = logging.getLogger(__name__) -def track_skim_usage(output_dir): +@workflow.step +def track_skim_usage(whale: workflow.Whale): """ write statistics on skim usage (diagnostic to detect loading of un-needed skims) @@ -27,10 +28,12 @@ def track_skim_usage(output_dir): pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 - skim_dict = inject.get_injectable("skim_dict") + skim_dict = whale.get_injectable("skim_dict") mode = "wb" if sys.version_info < (3,) else "w" - with open(config.output_file_path("skim_usage.txt"), mode) as output_file: + with open( + whale.filesystem.get_output_file_path("skim_usage.txt"), mode + ) as output_file: print("\n### skim_dict usage", file=output_file) for key in skim_dict.get_skim_usage(): print(key, file=output_file) @@ -60,12 +63,12 @@ def previous_write_data_dictionary(whale: workflow.Whale, output_dir): """ - model_settings = config.read_model_settings("write_data_dictionary") + model_settings = whale.filesystem.read_model_settings("write_data_dictionary") txt_format = model_settings.get("txt_format", "data_dict.txt") csv_format = model_settings.get("csv_format", "data_dict.csv") if txt_format: - output_file_path = config.output_file_path(txt_format) + output_file_path = whale.get_output_file_path(txt_format) pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 @@ -83,7 +86,8 @@ def previous_write_data_dictionary(whale: workflow.Whale, output_dir): print(df.dtypes, file=output_file) -def write_data_dictionary(output_dir): +@workflow.step +def write_data_dictionary(whale: workflow.Whale): """ Write table schema for all tables @@ -104,7 +108,7 @@ def write_data_dictionary(output_dir): """ - model_settings = config.read_model_settings("write_data_dictionary") + model_settings = whale.filesystem.read_model_settings("write_data_dictionary") txt_format = model_settings.get("txt_format", "data_dict.txt") csv_format = model_settings.get("csv_format", "data_dict.csv") @@ -114,7 +118,7 @@ def write_data_dictionary(output_dir): ) return - table_names = pipeline.registered_tables() + table_names = whale.registered_tables() # use table_names list from model_settings, 
if provided schema_tables = model_settings.get("tables", None) @@ -126,7 +130,7 @@ def write_data_dictionary(output_dir): final_shapes = dict() for table_name in table_names: try: - df = pipeline.get_table(table_name) + df = whale.get_dataframe(table_name) except RuntimeError as run_err: if run_err.args and "dropped" in run_err.args[0]: # if a checkpointed table was dropped, that's not ideal, so we should @@ -135,6 +139,8 @@ def write_data_dictionary(output_dir): # note actually emitting a warnings.warn instead of a logger message will # unfortunately cause some of our excessively strict tests to fail continue + else: + raise final_shapes[table_name] = df.shape @@ -152,8 +158,8 @@ def write_data_dictionary(output_dir): schema[table_name] = info # annotate schema.info with name of checkpoint columns were first seen - for _, row in pipeline.get_checkpoints().iterrows(): - checkpoint_name = row[pipeline.CHECKPOINT_NAME] + for _, row in whale.get_checkpoints().iterrows(): + checkpoint_name = row[workflow.state.CHECKPOINT_NAME] for table_name in table_names: # no change to table in this checkpoint @@ -161,7 +167,7 @@ def write_data_dictionary(output_dir): continue # get the checkpointed version of the table - df = pipeline.get_table(table_name, checkpoint_name) + df = whale.get_table(table_name, checkpoint_name) if df.index.name and df.index.name not in df.columns: df = df.reset_index() @@ -181,10 +187,12 @@ def write_data_dictionary(output_dir): schema_df = pd.concat(schema.values()) if csv_format: - schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False) + schema_df.to_csv( + whale.get_output_file_path(csv_format), header=True, index=False + ) if txt_format: - with open(config.output_file_path(txt_format), "w") as output_file: + with open(whale.get_output_file_path(txt_format), "w") as output_file: # get max schema column widths from omnibus table col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df} @@ -209,7 +217,8 @@ def write_data_dictionary(output_dir): print(f"{info}\n", file=output_file) -def write_tables(whale, output_dir): +@workflow.step +def write_tables(whale: workflow.Whale): """ Write pipeline tables as csv files (in output directory) as specified by output_tables list in settings file. @@ -264,17 +273,14 @@ def write_tables(whale, output_dir): h5_store = output_tables_settings.get("h5_store", False) sort = output_tables_settings.get("sort", False) - registered_tables = pipeline.registered_tables() + registered_tables = whale.registered_tables() if action == "include": # interpret empty or missing tables setting to mean include all registered tables output_tables_list = tables if tables is not None else registered_tables elif action == "skip": output_tables_list = [t for t in registered_tables if t not in tables] else: - raise "expected %s action '%s' to be either 'include' or 'skip'" % ( - output_tables_settings_name, - action, - ) + raise f"expected action '{action}' to be either 'include' or 'skip'" for table_name in output_tables_list: if not isinstance(table_name, str): @@ -284,15 +290,15 @@ def write_tables(whale, output_dir): table_decode_cols = {} if table_name == "checkpoints": - df = pipeline.get_checkpoints() + df = whale.get_checkpoints() else: if table_name not in registered_tables: logger.warning("Skipping '%s': Table not found." 
% table_name) continue - df = pipeline.get_table(table_name) + df = whale.get_dataframe(table_name) if sort: - traceable_table_indexes = inject.get_injectable( + traceable_table_indexes = whale.get_injectable( "traceable_table_indexes", {} ) @@ -356,11 +362,11 @@ def map_func(x): df = df.drop(columns=[f"_original_{lookup_col}"]) if h5_store: - file_path = config.output_file_path("%soutput_tables.h5" % prefix) + file_path = whale.get_output_file_path("%soutput_tables.h5" % prefix) df.to_hdf(file_path, key=table_name, mode="a", format="fixed") else: file_name = "%s%s.csv" % (prefix, table_name) - file_path = config.output_file_path(file_name) + file_path = whale.get_output_file_path(file_name) # include the index if it has a name or is a MultiIndex write_index = df.index.name is not None or isinstance( diff --git a/activitysim/core/test/extensions/steps.py b/activitysim/core/test/extensions/steps.py index c02cd7f98..0f5fcfcbd 100644 --- a/activitysim/core/test/extensions/steps.py +++ b/activitysim/core/test/extensions/steps.py @@ -61,4 +61,4 @@ def create_households(whale: workflow.Whale, trace_hh_id): pipeline.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table("households", df) + tracing.register_traceable_table(whale, "households", df) diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index 09618dee2..c4d7b1b1f 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -24,7 +24,7 @@ def spec_name(data_dir): @pytest.fixture(scope="module") def spec(data_dir, spec_name): - return simulate.read_model_spec(file_name=spec_name) + return whale.filesystem.read_model_spec(file_name=spec_name) @pytest.fixture(scope="module") @@ -42,7 +42,7 @@ def setup_function(): def test_read_model_spec(spec_name): - spec = simulate.read_model_spec(file_name=spec_name) + spec = whale.filesystem.read_model_spec(file_name=spec_name) assert len(spec) == 4 assert spec.index.name == "Expression" @@ -52,7 +52,7 @@ def test_read_model_spec(spec_name): def test_eval_variables(spec, data): - result = simulate.eval_variables(spec.index, data) + result = simulate.eval_variables(whale, spec.index, data) expected = pd.DataFrame( [[1, 0, 4, 1], [0, 1, 4, 1], [0, 1, 5, 1]], index=data.index, columns=spec.index diff --git a/activitysim/core/test/test_tracing.py b/activitysim/core/test/test_tracing.py index 82a40656c..7b6e13be8 100644 --- a/activitysim/core/test/test_tracing.py +++ b/activitysim/core/test/test_tracing.py @@ -105,14 +105,14 @@ def test_register_households(capsys): inject.add_injectable("traceable_tables", ["households"]) inject.add_injectable("trace_hh_id", 5) - tracing.register_traceable_table("households", df) + tracing.register_traceable_table(whale, "households", df) out, err = capsys.readouterr() # print out # don't consume output assert "Can't register table 'households' without index name" in out df.index.name = "household_id" - tracing.register_traceable_table("households", df) + tracing.register_traceable_table(whale, "households", df) out, err = capsys.readouterr() # print out # don't consume output @@ -139,7 +139,7 @@ def test_register_tours(capsys): tours_df = pd.DataFrame({"zort": ["a", "b", "c"]}, index=[10, 11, 12]) tours_df.index.name = "tour_id" - tracing.register_traceable_table("tours", tours_df) + tracing.register_traceable_table(whale, "tours", tours_df) out, err = capsys.readouterr() assert ( @@ -150,9 +150,9 @@ def test_register_tours(capsys): 
inject.add_injectable("trace_hh_id", 3) households_df = pd.DataFrame({"dzing": ["a", "b", "c"]}, index=[1, 2, 3]) households_df.index.name = "household_id" - tracing.register_traceable_table("households", households_df) + tracing.register_traceable_table(whale, "households", households_df) - tracing.register_traceable_table("tours", tours_df) + tracing.register_traceable_table(whale, "tours", tours_df) out, err = capsys.readouterr() # print out # don't consume output @@ -160,7 +160,7 @@ def test_register_tours(capsys): tours_df["household_id"] = [1, 5, 3] - tracing.register_traceable_table("tours", tours_df) + tracing.register_traceable_table(whale, "tours", tours_df) out, err = capsys.readouterr() print(out) # don't consume output diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index f677cae37..08b1a4a79 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -261,7 +261,7 @@ def initialize_traceable_tables(whale: workflow.Whale): whale.set("traceable_table_ids", {}) -def register_traceable_table(whale, table_name, df): +def register_traceable_table(whale: workflow.Whale, table_name: str, df: pd.DataFrame): """ Register traceable table @@ -682,8 +682,8 @@ def dump_df(dump_switch, df, trace_label, fname): def trace_df( - df, - label, + df: pd.DataFrame, + label: str, slicer=None, columns=None, index_label=None, diff --git a/activitysim/core/workflow/__init__.py b/activitysim/core/workflow/__init__.py index e8f9db45d..b6a39e62f 100644 --- a/activitysim/core/workflow/__init__.py +++ b/activitysim/core/workflow/__init__.py @@ -3,3 +3,4 @@ from .steps import workflow_cached_object as cached_object from .steps import workflow_step as step from .steps import workflow_table as table +from .steps import workflow_temp_table as temp_table diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py index e36d5cd5c..a4905faab 100644 --- a/activitysim/core/workflow/state.py +++ b/activitysim/core/workflow/state.py @@ -58,8 +58,9 @@ def split_arg(s, sep, default=""): class WhaleAttr: - def __init__(self, member_type): + def __init__(self, member_type, default_init=False): self.member_type = member_type + self._default_init = default_init def __set_name__(self, owner, name): self.name = name @@ -68,6 +69,9 @@ def __get__(self, instance, objtype=None): try: return instance.context[self.name] except (KeyError, AttributeError): + if self._default_init: + instance.context[self.name] = self.member_type() + return instance.context[self.name] raise WhaleAccessError(f"{self.name} not initialized for this whale") def __set__(self, instance, value): @@ -114,6 +118,7 @@ def init_state(self, pipeline_file_format="parquet"): filesystem = WhaleAttr(FileSystem) settings = WhaleAttr(Settings) network_settings = WhaleAttr(NetworkSettings) + predicates = WhaleAttr(dict, default_init=True) # @property # def filesystem(self) -> FileSystem: @@ -159,6 +164,7 @@ def init_state(self, pipeline_file_format="parquet"): _RUNNABLE_STEPS = {} _LOADABLE_TABLES = {} _LOADABLE_OBJECTS = {} + _PREDICATES = {} @property def known_table_names(self): @@ -225,7 +231,7 @@ def get_dataframe(self, tablename): def access(self, key, initializer): if key not in self.context: - self.context[key] = initializer + self.set(key, initializer) return self.context[key] def get(self, key, default: Any = NO_DEFAULT): @@ -249,6 +255,10 @@ def get(self, key, default: Any = NO_DEFAULT): def set(self, key, value): self.context[key] = value + for i in self._PREDICATES.get(key, []): + if i in 
self.context: + logger.critical(f"update of {key} clears cached {i}") + del self.context[i] def extract(self, func): return func(self) @@ -456,7 +466,7 @@ def add_table(self, name, content, salient=True): # mark this salient table as edited, so it can be checkpointed # at some later time if desired. self.existing_table_status[name] = True - self.context.update({name: content}) + self.set(name, content) def is_table(self, name): return name in self.existing_table_status @@ -643,7 +653,7 @@ def load_checkpoint(self, checkpoint_name): # register for tracing in order that tracing.register_traceable_table wants us to register them traceable_tables = self.get_injectable("traceable_tables", []) - from .tracing import register_traceable_table + from activitysim.core.tracing import register_traceable_table for table_name in traceable_tables: if table_name in loaded_tables: @@ -1071,9 +1081,6 @@ def extend_table(self, table_name, df, axis=0): orca/inject table name df : pandas DataFrame """ - - assert self.is_open, f"Pipeline is not open." - assert axis in [0, 1] if self.is_table(table_name): @@ -1107,7 +1114,6 @@ def extend_table(self, table_name, df, axis=0): return df def drop_table(self, table_name): - assert self.is_open, f"Pipeline is not open." if self.is_table(table_name): logger.debug("drop_table dropping orca table '%s'" % table_name) @@ -1202,3 +1208,9 @@ def chunk_log(self, *args, **kwargs): from activitysim.core.chunk import chunk_log return chunk_log(*args, **kwargs, settings=self.settings) + + def get_output_file_path(self, file_name: str) -> Path: + prefix = self.get_injectable("output_file_prefix", None) + if prefix: + file_name = "%s-%s" % (prefix, file_name) + return self.filesystem.get_output_dir().joinpath(file_name) diff --git a/activitysim/core/workflow/steps.py b/activitysim/core/workflow/steps.py index 5457d3a8b..b35b5acfb 100644 --- a/activitysim/core/workflow/steps.py +++ b/activitysim/core/workflow/steps.py @@ -4,19 +4,30 @@ import logging import time from inspect import get_annotations, getfullargspec -from typing import Callable, Mapping +from typing import Callable, Mapping, NamedTuple from pypyr.context import Context from pypyr.errors import KeyNotInContextError -from ..exceptions import DuplicateWorkflowNameError, DuplicateWorkflowTableError -from .util import get_formatted_or_default, get_formatted_or_raw +from activitysim.core.exceptions import ( + DuplicateWorkflowNameError, + DuplicateWorkflowTableError, +) +from activitysim.core.workflow.util import ( + get_formatted_or_default, + get_formatted_or_raw, +) logger = logging.getLogger(__name__) _STEP_LIBRARY = {} +class TableInfo(NamedTuple): + factory: Callable + predicates: tuple[str] + + def error_logging(func): def wrapper(*args, **kwargs): try: @@ -269,6 +280,9 @@ def run_step(context: Context = None) -> None: context["_salient_tables"] = {} context["_salient_tables"][self._step_name] = time.time() return outcome + elif self._kind == "temp_table": + context[self._step_name] = outcome + return outcome elif self._kind == "cached_object": context[self._step_name] = outcome return outcome @@ -294,6 +308,14 @@ def update_with_cache(whale, *args, **kwargs): elif self._kind == "table": Whale._LOADABLE_TABLES[self._step_name] = run_step return update_with_cache + elif self._kind == "temp_table": + Whale._LOADABLE_TABLES[self._step_name] = run_step + for i in _args[1:]: + if i not in Whale._PREDICATES: + Whale._PREDICATES[i] = {self._step_name} + else: + Whale._PREDICATES[i].add(self._step_name) + return 
update_with_cache elif self._kind == "step": Whale._RUNNABLE_STEPS[self._step_name] = run_step return wrapped_func @@ -315,6 +337,13 @@ def __new__(cls, wrapped_func=None, *, step_name=None): ) +class workflow_temp_table(workflow_step): + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="temp_table" + ) + + def _validate_workflow_function(f): from activitysim.core.workflow import Whale diff --git a/activitysim/examples/example_estimation/scripts/infer.py b/activitysim/examples/example_estimation/scripts/infer.py index 1075de496..7b4e7a7aa 100644 --- a/activitysim/examples/example_estimation/scripts/infer.py +++ b/activitysim/examples/example_estimation/scripts/infer.py @@ -400,8 +400,8 @@ def read_tdd_alts(): return tdds.tdd -def patch_tour_ids(persons, tours, joint_tour_participants): - def set_tour_index(tours, parent_tour_num_col, is_joint): +def patch_tour_ids(whale: workflow.Whale, persons, tours, joint_tour_participants): + def set_tour_index(whale, tours, parent_tour_num_col, is_joint): group_cols = ["person_id", "tour_category", "tour_type"] if "parent_tour_num" in tours: @@ -412,7 +412,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) return cid.set_tour_index( - tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + whale, tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint ) assert "mandatory_tour_frequency" in persons @@ -423,6 +423,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # mandatory tours ##################### mandatory_tours = set_tour_index( + whale, tours[tours.tour_category == "mandatory"], parent_tour_num_col=None, is_joint=False, @@ -447,7 +448,9 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): temp_point_persons.person_id, joint_tours.household_id ) - joint_tours = set_tour_index(joint_tours, parent_tour_num_col=None, is_joint=True) + joint_tours = set_tour_index( + whale, joint_tours, parent_tour_num_col=None, is_joint=True + ) joint_tours["person_id"] = joint_tours["cache_point_person_id"] del joint_tours["cache_point_person_id"] @@ -472,6 +475,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ##################### non_mandatory_tours = set_tour_index( + whale, tours[tours.tour_category == "non_mandatory"], parent_tour_num_col=None, is_joint=False, @@ -520,7 +524,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) atwork_tours = set_tour_index( - atwork_tours, parent_tour_num_col="parent_tour_num", is_joint=False + whale, atwork_tours, parent_tour_num_col="parent_tour_num", is_joint=False ) del atwork_tours["parent_tour_num"] @@ -747,7 +751,7 @@ def check_controls(table_name, column_name): return True -def infer(configs_dir, input_dir, output_dir): +def infer(whale: workflow.Whale, configs_dir, input_dir, output_dir): households, persons, tours, joint_tour_participants, trips = read_tables( input_dir, survey_tables ) @@ -793,7 +797,7 @@ def infer(configs_dir, input_dir, output_dir): # patch_tour_ids tours, joint_tour_participants = patch_tour_ids( - persons, tours, joint_tour_participants + whale, persons, tours, joint_tour_participants ) survey_tables["tours"]["table"] = tours survey_tables["joint_tour_participants"]["table"] = joint_tour_participants @@ -856,4 +860,4 @@ def infer(configs_dir, input_dir, output_dir): if apply_controls: read_tables(input_dir, control_tables) -infer(configs_dir, input_dir, output_dir) +infer(whale, configs_dir, input_dir, output_dir) From 
735df4e522ac25ac9420d7ea885fceb887894d74 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 13 Feb 2023 09:06:15 -0600 Subject: [PATCH 010/419] refactoring --- activitysim/abm/models/accessibility.py | 5 +-- .../abm/models/atwork_subtour_frequency.py | 3 +- activitysim/abm/models/auto_ownership.py | 1 - activitysim/abm/models/free_parking.py | 1 - activitysim/abm/models/initialize_los.py | 2 +- .../abm/models/joint_tour_composition.py | 11 +++--- .../abm/models/joint_tour_frequency.py | 7 ++-- .../abm/models/joint_tour_participation.py | 25 ++++++++----- .../abm/models/mandatory_tour_frequency.py | 1 - .../models/non_mandatory_tour_frequency.py | 6 ++-- activitysim/abm/models/stop_frequency.py | 1 - .../abm/models/telecommute_frequency.py | 1 - .../models/tour_scheduling_probabilistic.py | 4 +-- .../abm/models/transit_pass_ownership.py | 1 - .../abm/models/transit_pass_subsidy.py | 1 - .../abm/models/trip_departure_choice.py | 4 +-- activitysim/abm/models/trip_purpose.py | 4 +-- activitysim/abm/models/trip_scheduling.py | 2 +- .../abm/models/trip_scheduling_choice.py | 4 +-- activitysim/abm/models/util/cdap.py | 4 +-- activitysim/abm/models/util/mode.py | 1 - activitysim/abm/models/util/overlap.py | 14 ++++---- .../test/test_non_mandatory_tour_frequency.py | 2 +- activitysim/abm/models/util/tour_frequency.py | 15 ++++++-- .../abm/models/util/tour_scheduling.py | 2 +- .../models/util/vectorize_tour_scheduling.py | 2 +- activitysim/abm/models/vehicle_allocation.py | 1 - activitysim/abm/models/vehicle_type_choice.py | 1 - activitysim/abm/models/work_from_home.py | 1 - activitysim/core/chunk.py | 21 +++++++---- activitysim/core/interaction_sample.py | 4 +-- .../core/interaction_sample_simulate.py | 2 +- activitysim/core/interaction_simulate.py | 2 +- activitysim/core/logit.py | 6 ++-- activitysim/core/simulate.py | 36 ++++++++++--------- activitysim/core/test/test_simulate.py | 7 +++- activitysim/core/timetable.py | 4 +-- activitysim/core/workflow/state.py | 2 +- 38 files changed, 107 insertions(+), 104 deletions(-) diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index 47f848e78..c62fc4ed1 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -120,7 +120,6 @@ def compute_accessibility( land_use: pd.DataFrame, accessibility: pd.DataFrame, network_los: los.Network_LOS, - chunk_size: int, trace_od, ): @@ -171,9 +170,7 @@ def compute_accessibility( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - whale, accessibility_df, chunk_size, trace_label - ): + ) in chunk.adaptive_chunked_choosers(whale, accessibility_df, trace_label): accessibilities = compute_accessibilities_for_zones( whale, diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index 3a213bd2e..81f98cb11 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -82,7 +82,6 @@ def atwork_subtour_frequency(whale: workflow.Whale, tours, persons_merged, chunk spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="atwork_subtour_frequency", estimator=estimator, @@ -108,7 +107,7 @@ def atwork_subtour_frequency(whale: workflow.Whale, tours, persons_merged, chunk work_tours = tours[tours.tour_type == "work"] assert not work_tours.atwork_subtour_frequency.isnull().any() - subtours = 
process_atwork_subtours(work_tours, alternatives) + subtours = process_atwork_subtours(whale, work_tours, alternatives) tours = whale.extend_table("tours", subtours) diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 79d4c293b..9be6a1558 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -51,7 +51,6 @@ def auto_ownership_simulate( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="auto_ownership", log_alt_losers=log_alt_losers, diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index bba1e9838..ca45956b0 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -67,7 +67,6 @@ def free_parking( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="free_parking_at_work", estimator=estimator, diff --git a/activitysim/abm/models/initialize_los.py b/activitysim/abm/models/initialize_los.py index 06b018ead..13fe7eea2 100644 --- a/activitysim/abm/models/initialize_los.py +++ b/activitysim/abm/models/initialize_los.py @@ -141,7 +141,7 @@ def compute_utilities_for_attribute_tuple( chunk_trace_label, chunk_sizer, ) in chunk.adaptive_chunked_choosers( - whale, choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag + whale, choosers_df, trace_label, chunk_tag=chunk_tag ): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities if chooser_chunk._is_view: diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index 51b8db297..f7247b532 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -19,7 +19,11 @@ def add_null_results(whale, trace_label, tours): @workflow.step def joint_tour_composition( - whale: workflow.Whale, tours, households, persons, chunk_size + whale: workflow.Whale, + tours, + households: pd.DataFrame, + persons: pd.DataFrame, + chunk_size, ): """ This model predicts the makeup of the travel party (adults, children, or mixed). 
@@ -38,10 +42,8 @@ def joint_tour_composition( estimator = estimation.manager.begin_estimation(whale, "joint_tour_composition") # - only interested in households with joint_tours - households = households.to_frame() households = households[households.num_hh_joint_tours > 0] - persons = persons.to_frame() persons = persons[persons.household_id.isin(households.index)] logger.info( @@ -53,7 +55,7 @@ def joint_tour_composition( if preprocessor_settings: locals_dict = { "persons": persons, - "hh_time_window_overlap": hh_time_window_overlap, + "hh_time_window_overlap": lambda *x: hh_time_window_overlap(whale, *x), } expressions.assign_columns( @@ -90,7 +92,6 @@ def joint_tour_composition( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="composition", estimator=estimator, diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index da75746b1..ac75f0a56 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -33,12 +33,10 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) # - only interested in households with more than one cdap travel_active person and # - at least one non-preschooler - households = households.to_frame() multi_person_households = households[households.participates_in_jtf_model].copy() # - only interested in persons in multi_person_households # FIXME - gratuitous pathological efficiency move, just let yaml specify persons? - persons = persons.to_frame() persons = persons[persons.household_id.isin(multi_person_households.index)] logger.info( @@ -51,7 +49,7 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) if preprocessor_settings: locals_dict = { "persons": persons, - "hh_time_window_overlap": hh_time_window_overlap, + "hh_time_window_overlap": lambda *x: hh_time_window_overlap(whale, *x), } expressions.assign_columns( @@ -83,7 +81,6 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="joint_tour_frequency", estimator=estimator, @@ -111,7 +108,7 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) temp_point_persons = temp_point_persons.set_index("household_id") temp_point_persons = temp_point_persons[["person_id", "home_zone_id"]] - joint_tours = process_joint_tours(choices, alternatives, temp_point_persons) + joint_tours = process_joint_tours(whale, choices, alternatives, temp_point_persons) tours = whale.extend_table("tours", joint_tours) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index bcb949a9e..08f3961b0 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -116,7 +116,13 @@ def get_tour_satisfaction(candidates, participate): return satisfaction -def participants_chooser(probs, choosers, spec, trace_label): +def participants_chooser( + whale: workflow.Whale, + probs: pd.DataFrame, + choosers: pd.DataFrame, + spec: pd.DataFrame, + trace_label: str, +) -> tuple[pd.Series, pd.Series]: """ custom alternative to logit.make_choices for simulate.simple_simulate @@ -241,7 +247,7 @@ def participants_chooser(probs, choosers, spec, trace_label): return choices, rands -def annotate_jtp(model_settings, 
trace_label): +def annotate_jtp(whale: workflow.Whale, model_settings, trace_label): # - annotate persons persons = whale.get_dataframe("persons") expressions.assign_columns( @@ -264,11 +270,13 @@ def add_null_results(whale, model_settings, trace_label): whale.add_table("joint_tour_participants", participants) # - run annotations - annotate_jtp(model_settings, trace_label) + annotate_jtp(whale, model_settings, trace_label) @workflow.step -def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk_size): +def joint_tour_participation( + whale: workflow.Whale, tours: pd.DataFrame, persons_merged: pd.DataFrame, chunk_size +): """ Predicts for each eligible person to participate or not participate in each joint tour. """ @@ -277,7 +285,6 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk model_settings = whale.filesystem.read_model_settings(model_settings_file_name) trace_hh_id = whale.settings.trace_hh_id - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours @@ -285,8 +292,6 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk add_null_results(whale, model_settings, trace_label) return - persons_merged = persons_merged.to_frame() - # - create joint_tour_participation_candidates table candidates = joint_tour_participation_candidates(joint_tours, persons_merged) tracing.register_traceable_table(whale, "joint_tour_participants", candidates) @@ -301,7 +306,9 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = { - "person_time_window_overlap": person_time_window_overlap, + "person_time_window_overlap": lambda x: person_time_window_overlap( + whale, x + ), "persons": persons_merged, } @@ -417,7 +424,7 @@ def joint_tour_participation(whale: workflow.Whale, tours, persons_merged, chunk whale.add_table("tours", tours) # - run annotations - annotate_jtp(model_settings, trace_label) + annotate_jtp(whale, model_settings, trace_label) if trace_hh_id: tracing.trace_df(participants, label="joint_tour_participation.participants") diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index ad168bccb..2f923f459 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -92,7 +92,6 @@ def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="mandatory_tour_frequency", estimator=estimator, diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index a774513fd..52b2b9718 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -165,7 +165,7 @@ def non_mandatory_tour_frequency( # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = {"person_max_window": person_max_window} + locals_dict = {"person_max_window": lambda x: person_max_window(whale, x)} expressions.assign_columns( whale, @@ -338,7 +338,9 @@ def non_mandatory_tour_frequency( """ create the non_mandatory tours based on extended_tour_counts """ - non_mandatory_tours = process_non_mandatory_tours(persons, 
extended_tour_counts) + non_mandatory_tours = process_non_mandatory_tours( + whale, persons, extended_tour_counts + ) assert len(non_mandatory_tours) == extended_tour_counts.sum().sum() if estimator: diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 236bcb9f9..d714facf5 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -152,7 +152,6 @@ def stop_frequency( spec=segment_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, segment_name), trace_choice_name="stops", estimator=estimator, diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index 33b402019..2827f3bd1 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -70,7 +70,6 @@ def telecommute_frequency( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="telecommute_frequency", estimator=estimator, diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index dd0cc9ff5..014ef0a36 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -53,9 +53,7 @@ def run_tour_scheduling_probabilistic( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - whale, tours_df, chunk_size, trace_label, trace_label - ): + ) in chunk.adaptive_chunked_choosers(whale, tours_df, trace_label, trace_label): choices = ps.make_scheduling_choices( whale, chooser_chunk, diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index 9720c63f7..e462ba7ec 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -64,7 +64,6 @@ def transit_pass_ownership( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="transit_pass_ownership", estimator=estimator, diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index 08dca03e4..43d79116e 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -64,7 +64,6 @@ def transit_pass_subsidy( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="transit_pass_subsidy", estimator=estimator, diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 09a42527b..225ea8275 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -426,9 +426,7 @@ def apply_stage_two_model(whale, omnibus_spec, trips, chunk_size, trace_label): chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers_by_chunk_id( - side_trips, chunk_size, trace_label - ): + ) in chunk.adaptive_chunked_choosers_by_chunk_id(whale, side_trips, trace_label): for is_outbound, trip_segment in chooser_chunk.groupby(OUTBOUND): direction = OUTBOUND if is_outbound else "inbound" spec = get_spec_for_segment(omnibus_spec, direction) diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py 
index 6fbfa3feb..336f6993c 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -242,9 +242,7 @@ def run_trip_purpose( trips_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - whale, trips_df, chunk_size, chunk_tag, trace_label - ): + ) in chunk.adaptive_chunked_choosers(whale, trips_df, chunk_tag, trace_label): choices = choose_intermediate_trip_purpose( whale, trips_chunk, diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 80c75770f..99f153674 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -468,7 +468,7 @@ def trip_scheduling(whale: workflow.Whale, trips, tours, chunk_size, trace_hh_id chunk_trace_label, chunk_sizer, ) in chunk.adaptive_chunked_choosers_by_chunk_id( - trips_df, chunk_size, trace_label, trace_label + whale, trips_df, trace_label, trace_label ): i = 0 while (i < max_iterations) and not trips_chunk.empty: diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index 221560b7d..fd0d767af 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -263,9 +263,7 @@ def run_trip_scheduling_choice( choosers, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - whale, indirect_tours, chunk_size, trace_label - ): + ) in chunk.adaptive_chunked_choosers(whale, indirect_tours, trace_label): # Sort the choosers and get the schedule alternatives choosers = choosers.sort_index() schedules = generate_schedule_alternatives(choosers).sort_index() diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index 1310fb012..f10329692 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -1047,9 +1047,7 @@ def run_cdap( persons_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers_by_chunk_id( - whale, persons, chunk_size, trace_label - ): + ) in chunk.adaptive_chunked_choosers_by_chunk_id(whale, persons, trace_label): cdap_results = _run_cdap( whale, persons_chunk, diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index c85eb13a6..c185642c4 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -59,7 +59,6 @@ def mode_choice_simulate( nest_spec=nest_spec, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, want_logsums=want_logsums, trace_label=trace_label, trace_choice_name=trace_choice_name, diff --git a/activitysim/abm/models/util/overlap.py b/activitysim/abm/models/util/overlap.py index a014b7b15..d6b47080a 100644 --- a/activitysim/abm/models/util/overlap.py +++ b/activitysim/abm/models/util/overlap.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from activitysim.core import inject, tracing +from activitysim.core import inject, tracing, workflow logger = logging.getLogger(__name__) @@ -90,7 +90,7 @@ def rle(a): return row_id, start_pos, run_length, run_val -def p2p_time_window_overlap(p1_ids, p2_ids): +def p2p_time_window_overlap(whale: workflow.Whale, p1_ids, p2_ids): """ Parameters @@ -163,11 +163,11 @@ def person_pairs(persons): return p2p -def hh_time_window_overlap(households, persons): +def hh_time_window_overlap(whale: workflow.Whale, households, persons): p2p = person_pairs(persons) - p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) + p2p["max_overlap"] = 
p2p_time_window_overlap(whale, p2p.person1, p2p.person2) hh_overlap = ( p2p.groupby(["household_id", "p2p_type"]) @@ -186,11 +186,11 @@ def hh_time_window_overlap(households, persons): return hh_overlap -def person_time_window_overlap(persons): +def person_time_window_overlap(whale: workflow.Whale, persons): p2p = person_pairs(persons) - p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) + p2p["max_overlap"] = p2p_time_window_overlap(whale, p2p.person1, p2p.person2) p_overlap = ( pd.concat( @@ -221,7 +221,7 @@ def person_time_window_overlap(persons): return p_overlap -def person_max_window(persons): +def person_max_window(whale: workflow.Whale, persons): timetable = whale.get_injectable("timetable") diff --git a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py index b7fac8044..80e540986 100644 --- a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py @@ -42,7 +42,7 @@ def test_nmtf(): tour_counts.index = persons.index # assign person ids to the index # - create the non_mandatory tours - nmt = process_non_mandatory_tours(persons, tour_counts) + nmt = process_non_mandatory_tours(whale, persons, tour_counts) idx = nmt.index diff --git a/activitysim/abm/models/util/tour_frequency.py b/activitysim/abm/models/util/tour_frequency.py index f527bc488..e60f80a36 100644 --- a/activitysim/abm/models/util/tour_frequency.py +++ b/activitysim/abm/models/util/tour_frequency.py @@ -260,7 +260,7 @@ def process_mandatory_tours( return tours -def process_non_mandatory_tours(persons, tour_counts): +def process_non_mandatory_tours(whale: workflow.Whale, persons, tour_counts): """ This method processes the non_mandatory_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that @@ -313,7 +313,11 @@ def process_non_mandatory_tours(persons, tour_counts): return tours -def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): +def process_atwork_subtours( + whale: workflow.Whale, + work_tours: pd.DataFrame, + atwork_subtour_frequency_alts: pd.DataFrame, +): """ This method processes the atwork_subtour_frequency column that comes @@ -401,7 +405,12 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): return tours -def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_persons): +def process_joint_tours( + whale: workflow.Whale, + joint_tour_frequency, + joint_tour_frequency_alts, + point_persons, +): """ This method processes the joint_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that represents the diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index f3c0d5b07..a8c6850d1 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -176,7 +176,7 @@ def run_tour_scheduling( tdds=choices.reindex(nth_tours.index), ) - timetable.replace_table() + timetable.replace_table(whale) # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index d81612bf2..6e65c6b1c 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ 
b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -852,7 +852,7 @@ def schedule_tours( chunk_trace_label, chunk_sizer, ) in chunk.adaptive_chunked_choosers( - whale, tours, chunk_size, tour_trace_label, tour_chunk_tag + whale, tours, tour_trace_label, tour_chunk_tag ): choices = _schedule_tours( whale, diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index 379769345..ee81bbaec 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -201,7 +201,6 @@ def vehicle_allocation( nest_spec=nest_spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="vehicle_allocation", estimator=estimator, diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index 699b6a7d0..8ae33a14b 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -414,7 +414,6 @@ def iterate_vehicle_type_choice( log_alt_losers=log_alt_losers, nest_spec=nest_spec, locals_d=locals_dict, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="vehicle_type", estimator=estimator, diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index b353d8210..8b23ff39e 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -96,7 +96,6 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="work_from_home", estimator=estimator, diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 13764382f..314cb9580 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -1126,7 +1126,12 @@ def chunk_log_skip(): None -def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_tag=None): +def adaptive_chunked_choosers( + whale: workflow.Whale, + choosers: pd.DataFrame, + trace_label: str, + chunk_tag: str = None, +): # generator to iterate over choosers if whale.settings.chunk_training_mode == MODE_CHUNKLESS: @@ -1140,6 +1145,7 @@ def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_ta return chunk_tag = chunk_tag or trace_label + chunk_size = whale.settings.chunk_size num_choosers = len(choosers.index) assert num_choosers > 0 @@ -1184,11 +1190,10 @@ def adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label, chunk_ta def adaptive_chunked_choosers_and_alts( whale: workflow.Whale, - choosers, - alternatives, - chunk_size, - trace_label, - chunk_tag=None, + choosers: pd.DataFrame, + alternatives: pd.DataFrame, + trace_label: str, + chunk_tag: str = None, ): """ generator to iterate over choosers and alternatives in chunk_size chunks @@ -1262,6 +1267,7 @@ def adaptive_chunked_choosers_and_alts( f"with {num_choosers} choosers and {num_alternatives} alternatives" ) + chunk_size = whale.settings.chunk_size chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() assert (rows_per_chunk > 0) and (rows_per_chunk <= num_choosers) @@ -1320,7 +1326,7 @@ def adaptive_chunked_choosers_and_alts( def adaptive_chunked_choosers_by_chunk_id( - whale: workflow.Whale, choosers, chunk_size, trace_label, chunk_tag=None + whale: workflow.Whale, choosers: 
pd.DataFrame, trace_label: str, chunk_tag=None ): # generator to iterate over choosers in chunk_size chunks # like chunked_choosers but based on chunk_id field rather than dataframe length @@ -1344,6 +1350,7 @@ def adaptive_chunked_choosers_by_chunk_id( num_choosers = choosers["chunk_id"].max() + 1 assert num_choosers > 0 + chunk_size = whale.settings.chunk_size chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index f10cb389c..eec7e5c49 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -575,9 +575,7 @@ def interaction_sample( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - whale, choosers, chunk_size, trace_label, chunk_tag - ): + ) in chunk.adaptive_chunked_choosers(whale, choosers, trace_label, chunk_tag): choices = _interaction_sample( whale, diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 264b85e07..c6045872c 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -426,7 +426,7 @@ def interaction_sample_simulate( chunk_trace_label, chunk_sizer, ) in chunk.adaptive_chunked_choosers_and_alts( - whale, choosers, alternatives, chunk_size, trace_label, chunk_tag + whale, choosers, alternatives, trace_label, chunk_tag ): choices = _interaction_sample_simulate( whale, diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index d6ea27e4c..3a1450813 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -938,7 +938,7 @@ def interaction_simulate( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label): + ) in chunk.adaptive_chunked_choosers(whale, choosers, trace_label): choices = _interaction_simulate( chooser_chunk, diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index 7860d63d5..20322de83 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -216,11 +216,11 @@ def utils_to_probs( def make_choices( whale: workflow.Whale, - probs, - trace_label=None, + probs: pd.DataFrame, + trace_label: str = None, trace_choosers=None, allow_bad_probs=False, -): +) -> tuple[pd.Series, pd.Series]: """ Make choices for each chooser from among a set of alternatives. 
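For reference, the simulate.py changes that follow formalize the custom_chooser hook as a
CustomChooser_T callable: it receives (whale, probs, choosers, spec, trace_label) and returns a
(choices, rands) pair, matching the updated participants_chooser signature earlier in this patch.
A minimal sketch of a conforming chooser is shown below; the passthrough_chooser name is
illustrative only (not part of the patch), and the body simply delegates to the logit.make_choices
API shown in the hunk above.

    import pandas as pd

    from activitysim.core import logit, workflow


    def passthrough_chooser(
        whale: workflow.Whale,
        probs: pd.DataFrame,
        choosers: pd.DataFrame,
        spec: pd.DataFrame,
        trace_label: str,
    ) -> tuple[pd.Series, pd.Series]:
        # no custom re-simulation logic here: delegate to the standard monte-carlo
        # choice maker, returning per-chooser choices and the random draws used
        choices, rands = logit.make_choices(whale, probs, trace_label=trace_label)
        return choices, rands

Any callable with this shape can be supplied as custom_chooser to the simple_simulate entry
points refactored below.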
diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index ebff82726..b1b111f3a 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -7,6 +7,7 @@ from builtins import range from collections import OrderedDict from datetime import timedelta +from typing import Callable import numpy as np import pandas as pd @@ -31,6 +32,11 @@ logger = logging.getLogger(__name__) +CustomChooser_T = Callable[ + [workflow.Whale, pd.DataFrame, pd.DataFrame, pd.DataFrame, str], + tuple[pd.Series, pd.Series], +] + def random_rows(whale: workflow.Whale, df, n): # only sample if df has more than n rows @@ -1067,7 +1073,7 @@ def eval_mnl( choosers, spec, locals_d, - custom_chooser, + custom_chooser: CustomChooser_T, estimator, log_alt_losers=False, want_logsums=False, @@ -1166,9 +1172,7 @@ def eval_mnl( ) if custom_chooser: - choices, rands = custom_chooser( - probs=probs, choosers=choosers, spec=spec, trace_label=trace_label - ) + choices, rands = custom_chooser(whale, probs, choosers, spec, trace_label) else: choices, rands = logit.make_choices(whale, probs, trace_label=trace_label) @@ -1190,13 +1194,15 @@ def eval_nl( spec, nest_spec, locals_d, - custom_chooser, + custom_chooser: CustomChooser_T, estimator, log_alt_losers=False, want_logsums=False, trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer: chunk.ChunkSizer, ): """ Run a nested-logit simulation for when the model spec does not involve alternative @@ -1373,7 +1379,7 @@ def _simple_simulate( nest_spec, skims=None, locals_d=None, - custom_chooser=None, + custom_chooser: CustomChooser_T = None, log_alt_losers=False, want_logsums=False, estimator=None, @@ -1409,7 +1415,7 @@ def _simple_simulate( locals_d : Dict This is a dictionary of local variables that will be the environment for an evaluation of an expression that begins with @ - custom_chooser : Estimator object + custom_chooser : CustomChooser_T estimator : function(df, label, table_name) called to report intermediate table results (used for estimation) @@ -1492,7 +1498,6 @@ def simple_simulate( nest_spec, skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, log_alt_losers=False, want_logsums=False, @@ -1518,7 +1523,7 @@ def simple_simulate( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers(whale, choosers, chunk_size, trace_label): + ) in chunk.adaptive_chunked_choosers(whale, choosers, trace_label): choices = _simple_simulate( whale, chooser_chunk, @@ -1573,9 +1578,7 @@ def simple_simulate_by_chunk_id( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers_by_chunk_id( - whale, choosers, chunk_size, trace_label - ): + ) in chunk.adaptive_chunked_choosers_by_chunk_id(whale, choosers, trace_label): choices = _simple_simulate( whale, chooser_chunk, @@ -1589,11 +1592,12 @@ def simple_simulate_by_chunk_id( estimator=estimator, trace_label=chunk_trace_label, trace_choice_name=trace_choice_name, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1744,7 +1748,7 @@ def eval_nl_logsums( locals_d, trace_label=None, *, - chunk_sizer, + chunk_sizer: chunk.ChunkSizer, ): """ like eval_nl except return logsums instead of making choices @@ -1892,9 +1896,7 @@ def simple_simulate_logsums( chooser_chunk, chunk_trace_label, chunk_sizer, - ) in chunk.adaptive_chunked_choosers( - 
whale, choosers, chunk_size, trace_label, chunk_tag - ): + ) in chunk.adaptive_chunked_choosers(whale, choosers, trace_label, chunk_tag): logsums = _simple_simulate_logsums( whale, chooser_chunk, diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index c4d7b1b1f..a56bb542c 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -82,8 +82,13 @@ def test_simple_simulate_chunked(data, spec): inject.add_injectable("settings", {"check_for_variability": False}) + # whale -. set chunk_size as 2 + choices = simulate.simple_simulate( - whale, choosers=data, spec=spec, nest_spec=None, chunk_size=2 + whale, + choosers=data, + spec=spec, + nest_spec=None, ) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) diff --git a/activitysim/core/timetable.py b/activitysim/core/timetable.py index c70d80893..6b41eab7b 100644 --- a/activitysim/core/timetable.py +++ b/activitysim/core/timetable.py @@ -352,7 +352,7 @@ def __init__(self, windows_df, tdd_alts_df, table_name=None): self.checkpoint_df = None # series to map window row index value to window row's ordinal index - from ..core.fast_mapping import FastMapping + from activitysim.core.fast_mapping import FastMapping self.window_row_ix = FastMapping( pd.Series(list(range(len(windows_df.index))), index=windows_df.index) @@ -632,7 +632,7 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): assert len(window_row_ids) == len(periods) trace_label = "tt.adjacent_window_run_length" - with chunk.chunk_log(trace_label, settings=whale.settings): + with chunk.chunk_log(trace_label, settings=whale.settings) as chunk_sizer: available_run_length = _available_run_length_2( self.windows, self.window_row_ix._mapper, diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py index a4905faab..856b78bf0 100644 --- a/activitysim/core/workflow/state.py +++ b/activitysim/core/workflow/state.py @@ -1084,7 +1084,7 @@ def extend_table(self, table_name, df, axis=0): assert axis in [0, 1] if self.is_table(table_name): - table_df = self.get_table(table_name) + table_df = self.get_dataframe(table_name) if axis == 0: # don't expect indexes to overlap From dab2d93d70287fdd99d715dada4a5b236683924f Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 13 Feb 2023 09:29:33 -0600 Subject: [PATCH 011/419] refactor --- .../abm/models/atwork_subtour_destination.py | 1 - .../abm/models/disaggregate_accessibility.py | 1 - .../abm/models/joint_tour_destination.py | 12 +-- .../abm/models/joint_tour_participation.py | 1 - .../abm/models/non_mandatory_destination.py | 1 - activitysim/abm/models/trip_destination.py | 1 + .../abm/models/util/tour_destination.py | 34 +++--- activitysim/abm/models/util/tour_od.py | 100 ++++++++++-------- .../abm/tables/disaggregate_accessibility.py | 6 +- activitysim/core/simulate.py | 18 ++-- 10 files changed, 89 insertions(+), 86 deletions(-) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 7d423faa4..61d940a77 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -67,7 +67,6 @@ def atwork_subtour_destination( model_settings, network_los, estimator, - chunk_size, trace_label, ) diff --git a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index 96954b7ba..daafd5fce 100644 --- 
a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -653,7 +653,6 @@ def get_disaggregate_logsums( model_settings=model_settings, network_los=network_los, estimator=estimator, - chunk_size=chunk_size, trace_label=trace_label, skip_choice=True, ) diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index f8b9460a8..04e7fd7e3 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -4,7 +4,7 @@ import pandas as pd -from activitysim.core import config, inject, tracing, workflow +from activitysim.core import config, inject, los, tracing, workflow from activitysim.core.util import assign_in_place from .util import estimation, tour_destination @@ -15,10 +15,10 @@ @workflow.step def joint_tour_destination( whale: workflow.Whale, - tours, - persons_merged, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, households_merged, - network_los, + network_los: los.Network_LOS, chunk_size, ): """ @@ -41,11 +41,8 @@ def joint_tour_destination( ) # choosers are tours - in a sense tours are choosing their destination - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] - persons_merged = persons_merged.to_frame() - # - if no joint tours if joint_tours.shape[0] == 0: tracing.no_results("joint_tour_destination") @@ -72,7 +69,6 @@ def joint_tour_destination( model_settings, network_los, estimator, - chunk_size, trace_label, ) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 08f3961b0..0c1c96803 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -353,7 +353,6 @@ def joint_tour_participation( spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="participation", custom_chooser=participants_chooser, diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index 5641a5dc7..49212e0fb 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -81,7 +81,6 @@ def non_mandatory_tour_destination( model_settings, network_los, estimator, - chunk_size, trace_label, ) diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index ef9f69e93..76fe3d6a9 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -833,6 +833,7 @@ def trip_destination_simulate( chunk_tag = "trip_destination.simulate" spec = simulate.spec_for_segment( + whale, model_settings, spec_id="DESTINATION_SPEC", segment_name=primary_purpose, diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index 0ec399cb8..0f2a50cb2 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -16,16 +16,16 @@ DUMP = False -class SizeTermCalculator(object): +class SizeTermCalculator: """ convenience object to provide size_terms for a selector (e.g. non_mandatory) for various segments (e.g. 
tour_type or purpose) returns size terms for specified segment in df or series form """ - def __init__(self, size_term_selector): + def __init__(self, whale: workflow.Whale, size_term_selector): # do this once so they can request size_terms for various segments (tour_type or purpose) - land_use = inject.get_table("land_use") + land_use = whale.get_dataframe("land_use") size_terms = whale.get_injectable("size_terms") self.destination_size_terms = tour_destination_size_terms( land_use, size_terms, size_term_selector @@ -57,11 +57,6 @@ def dest_size_terms_df(self, segment_name, trace_label): return size_terms - # def dest_size_terms_series(self, segment_name): - # # return size terms as as series - # # convenient (and no copy overhead) if reindexing and assigning into alts column - # return self.destination_size_terms[segment_name] - def _destination_sample( whale: workflow.Whale, @@ -78,6 +73,7 @@ def _destination_sample( zone_layer=None, ): model_spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -694,6 +690,7 @@ def run_destination_simulate( chunk_tag = "tour_destination.simulate" model_spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SPEC", segment_name=spec_segment_name, @@ -788,18 +785,19 @@ def run_destination_simulate( def run_tour_destination( whale: workflow.Whale, - tours, - persons_merged, - want_logsums, - want_sample_table, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + want_logsums: bool, + want_sample_table: bool, model_settings, - network_los, + network_los: los.Network_LOS, estimator, - chunk_size, trace_label, skip_choice=False, ): - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) + size_term_calculator = SizeTermCalculator( + whale, model_settings["SIZE_TERM_SELECTOR"] + ) # maps segment names to compact (integer) ids segments = model_settings["SEGMENTS"] @@ -842,7 +840,7 @@ def run_tour_destination( network_los, segment_destination_size_terms, estimator, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "sample"), ) @@ -855,7 +853,7 @@ def run_tour_destination( location_sample_df, model_settings, network_los, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "logsums"), ) @@ -872,7 +870,7 @@ def run_tour_destination( network_los=network_los, destination_size_terms=segment_destination_size_terms, estimator=estimator, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "simulate"), skip_choice=skip_choice, ) diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 68434a501..5a17227f5 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -23,6 +23,7 @@ from . import logsums as logsum from . import trip +from .tour_destination import SizeTermCalculator logger = logging.getLogger(__name__) DUMP = False @@ -129,6 +130,7 @@ def _od_sample( trace_label, ): model_spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -637,52 +639,52 @@ def od_presample( return maz_choices -class SizeTermCalculator(object): - """ - convenience object to provide size_terms for a selector (e.g. - non_mandatory) for various segments (e.g. 
tour_type or purpose) - returns size terms for specified segment in df or series form. - """ - - def __init__(self, size_term_selector): - # do this once so they can request size_terms for various segments (tour_type or purpose) - land_use = inject.get_table("land_use") - self.land_use = land_use - size_terms = whale.get_injectable("size_terms") - self.destination_size_terms = tour_destination_size_terms( - self.land_use, size_terms, size_term_selector - ) - - assert not self.destination_size_terms.isna().any(axis=None) - - def omnibus_size_terms_df(self): - return self.destination_size_terms - - def dest_size_terms_df(self, segment_name, trace_label): - # return size terms as df with one column named 'size_term' - # convenient if creating or merging with alts - - size_terms = self.destination_size_terms[[segment_name]].copy() - size_terms.columns = ["size_term"] - - # FIXME - no point in considering impossible alternatives (where dest size term is zero) - logger.debug( - f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " - f"of {len(size_terms)} rows where size_term is zero for {segment_name}" - ) - size_terms = size_terms[size_terms.size_term > 0] - - if len(size_terms) == 0: - logger.warning( - f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}" - ) - - return size_terms - - def dest_size_terms_series(self, segment_name): - # return size terms as as series - # convenient (and no copy overhead) if reindexing and assigning into alts column - return self.destination_size_terms[segment_name] +# class SizeTermCalculatorOD: # class SizeTermCalculator +# """ +# convenience object to provide size_terms for a selector (e.g. +# non_mandatory) for various segments (e.g. tour_type or purpose) +# returns size terms for specified segment in df or series form. 
+# """ +# +# def __init__(self, size_term_selector): +# # do this once so they can request size_terms for various segments (tour_type or purpose) +# land_use = inject.get_table("land_use") +# self.land_use = land_use +# size_terms = whale.get_injectable("size_terms") +# self.destination_size_terms = tour_destination_size_terms( +# self.land_use, size_terms, size_term_selector +# ) +# +# assert not self.destination_size_terms.isna().any(axis=None) +# +# def omnibus_size_terms_df(self): +# return self.destination_size_terms +# +# def dest_size_terms_df(self, segment_name, trace_label): +# # return size terms as df with one column named 'size_term' +# # convenient if creating or merging with alts +# +# size_terms = self.destination_size_terms[[segment_name]].copy() +# size_terms.columns = ["size_term"] +# +# # FIXME - no point in considering impossible alternatives (where dest size term is zero) +# logger.debug( +# f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " +# f"of {len(size_terms)} rows where size_term is zero for {segment_name}" +# ) +# size_terms = size_terms[size_terms.size_term > 0] +# +# if len(size_terms) == 0: +# logger.warning( +# f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}" +# ) +# +# return size_terms +# +# def dest_size_terms_series(self, segment_name): +# # return size terms as as series +# # convenient (and no copy overhead) if reindexing and assigning into alts column +# return self.destination_size_terms[segment_name] def run_od_sample( @@ -697,6 +699,7 @@ def run_od_sample( trace_label, ): model_spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -947,6 +950,7 @@ def run_od_simulate( """ model_spec = simulate.spec_for_segment( + whale, model_settings, spec_id="SPEC", segment_name=spec_segment_name, @@ -1050,7 +1054,9 @@ def run_tour_od( trace_hh_id, trace_label, ): - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) + size_term_calculator = SizeTermCalculator( + whale, model_settings["SIZE_TERM_SELECTOR"] + ) preprocessor_settings = model_settings.get("preprocessor", None) origin_col_name = model_settings["ORIG_COL_NAME"] diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index 6e57f9b7f..66cace2e2 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -13,7 +13,9 @@ logger = logging.getLogger(__name__) -def find_nearest_accessibility_zone(choosers, accessibility_df, method="skims"): +def find_nearest_accessibility_zone( + whale: workflow.Whale, choosers, accessibility_df, method="skims" +): """ Matches choosers zone to the nearest accessibility zones. 
Can be achieved by querying the skims or by nearest neighbor of centroids @@ -173,7 +175,7 @@ def disaggregate_accessibility(whale: workflow.Whale): # Note that from here on the 'home_zone_id' is the matched name if "nearest_accessibility_zone_id" not in persons_merged_df.columns: persons_merged_df = find_nearest_accessibility_zone( - persons_merged_df, proto_accessibility_df, nearest_method + whale, persons_merged_df, proto_accessibility_df, nearest_method ) # Copy home_zone_id in proto-table to match the temporary 'nearest_zone_id' diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index b1b111f3a..ca848ac54 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -7,11 +7,12 @@ from builtins import range from collections import OrderedDict from datetime import timedelta -from typing import Callable +from typing import Callable, Optional import numpy as np import pandas as pd +from activitysim.abm.models.util.estimation import Estimator from activitysim.core import ( assign, chunk, @@ -180,8 +181,12 @@ def read_model_coefficients( @workflow.func def spec_for_segment( - whale: workflow.Whale, model_settings, spec_id, segment_name, estimator -): + whale: workflow.Whale, + model_settings, + spec_id: str, + segment_name: str, + estimator: Optional[Estimator], +) -> pd.DataFrame: """ Select spec for specified segment from omnibus spec containing columns for each segment @@ -409,8 +414,8 @@ def eval_coefficients( whale: workflow.Whale, spec: pd.DataFrame, coefficients: dict | pd.DataFrame, - estimator, -): + estimator: Optional[Estimator], +) -> pd.DataFrame: spec = spec.copy() # don't clobber input spec if isinstance(coefficients, pd.DataFrame): @@ -1560,7 +1565,6 @@ def simple_simulate_by_chunk_id( nest_spec, skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, log_alt_losers=False, want_logsums=False, @@ -1571,7 +1575,7 @@ def simple_simulate_by_chunk_id( """ chunk_by_chunk_id wrapper for simple_simulate """ - + choices = None result_list = [] for ( i, From f599888d1e6f61b71d1268248808a7694462dffc Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 13 Feb 2023 10:06:39 -0600 Subject: [PATCH 012/419] refactoring --- .../abm/models/atwork_subtour_destination.py | 12 ++++---- .../abm/models/atwork_subtour_mode_choice.py | 11 +++++--- .../abm/models/atwork_subtour_scheduling.py | 7 ++--- .../abm/models/disaggregate_accessibility.py | 2 +- .../abm/models/joint_tour_scheduling.py | 11 ++++---- .../abm/models/non_mandatory_destination.py | 9 +++--- .../models/non_mandatory_tour_frequency.py | 21 +++++++------- .../abm/models/parking_location_choice.py | 19 +++++++------ activitysim/abm/models/stop_frequency.py | 7 ++--- activitysim/abm/models/summarize.py | 10 +------ .../abm/models/telecommute_frequency.py | 7 ++--- activitysim/abm/models/tour_mode_choice.py | 14 ++++++---- .../models/tour_scheduling_probabilistic.py | 28 ++++++------------- activitysim/abm/models/trip_destination.py | 26 ++++++----------- activitysim/abm/models/trip_scheduling.py | 1 - activitysim/abm/models/util/mode.py | 3 +- .../models/util/probabilistic_scheduling.py | 4 +-- .../abm/models/util/tour_destination.py | 13 +++------ .../models/util/vectorize_tour_scheduling.py | 10 +++++-- activitysim/abm/models/work_from_home.py | 5 ++-- activitysim/abm/tables/vehicles.py | 10 +++---- 21 files changed, 99 insertions(+), 131 deletions(-) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 
61d940a77..0debddf82 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -2,8 +2,10 @@ # See full license in LICENSE.txt. import logging +import pandas as pd + from activitysim.abm.models.util import estimation, tour_destination -from activitysim.core import config, inject, tracing, workflow +from activitysim.core import config, inject, los, tracing, workflow from activitysim.core.util import assign_in_place logger = logging.getLogger(__name__) @@ -12,7 +14,10 @@ @workflow.step def atwork_subtour_destination( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size + whale: workflow.Whale, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, ): trace_label = "atwork_subtour_destination" model_settings_file_name = "atwork_subtour_destination.yaml" @@ -36,9 +41,6 @@ def atwork_subtour_destination( whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - persons_merged = persons_merged.to_frame() - - tours = tours.to_frame() subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index ccc1fba03..bc342b0c3 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -15,7 +15,11 @@ @workflow.step def atwork_subtour_mode_choice( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size + whale: workflow.Whale, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los, + chunk_size, ): """ At-work subtour mode choice simulate @@ -31,7 +35,6 @@ def atwork_subtour_mode_choice( logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "tour_mode" - tours = tours.to_frame() subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours @@ -41,7 +44,7 @@ def atwork_subtour_mode_choice( subtours_merged = pd.merge( subtours, - persons_merged.to_frame(), + persons_merged, left_on="person_id", right_index=True, how="left", @@ -130,6 +133,7 @@ def atwork_subtour_mode_choice( # FIXME run_tour_mode_choice_simulate writes choosers post-annotation choices_df = run_tour_mode_choice_simulate( + whale, subtours_merged, tour_purpose="atwork", model_settings=model_settings, @@ -139,7 +143,6 @@ def atwork_subtour_mode_choice( skims=skims, constants=constants, estimator=estimator, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="tour_mode_choice", ) diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index 5a7bde235..9a8567eb0 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -22,8 +22,8 @@ @workflow.step def atwork_subtour_scheduling( whale: workflow.Whale, - tours, - persons_merged, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, tdd_alts, skim_dict, chunk_size, @@ -52,8 +52,6 @@ def atwork_subtour_scheduling( whale, model_spec, coefficients_df, estimator ) - persons_merged = persons_merged.to_frame() - logger.info("Running %s with %d tours", trace_label, len(subtours)) # preprocessor @@ -78,6 +76,7 @@ def atwork_subtour_scheduling( # we don't need to update timetable because subtours are scheduled inside work trip windows choices = vectorize_subtour_scheduling( + whale, parent_tours, subtours, persons_merged, diff --git 
a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index daafd5fce..efaf84bcc 100644 --- a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -691,7 +691,7 @@ def compute_disaggregate_accessibility( # Re-Register tables in this step, necessary for multiprocessing for tablename in ["proto_households", "proto_persons", "proto_tours"]: - df = inject.get_table(tablename).to_frame() + df = whale.get_dataframe(tablename) traceables = whale.get_injectable("traceable_tables") if tablename not in whale.get_rn_generator().channels: whale.get_rn_generator().add_channel(tablename, df) diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index 7e8bec68b..6e4750884 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -16,7 +16,10 @@ @workflow.step def joint_tour_scheduling( - whale: workflow.Whale, tours, persons_merged, tdd_alts, chunk_size + whale: workflow.Whale, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + tdd_alts: pd.DataFrame, ): """ This model predicts the departure time and duration of each joint tour @@ -37,8 +40,6 @@ def joint_tour_scheduling( # use inject.get_table as this won't exist if there are no joint_tours joint_tour_participants = whale.get_dataframe("joint_tour_participants") - persons_merged = persons_merged.to_frame() - logger.info("Running %s with %d joint tours", trace_label, joint_tours.shape[0]) # it may seem peculiar that we are concerned with persons rather than households @@ -96,7 +97,7 @@ def joint_tour_scheduling( spec=model_spec, model_settings=model_settings, estimator=estimator, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label=trace_label, sharrow_skip=sharrow_skip, ) @@ -123,7 +124,7 @@ def joint_tour_scheduling( nth_participants.person_id, reindex(choices, nth_participants.tour_id) ) - timetable.replace_table() + timetable.replace_table(whale) # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index 49212e0fb..becee347a 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -14,7 +14,10 @@ @workflow.step def non_mandatory_tour_destination( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size + whale: workflow.Whale, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los, ): """ Given the tour generation from the above, each tour needs to have a @@ -35,10 +38,6 @@ def non_mandatory_tour_destination( whale.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - tours = tours.to_frame() - - persons_merged = persons_merged.to_frame() - # choosers are tours - in a sense tours are choosing their destination non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 52b2b9718..8443d959b 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -31,7 +31,11 @@ def extension_probs(whale: workflow.Whale): def extend_tour_counts( - whale: workflow.Whale, persons, 
tour_counts, alternatives, trace_label + whale: workflow.Whale, + persons: pd.DataFrame, + tour_counts: pd.DataFrame, + alternatives, + trace_label: str, ): """ extend tour counts based on a probability table @@ -50,7 +54,6 @@ def extend_tour_counts( alternatives alternatives from nmtv interaction_simulate only need this to know max possible frequency for a tour type - trace_hh_id trace_label Returns @@ -67,7 +70,6 @@ def extend_tour_counts( """ assert tour_counts.index.name == persons.index.name - trace_hh_id = whale.settings.trace_hh_id PROBABILITY_COLUMNS = ["0_tours", "1_tours", "2_tours"] JOIN_COLUMNS = ["ptype", "has_mandatory_tour", "has_joint_tour"] @@ -82,7 +84,7 @@ def extend_tour_counts( logger.info("extend_tour_counts - no persons eligible for tour_count extension") return tour_counts - have_trace_targets = trace_hh_id and tracing.has_trace_targets( + have_trace_targets = whale.settings.trace_hh_id and tracing.has_trace_targets( whale, extend_tour_counts ) @@ -137,7 +139,7 @@ def extend_tour_counts( @workflow.step def non_mandatory_tour_frequency( - whale: workflow.Whale, persons, persons_merged, chunk_size, trace_hh_id + whale: workflow.Whale, persons: pd.DataFrame, persons_merged: pd.DataFrame ): """ This model predicts the frequency of making non-mandatory trips @@ -159,7 +161,7 @@ def non_mandatory_tour_frequency( alternatives["tot_tours"] = alternatives.sum(axis=1) # filter based on results of CDAP - choosers = persons_merged.to_frame() + choosers = persons_merged choosers = choosers[choosers.cdap_activity.isin(["M", "N"])] # - preprocessor @@ -243,7 +245,7 @@ def non_mandatory_tour_frequency( spec=segment_spec, log_alt_losers=log_alt_losers, locals_d=constants, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label="non_mandatory_tour_frequency.%s" % segment_name, trace_choice_name="non_mandatory_tour_frequency", estimator=estimator, @@ -266,7 +268,6 @@ def non_mandatory_tour_frequency( choices = pd.concat(choices_list).sort_index() # add non_mandatory_tour_frequency column to persons - persons = persons.to_frame() # we expect there to be an alt with no tours - which we can use to backfill non-travelers no_tours_alt = (alternatives.sum(axis=1) == 0).index[0] # need to reindex as we only handled persons with cdap_activity in ['M', 'N'] @@ -299,10 +300,10 @@ def non_mandatory_tour_frequency( # - extend_tour_counts - probabalistic extended_tour_counts = extend_tour_counts( + whale, choosers, modeled_tour_counts.copy(), alternatives, - trace_hh_id, tracing.extend_trace_label(trace_label, "extend_tour_counts"), ) @@ -402,7 +403,7 @@ def non_mandatory_tour_frequency( value_counts=True, ) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df( non_mandatory_tours, label="non_mandatory_tour_frequency.non_mandatory_tours", diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 224a6ebaa..559991332 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -110,7 +110,9 @@ def parking_destination_simulate( choices - pandas.Series destination alt chosen """ - trace_label = tracing.extend_trace_label(trace_label, "trip_destination_simulate") + trace_label = tracing.extend_trace_label( + trace_label, "parking_destination_simulate" + ) spec = get_spec_for_segment(model_settings, "SPECIFICATION", segment_name) @@ -119,7 +121,7 @@ def parking_destination_simulate( alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] - 
logger.info("Running trip_destination_simulate with %d trips", len(trips)) + logger.info("Running parking_destination_simulate with %d trips", len(trips)) locals_dict = config.get_model_constants(model_settings).copy() locals_dict.update(skims) @@ -175,7 +177,6 @@ def choose_parking_location( destination_sample.index = np.repeat(trips.index.values, len(alternatives)) destination_sample.index.name = trips.index.name - # # - trip_destination_simulate destinations = parking_destination_simulate( segment_name=segment_name, trips=trips, @@ -278,9 +279,9 @@ def run_parking_destination( @workflow.step def parking_location( whale: workflow.Whale, - trips, - trips_merged, - land_use, + trips: pd.DataFrame, + trips_merged: pd.DataFrame, + land_use: pd.DataFrame, network_los, chunk_size, ): @@ -298,9 +299,9 @@ def parking_location( preprocessor_settings = model_settings.get("PREPROCESSOR", None) - trips_df = trips.to_frame() - trips_merged_df = trips_merged.to_frame() - land_use_df = land_use.to_frame() + trips_df = trips + trips_merged_df = trips_merged + land_use_df = land_use proposed_trip_departure_period = model_settings["TRIP_DEPARTURE_PERIOD"] # TODO: the number of skim time periods should be more readily available than this diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index d714facf5..102f4fb78 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -14,11 +14,10 @@ @workflow.step def stop_frequency( whale: workflow.Whale, - tours, - tours_merged, + tours: pd.DataFrame, + tours_merged: pd.DataFrame, stop_frequency_alts, network_los, - chunk_size, ): """ stop frequency model @@ -52,8 +51,6 @@ def stop_frequency( model_settings = whale.filesystem.read_model_settings(model_settings_file_name) - tours = tours.to_frame() - tours_merged = tours_merged.to_frame() assert not tours_merged.household_id.isnull().any() assert not (tours_merged.origin == -1).any() assert not (tours_merged.destination == -1).any() diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index 9ab09dcf8..d63e218ab 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -207,7 +207,6 @@ def summarize( households: pd.DataFrame, households_merged: pd.DataFrame, trips: pd.DataFrame, - tours: pd.DataFrame, tours_merged: pd.DataFrame, land_use: pd.DataFrame, ): @@ -238,14 +237,7 @@ def summarize( ) # Load dataframes from pipeline - persons = persons.to_frame() - persons_merged = persons_merged.to_frame() - households = households.to_frame() - households_merged = households_merged.to_frame() - trips = trips.to_frame() - tours = tours_merged.to_frame() - tours_merged = tours_merged.to_frame() - land_use = land_use.to_frame() + tours = tours_merged # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index 2827f3bd1..c66b9ddc7 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -12,7 +12,7 @@ @workflow.step def telecommute_frequency( - whale: workflow.Whale, persons_merged, persons, chunk_size, trace_hh_id + whale: workflow.Whale, persons_merged: pd.DataFrame, persons: pd.DataFrame ): """ This model predicts the frequency of telecommute for a person (worker) who @@ -25,7 +25,7 @@ def telecommute_frequency( trace_label = "telecommute_frequency" model_settings_file_name = 
"telecommute_frequency.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) @@ -85,7 +85,6 @@ def telecommute_frequency( estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["telecommute_frequency"] = ( choices.reindex(persons.index).fillna("").astype(str) ) @@ -96,5 +95,5 @@ def telecommute_frequency( "telecommute_frequency", persons.telecommute_frequency, value_counts=True ) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index e7f5f19cc..7e9885b4a 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -175,7 +175,10 @@ def get_trip_mc_logsums_for_all_modes( @workflow.step def tour_mode_choice_simulate( - whale: workflow.Whale, tours, persons_merged, network_los, chunk_size, trace_hh_id + whale: workflow.Whale, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los, ): """ Tour mode choice simulate @@ -188,14 +191,13 @@ def tour_mode_choice_simulate( mode_column_name = "tour_mode" segment_column_name = "tour_purpose" - primary_tours = tours.to_frame() + primary_tours = tours assert not (primary_tours.tour_category == "atwork").any() logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0])) tracing.print_summary("tour_types", primary_tours.tour_type, value_counts=True) - persons_merged = persons_merged.to_frame() primary_tours_merged = pd.merge( primary_tours, persons_merged, @@ -328,6 +330,7 @@ def tour_mode_choice_simulate( assert tours_segment.index.name == "tour_id" choices_df = run_tour_mode_choice_simulate( + whale, tours_segment, tour_purpose, model_settings, @@ -337,7 +340,6 @@ def tour_mode_choice_simulate( skims=skims, constants=constants, estimator=estimator, - chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, tour_purpose), trace_choice_name="tour_mode_choice", ) @@ -396,7 +398,7 @@ def tour_mode_choice_simulate( assign_in_place(primary_tours, choices_df) # update tours table with mode choice (and optionally logsums) - all_tours = tours.to_frame() + all_tours = tours assign_in_place(all_tours, choices_df) if whale.is_table("school_escort_tours") & model_settings.get( @@ -414,7 +416,7 @@ def tour_mode_choice_simulate( if model_settings.get("annotate_tours"): annotate.annotate_tours(model_settings, trace_label) - if trace_hh_id: + if whale.settings.trace_hh_id: tracing.trace_df( primary_tours, label=tracing.extend_trace_label(trace_label, mode_column_name), diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index 014ef0a36..827c00c85 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -14,18 +14,17 @@ def run_tour_scheduling_probabilistic( whale: workflow.Whale, - tours_df, - scheduling_probs, - probs_join_cols, - depart_alt_base, - chunk_size, - trace_label, - trace_hh_id, + tours_df: pd.DataFrame, + scheduling_probs: pd.DataFrame, + probs_join_cols: str | list[str], + depart_alt_base: int, + trace_label: str, ): """Make probabilistic tour scheduling choices in chunks Parameters ---------- + whale: workflow.Whale tours_df : pandas.DataFrame table of 
tours scheduling_probs : pandas.DataFrame @@ -35,12 +34,8 @@ def run_tour_scheduling_probabilistic( depart_alt_base : int int to add to probs column index to get time period it represents. e.g. depart_alt_base = 5 means first column (column 0) represents 5 am - chunk_size : int - size of chooser chunks, set in main settings.yaml trace_label : str label to append to tracing logs and table names - trace_hh_id : int - households to trace Returns ------- @@ -64,7 +59,6 @@ def run_tour_scheduling_probabilistic( first_trip_in_leg=False, report_failed_trips=True, trace_label=chunk_trace_label, - trace_hh_id=trace_hh_id, trace_choice_col_name="depart_return", clip_earliest_latest=False, ) @@ -75,9 +69,7 @@ def run_tour_scheduling_probabilistic( @workflow.step -def tour_scheduling_probabilistic( - whale: workflow.Whale, tours, chunk_size, trace_hh_id -): +def tour_scheduling_probabilistic(whale: workflow.Whale, tours: pd.DataFrame): """Makes tour departure and arrival choices by sampling from a probability lookup table This model samples tour scheduling choices from an exogenously defined probability @@ -87,7 +79,7 @@ def tour_scheduling_probabilistic( Parameters ---------- - tours : orca.DataFrameWrapper + tours : DataFrame lazy-loaded table of tours chunk_size : int size of chooser chunks, defined in main settings.yaml @@ -105,7 +97,7 @@ def tour_scheduling_probabilistic( ) scheduling_probs = pd.read_csv(scheduling_probs_filepath) probs_join_cols = model_settings["PROBS_JOIN_COLS"] - tours_df = tours.to_frame() + tours_df = tours # trip_scheduling is a probabilistic model ane we don't support estimation, # but we do need to override choices in estimation mode @@ -124,9 +116,7 @@ def tour_scheduling_probabilistic( scheduling_probs, probs_join_cols, depart_alt_base, - chunk_size, trace_label, - trace_hh_id, ) # convert alt index choices to depart/return times diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 76fe3d6a9..5d9edb739 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -86,7 +86,6 @@ def _destination_sample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag, trace_label, zone_layer=None, @@ -155,7 +154,7 @@ def _destination_sample( spec=spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer=zone_layer, @@ -192,7 +191,6 @@ def destination_sample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, ) @@ -476,7 +474,6 @@ def destination_presample( skim_hotel, network_los, estimator, - chunk_size, trace_label, ): trace_label = tracing.extend_trace_label(trace_label, "presample") @@ -515,7 +512,6 @@ def destination_presample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer="taz", @@ -596,7 +592,6 @@ def trip_destination_sample( skim_hotel, network_los, estimator, - chunk_size, trace_label, ) @@ -673,15 +668,14 @@ def compute_ood_logsums( def compute_logsums( - whale, + whale: workflow.Whale, primary_purpose, - trips, + trips: pd.DataFrame, destination_sample, - tours_merged, + tours_merged: pd.DataFrame, model_settings, skim_hotel, - chunk_size, - trace_label, + trace_label: str, ): """ Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do it twice @@ -771,7 +765,7 @@ def compute_logsums( logsum_spec, od_skims, locals_dict, - 
chunk_size, + whale.settings.chunk_size, trace_label=tracing.extend_trace_label(trace_label, "od"), chunk_tag=chunk_tag, ) @@ -799,7 +793,7 @@ def compute_logsums( logsum_spec, dp_skims, locals_dict, - chunk_size, + whale.settings.chunk_size, trace_label=tracing.extend_trace_label(trace_label, "dp"), chunk_tag=chunk_tag, ) @@ -817,7 +811,6 @@ def trip_destination_simulate( size_term_matrix, skim_hotel, estimator, - chunk_size, trace_label, ): """ @@ -878,7 +871,7 @@ def trip_destination_simulate( zero_prob_choice_val=NO_DESTINATION, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, trace_choice_name="trip_dest", @@ -964,13 +957,11 @@ def choose_trip_destination( tours_merged=tours_merged, model_settings=model_settings, skim_hotel=skim_hotel, - chunk_size=chunk_size, trace_label=trace_label, ) t0 = print_elapsed_time("%s.compute_logsums" % trace_label, t0) - # - trip_destination_simulate destinations = trip_destination_simulate( whale, primary_purpose=primary_purpose, @@ -981,7 +972,6 @@ def choose_trip_destination( size_term_matrix=size_term_matrix, skim_hotel=skim_hotel, estimator=estimator, - chunk_size=chunk_size, trace_label=trace_label, ) diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 99f153674..76bdb6094 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -255,7 +255,6 @@ def schedule_trips_in_leg( depart_alt_base, first_trip_in_leg=first_trip_in_leg, report_failed_trips=is_last_iteration, - trace_hh_id=trace_hh_id, trace_label=nth_trace_label, ) diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index c185642c4..22076819c 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -93,7 +93,6 @@ def run_tour_mode_choice_simulate( skims, constants, estimator, - chunk_size, trace_label=None, trace_choice_name=None, ): @@ -149,7 +148,7 @@ def run_tour_mode_choice_simulate( nest_spec=nest_spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=trace_label, diff --git a/activitysim/abm/models/util/probabilistic_scheduling.py b/activitysim/abm/models/util/probabilistic_scheduling.py index 9ecea3ce3..782884cc9 100644 --- a/activitysim/abm/models/util/probabilistic_scheduling.py +++ b/activitysim/abm/models/util/probabilistic_scheduling.py @@ -230,7 +230,6 @@ def make_scheduling_choices( depart_alt_base, first_trip_in_leg, report_failed_trips, - trace_hh_id, trace_label, trace_choice_col_name="depart", clip_earliest_latest=True, @@ -254,7 +253,6 @@ def make_scheduling_choices( int to add to probs column index to get time period it represents. e.g. 
depart_alt_base = 5 means first column (column 0) represents 5 am report_failed_trips : bool - trace_hh_id trace_label Returns @@ -262,7 +260,7 @@ def make_scheduling_choices( choices: pd.Series time periods depart choices, one per trip (except for trips with zero probs) """ - + trace_hh_id = whale.settings.trace_hh_id choosers = pd.merge( choosers_df.reset_index(), probs_spec, on=probs_join_cols, how="left" ).set_index(choosers_df.index.name) diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index 0f2a50cb2..dedffbc14 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -60,16 +60,15 @@ def dest_size_terms_df(self, segment_name, trace_label): def _destination_sample( whale: workflow.Whale, - spec_segment_name, - choosers, + spec_segment_name: str, + choosers: pd.DataFrame, destination_size_terms, skims, estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag, - trace_label, + trace_label: str, zone_layer=None, ): model_spec = simulate.spec_for_segment( @@ -115,7 +114,7 @@ def _destination_sample( spec=model_spec, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer=zone_layer, @@ -167,7 +166,6 @@ def destination_sample( estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, ) @@ -462,7 +460,6 @@ def destination_presample( network_los, destination_size_terms, estimator, - chunk_size, trace_label, ): trace_label = tracing.extend_trace_label(trace_label, "presample") @@ -497,7 +494,6 @@ def destination_presample( estimator, model_settings, DEST_TAZ, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer="taz", @@ -569,7 +565,6 @@ def run_destination_sample( network_los, destination_size_terms, estimator, - chunk_size, trace_label, ) diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 6e65c6b1c..379f04a7e 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, config, expressions, inject, los, simulate +from activitysim.core import chunk, config, expressions, los, simulate from activitysim.core import timetable as tt from activitysim.core import tracing, workflow from activitysim.core.interaction_sample_simulate import interaction_sample_simulate @@ -550,7 +550,9 @@ def tdd_interaction_dataset( return alt_tdd -def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_label): +def run_alts_preprocessor( + whale: workflow.Whale, model_settings, alts, segment, locals_dict, trace_label +): """ run preprocessor on alts, as specified by ALTS_PREPROCESSOR in model_settings @@ -758,7 +760,7 @@ def _schedule_tours( logsum_tour_purpose # FIXME this is not always right - see note above ) alt_tdd = run_alts_preprocessor( - model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label + whale, model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label ) chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) @@ -998,6 +1000,7 @@ def vectorize_tour_scheduling( if RUN_ALTS_PREPROCESSOR_BEFORE_MERGE: locals_dict = {} alts = run_alts_preprocessor( + whale, model_settings, alts, spec_segment_name, @@ -1061,6 +1064,7 @@ def 
vectorize_tour_scheduling( def vectorize_subtour_scheduling( + whale: workflow.Whale, parent_tours, subtours, persons_merged, diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index 8b23ff39e..83d31c98d 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -11,7 +11,7 @@ @workflow.step -def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): +def work_from_home(whale: workflow.Whale, persons_merged, persons): """ This model predicts whether a person (worker) works from home. The output from this model is TRUE (if works from home) or FALSE (works away from home). @@ -22,7 +22,7 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): trace_label = "work_from_home" model_settings_file_name = "work_from_home.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged model_settings = whale.filesystem.read_model_settings(model_settings_file_name) chooser_filter_column_name = model_settings.get( "CHOOSER_FILTER_COLUMN_NAME", "is_worker" @@ -153,7 +153,6 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons, chunk_size): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["work_from_home"] = choices.reindex(persons.index).fillna(0).astype(bool) persons["is_out_of_home_worker"] = ( persons[chooser_filter_column_name] & ~persons["work_from_home"] diff --git a/activitysim/abm/tables/vehicles.py b/activitysim/abm/tables/vehicles.py index feaf05003..4c3680738 100644 --- a/activitysim/abm/tables/vehicles.py +++ b/activitysim/abm/tables/vehicles.py @@ -10,7 +10,7 @@ @workflow.table -def vehicles(whale: workflow.Whale, households): +def vehicles(whale: workflow.Whale, households: pd.DataFrame): """Creates the vehicles table and load it as an injectable This method initializes the `vehicles` table, where the number of rows @@ -18,7 +18,7 @@ def vehicles(whale: workflow.Whale, households): Parameters ---------- - households : orca.DataFrameWrapper + households : DataFrame Returns ------- @@ -26,9 +26,7 @@ def vehicles(whale: workflow.Whale, households): """ # initialize vehicles table - vehicles = households.to_frame().loc[ - households.index.repeat(households["auto_ownership"]) - ] + vehicles = households.loc[households.index.repeat(households["auto_ownership"])] vehicles = vehicles.reset_index()[["household_id"]] vehicles["vehicle_num"] = vehicles.groupby("household_id").cumcount() + 1 @@ -45,7 +43,7 @@ def vehicles(whale: workflow.Whale, households): return vehicles -@workflow.table +@workflow.temp_table def vehicles_merged( whale: workflow.Whale, vehicles: pd.DataFrame, households_merged: pd.DataFrame ): From 0e31934538ea041695eba20b211ffcfa35c0dd47 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 13 Feb 2023 11:19:55 -0600 Subject: [PATCH 013/419] refactoring --- activitysim/abm/models/accessibility.py | 2 +- .../abm/models/atwork_subtour_destination.py | 2 +- .../abm/models/atwork_subtour_frequency.py | 3 +- .../abm/models/atwork_subtour_mode_choice.py | 2 +- .../abm/models/atwork_subtour_scheduling.py | 10 +- activitysim/abm/models/auto_ownership.py | 2 +- activitysim/abm/models/free_parking.py | 2 +- activitysim/abm/models/initialize_tours.py | 2 +- .../abm/models/joint_tour_composition.py | 2 +- .../abm/models/joint_tour_destination.py | 2 +- .../abm/models/joint_tour_frequency.py | 4 +- .../abm/models/joint_tour_participation.py | 4 +- 
.../abm/models/joint_tour_scheduling.py | 2 +- activitysim/abm/models/location_choice.py | 10 +- .../abm/models/mandatory_scheduling.py | 4 +- .../abm/models/mandatory_tour_frequency.py | 4 +- .../abm/models/non_mandatory_destination.py | 2 +- .../abm/models/non_mandatory_scheduling.py | 4 +- .../models/non_mandatory_tour_frequency.py | 10 +- .../abm/models/parking_location_choice.py | 2 +- activitysim/abm/models/school_escorting.py | 2 +- activitysim/abm/models/stop_frequency.py | 9 +- activitysim/abm/models/summarize.py | 2 +- .../abm/models/telecommute_frequency.py | 2 +- activitysim/abm/models/tour_mode_choice.py | 2 +- activitysim/abm/models/tour_od_choice.py | 14 +- .../abm/models/transit_pass_ownership.py | 2 +- .../abm/models/transit_pass_subsidy.py | 11 +- .../abm/models/trip_departure_choice.py | 26 ++- activitysim/abm/models/trip_destination.py | 30 +-- activitysim/abm/models/trip_matrices.py | 2 +- activitysim/abm/models/trip_mode_choice.py | 23 ++- activitysim/abm/models/trip_purpose.py | 6 +- .../models/trip_purpose_and_destination.py | 2 +- activitysim/abm/models/util/cdap.py | 44 ++--- activitysim/abm/models/util/mode.py | 16 +- .../models/util/probabilistic_scheduling.py | 12 +- .../abm/models/util/tour_destination.py | 20 +- activitysim/abm/models/util/tour_od.py | 18 +- .../models/util/vectorize_tour_scheduling.py | 2 +- activitysim/abm/models/vehicle_allocation.py | 2 +- activitysim/abm/models/vehicle_type_choice.py | 2 +- activitysim/abm/models/work_from_home.py | 2 +- activitysim/abm/tables/households.py | 2 +- activitysim/abm/tables/persons.py | 8 +- activitysim/abm/tables/shadow_pricing.py | 2 +- activitysim/abm/tables/tours.py | 30 ++- activitysim/cli/run.py | 2 +- activitysim/core/chunk.py | 171 ++++++++++++------ activitysim/core/expressions.py | 7 +- activitysim/core/interaction_sample.py | 20 +- .../core/interaction_sample_simulate.py | 18 +- activitysim/core/interaction_simulate.py | 31 ++-- activitysim/core/pathbuilder.py | 2 +- activitysim/core/simulate.py | 46 ++--- activitysim/core/tracing.py | 20 +- activitysim/core/workflow/state.py | 84 ++++++++- docs/howitworks.rst | 6 +- 58 files changed, 474 insertions(+), 301 deletions(-) diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index c62fc4ed1..5c5951223 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -98,7 +98,7 @@ def compute_accessibilities_for_zones( df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging - tracing.trace_df( + whale.trace_df( df, label="accessibility", index_label="skim_offset", diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 0debddf82..a6f87bf8c 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -99,6 +99,6 @@ def atwork_subtour_destination( whale.extend_table(sample_table_name, save_sample_df) if whale.settings.trace_hh_id: - tracing.trace_df( + whale.trace_df( tours, label="atwork_subtour_destination", columns=["destination"] ) diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index 81f98cb11..bac9ac40e 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -50,7 +50,6 @@ def atwork_subtour_frequency(whale: workflow.Whale, 
tours, persons_merged, chunk ) # merge persons into work_tours - persons_merged = persons_merged.to_frame() work_tours = pd.merge( work_tours, persons_merged, left_on="person_id", right_index=True ) @@ -119,4 +118,4 @@ def atwork_subtour_frequency(whale: workflow.Whale, tours, persons_merged, chunk ) if trace_hh_id: - tracing.trace_df(tours, label="atwork_subtour_frequency.tours") + whale.trace_df(tours, label="atwork_subtour_frequency.tours") diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index bc342b0c3..8e9418dbb 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -200,7 +200,7 @@ def atwork_subtour_mode_choice( whale.add_table("tours", tours) if trace_hh_id: - tracing.trace_df( + whale.trace_df( tours[tours.tour_category == "atwork"], label=tracing.extend_trace_label(trace_label, mode_column_name), slicer="tour_id", diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index 9a8567eb0..c68ddbb7a 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -61,7 +61,7 @@ def atwork_subtour_scheduling( "od_skims": od_skim_wrapper, } expressions.annotate_preprocessors( - subtours, constants, skims, model_settings, trace_label + whale, subtours, constants, skims, model_settings, trace_label ) # parent_tours table with columns ['tour_id', 'tdd'] index = tour_id @@ -105,7 +105,7 @@ def atwork_subtour_scheduling( whale.add_table("tours", tours) if trace_hh_id: - tracing.trace_df( + whale.trace_df( tours[tours.tour_category == "atwork"], label="atwork_subtour_scheduling", slicer="person_id", @@ -117,12 +117,12 @@ def atwork_subtour_scheduling( subtours = tours[tours.tour_category == "atwork"] parent_tours = tours[tours.index.isin(subtours.parent_tour_id)] - tracing.dump_df(DUMP, subtours, trace_label, "sub_tours") - tracing.dump_df(DUMP, parent_tours, trace_label, "parent_tours") + whale.dump_df(DUMP, subtours, trace_label, "sub_tours") + whale.dump_df(DUMP, parent_tours, trace_label, "parent_tours") parent_tours["parent_tour_id"] = parent_tours.index subtours = pd.concat([parent_tours, subtours]) - tracing.dump_df( + whale.dump_df( DUMP, tt.tour_map( parent_tours, subtours, tdd_alts, persons_id_col="parent_tour_id" diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 9be6a1558..5c09b1d5c 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -73,4 +73,4 @@ def auto_ownership_simulate( ) if trace_hh_id: - tracing.trace_df(households, label="auto_ownership", warn_if_empty=True) + whale.trace_df(households, label="auto_ownership", warn_if_empty=True) diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index ca45956b0..35936decf 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -94,4 +94,4 @@ def free_parking( ) if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + whale.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index e12785ca0..68ea39a7e 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -132,4 +132,4 @@ def initialize_tours(whale: 
workflow.Whale, network_los, households, persons): raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id") if trace_hh_id: - tracing.trace_df(tours, label="initialize_tours", warn_if_empty=True) + whale.trace_df(tours, label="initialize_tours", warn_if_empty=True) diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index f7247b532..1ab7034fb 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -118,7 +118,7 @@ def joint_tour_composition( ) if whale.settings.trace_hh_id: - tracing.trace_df( + whale.trace_df( joint_tours, label="joint_tour_composition.joint_tours", slicer="household_id", diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 04e7fd7e3..b2c047ccd 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -97,4 +97,4 @@ def joint_tour_destination( whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df(joint_tours, label="joint_tour_destination.joint_tours") + whale.trace_df(joint_tours, label="joint_tour_destination.joint_tours") diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index ac75f0a56..411f15bc7 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -138,9 +138,9 @@ def joint_tour_frequency(whale: workflow.Whale, households, persons, chunk_size) ) if trace_hh_id: - tracing.trace_df(households, label="joint_tour_frequency.households") + whale.trace_df(households, label="joint_tour_frequency.households") - tracing.trace_df( + whale.trace_df( joint_tours, label="joint_tour_frequency.joint_tours", slicer="household_id" ) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 0c1c96803..b1646015c 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -426,6 +426,6 @@ def joint_tour_participation( annotate_jtp(whale, model_settings, trace_label) if trace_hh_id: - tracing.trace_df(participants, label="joint_tour_participation.participants") + whale.trace_df(participants, label="joint_tour_participation.participants") - tracing.trace_df(joint_tours, label="joint_tour_participation.joint_tours") + whale.trace_df(joint_tours, label="joint_tour_participation.joint_tours") diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index 6e4750884..81655e362 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -139,6 +139,6 @@ def joint_tour_scheduling( joint_tours = tours[tours.tour_category == "joint"] if trace_hh_id: - tracing.trace_df( + whale.trace_df( joint_tours, label="joint_tour_scheduling", slicer="household_id" ) diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index 6742de5b4..3c06876bc 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -768,7 +768,7 @@ def run_location_choice( estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.modeled_choices" ) - tracing.trace_df(choices_df, label=estimation_trace_label) + whale.trace_df(choices_df, 
label=estimation_trace_label) estimator.write_choices(choices_df.choice) choices_df.choice = estimator.get_survey_values( @@ -810,7 +810,7 @@ def run_location_choice( estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.survey_choices" ) - tracing.trace_df(choices_df, estimation_trace_label) + whale.trace_df(choices_df, estimation_trace_label) choices_list.append(choices_df) @@ -977,7 +977,7 @@ def iterate_location_choice( if locutor: spc.write_trace_files(whale, iteration) - if spc.use_shadow_pricing and spc.check_fit(iteration): + if spc.use_shadow_pricing and spc.check_fit(whale, iteration): logging.info( "%s converged after iteration %s" % ( @@ -1033,7 +1033,7 @@ def iterate_location_choice( whale.add_table("persons", persons_df) if whale.settings.trace_hh_id: - tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) + whale.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if "annotate_households" in model_settings: @@ -1047,7 +1047,7 @@ def iterate_location_choice( whale.add_table("households", households_df) if whale.settings.trace_hh_id: - tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) + whale.trace_df(households_df, label=trace_label, warn_if_empty=True) if logsum_column_name: tracing.print_summary( diff --git a/activitysim/abm/models/mandatory_scheduling.py b/activitysim/abm/models/mandatory_scheduling.py index 0a88d10bf..272d3ab08 100644 --- a/activitysim/abm/models/mandatory_scheduling.py +++ b/activitysim/abm/models/mandatory_scheduling.py @@ -63,7 +63,7 @@ def mandatory_tour_scheduling( # updated df for tracing mandatory_tours = tours[tours.tour_category == "mandatory"] - tracing.dump_df( + whale.dump_df( DUMP, tt.tour_map(persons_merged, mandatory_tours, tdd_alts), trace_label, @@ -71,7 +71,7 @@ def mandatory_tour_scheduling( ) if trace_hh_id: - tracing.trace_df( + whale.trace_df( mandatory_tours, label=trace_label, slicer="person_id", diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index 2f923f459..fd717fa7a 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -149,12 +149,12 @@ def mandatory_tour_frequency(whale: workflow.Whale, persons_merged, chunk_size): ) if trace_hh_id: - tracing.trace_df( + whale.trace_df( mandatory_tours, label="mandatory_tour_frequency.mandatory_tours", warn_if_empty=True, ) - tracing.trace_df( + whale.trace_df( persons, label="mandatory_tour_frequency.persons", warn_if_empty=True ) diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index becee347a..b066272f5 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -120,7 +120,7 @@ def non_mandatory_tour_destination( whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df( + whale.trace_df( tours[tours.tour_category == "non_mandatory"], label="non_mandatory_tour_destination", slicer="person_id", diff --git a/activitysim/abm/models/non_mandatory_scheduling.py b/activitysim/abm/models/non_mandatory_scheduling.py index 84a999225..807304b1a 100644 --- a/activitysim/abm/models/non_mandatory_scheduling.py +++ b/activitysim/abm/models/non_mandatory_scheduling.py @@ -47,7 +47,7 @@ def non_mandatory_tour_scheduling( # updated df for tracing non_mandatory_tours = tours[tours.tour_category 
== "non_mandatory"] - tracing.dump_df( + whale.dump_df( DUMP, tt.tour_map(persons_merged, non_mandatory_tours, tdd_alts), trace_label, @@ -55,7 +55,7 @@ def non_mandatory_tour_scheduling( ) if trace_hh_id: - tracing.trace_df( + whale.trace_df( non_mandatory_tours, label=trace_label, slicer="person_id", diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 8443d959b..c95fce5a0 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -123,12 +123,12 @@ def extend_tour_counts( tour_counts.loc[choices.index, tour_type] += choices if have_trace_targets: - tracing.trace_df( + whale.trace_df( choices, tracing.extend_trace_label(tour_type_trace_label, "choices"), columns=[None, "choice"], ) - tracing.trace_df( + whale.trace_df( rands, tracing.extend_trace_label(tour_type_trace_label, "rands"), columns=[None, "rand"], @@ -404,17 +404,17 @@ def non_mandatory_tour_frequency( ) if whale.settings.trace_hh_id: - tracing.trace_df( + whale.trace_df( non_mandatory_tours, label="non_mandatory_tour_frequency.non_mandatory_tours", warn_if_empty=True, ) - tracing.trace_df( + whale.trace_df( choosers, label="non_mandatory_tour_frequency.choosers", warn_if_empty=True ) - tracing.trace_df( + whale.trace_df( persons, label="non_mandatory_tour_frequency.annotated_persons", warn_if_empty=True, diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 559991332..b9e32b49b 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -348,7 +348,7 @@ def parking_location( whale.add_table("trips", trips_df) if trace_hh_id: - tracing.trace_df( + whale.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/school_escorting.py b/activitysim/abm/models/school_escorting.py index 7254bd2c7..8a6153342 100644 --- a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -474,7 +474,7 @@ def school_escorting( ) if trace_hh_id: - tracing.trace_df(households, label=escorting_choice, warn_if_empty=True) + whale.trace_df(households, label=escorting_choice, warn_if_empty=True) if stage_num >= 1: choosers["Alt"] = choices diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 102f4fb78..975afce7a 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -76,6 +76,7 @@ def stop_frequency( # this should be pre-slice as some expressions may count tours by type annotations = expressions.compute_columns( + whale, df=tours_merged, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -219,17 +220,17 @@ def stop_frequency( assert not trips_differ.any() if trace_hh_id: - tracing.trace_df( + whale.trace_df( tours, label="stop_frequency.tours", slicer="person_id", columns=None ) - tracing.trace_df( + whale.trace_df( trips, label="stop_frequency.trips", slicer="person_id", columns=None ) - tracing.trace_df(annotations, label="stop_frequency.annotations", columns=None) + whale.trace_df(annotations, label="stop_frequency.annotations", columns=None) - tracing.trace_df( + whale.trace_df( tours_merged, label="stop_frequency.tours_merged", slicer="person_id", diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index d63e218ab..f28a9332b 100644 --- 
a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -266,7 +266,7 @@ def summarize( # Annotate trips_merged expressions.annotate_preprocessors( - trips_merged, locals_d, skims, model_settings, "summarize" + whale, trips_merged, locals_d, skims, model_settings, "summarize" ) for table_name, df in locals_d.items(): diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index c66b9ddc7..42e3c998e 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -96,4 +96,4 @@ def telecommute_frequency( ) if whale.settings.trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + whale.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index 7e9885b4a..d5e5f1fd9 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -417,7 +417,7 @@ def tour_mode_choice_simulate( annotate.annotate_tours(model_settings, trace_label) if whale.settings.trace_hh_id: - tracing.trace_df( + whale.trace_df( primary_tours, label=tracing.extend_trace_label(trace_label, mode_column_name), slicer="tour_id", diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index b1722b3e3..5fb312065 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -13,10 +13,10 @@ @workflow.step def tour_od_choice( whale: workflow.Whale, - tours, - persons, - households, - land_use, + tours: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, + land_use: pd.DataFrame, network_los, chunk_size, ): @@ -63,8 +63,6 @@ def tour_od_choice( logsum_column_name = model_settings.get("OD_CHOICE_LOGSUM_COLUMN_NAME", None) want_logsums = logsum_column_name is not None - tours = tours.to_frame() - # interaction_sample_simulate insists choosers appear in same order as alts tours = tours.sort_index() @@ -119,8 +117,6 @@ def tour_od_choice( land_use.to_frame(columns="poe_id").poe_id ) - households = households.to_frame() - persons = persons.to_frame() households[origin_col_name] = tours.set_index("household_id")[ origin_col_name ].reindex(households.index) @@ -143,7 +139,7 @@ def tour_od_choice( whale.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df( + whale.trace_df( tours, label="tours_od_choice", slicer="person_id", diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index e462ba7ec..27e646ff3 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -87,4 +87,4 @@ def transit_pass_ownership( ) if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + whale.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index 43d79116e..3b013be05 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -3,6 +3,7 @@ import logging import numpy as np +import pandas as pd from activitysim.abm.models.util import estimation from activitysim.core import config, expressions, simulate, tracing, workflow @@ -12,7 +13,10 @@ @workflow.step def transit_pass_subsidy( - whale: workflow.Whale, persons_merged, 
persons, chunk_size, trace_hh_id + whale: workflow.Whale, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + trace_hh_id, ): """ Transit pass subsidy model. @@ -21,7 +25,7 @@ def transit_pass_subsidy( trace_label = "transit_pass_subsidy" model_settings_file_name = "transit_pass_subsidy.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = whale.filesystem.read_model_settings(model_settings_file_name) @@ -77,7 +81,6 @@ def transit_pass_subsidy( estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["transit_pass_subsidy"] = choices.reindex(persons.index) whale.add_table("persons", persons) @@ -87,4 +90,4 @@ def transit_pass_subsidy( ) if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + whale.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 225ea8275..dee216529 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -179,10 +179,10 @@ def choose_tour_leg_pattern( have_trace_targets = tracing.has_trace_targets(whale, trip_segment) if have_trace_targets: - tracing.trace_df( + whale.trace_df( trip_segment, tracing.extend_trace_label(trace_label, "choosers") ) - tracing.trace_df( + whale.trace_df( alternatives, tracing.extend_trace_label(trace_label, "alternatives"), transpose=False, @@ -208,7 +208,7 @@ def choose_tour_leg_pattern( interaction_df, trip_segment ) - tracing.trace_df( + whale.trace_df( interaction_df, tracing.extend_trace_label(trace_label, "interaction_df"), transpose=False, @@ -242,7 +242,7 @@ def choose_tour_leg_pattern( tracing.extend_trace_label(trace_label, "eval"), ) - tracing.trace_df( + whale.trace_df( interaction_utilities, tracing.extend_trace_label(trace_label, "interaction_utilities"), transpose=False, @@ -307,7 +307,7 @@ def choose_tour_leg_pattern( chunk.log_df(trace_label, "padded_utilities", None) if have_trace_targets: - tracing.trace_df( + whale.trace_df( utilities_df, tracing.extend_trace_label(trace_label, "utilities"), column_labels=["alternative", "utility"], @@ -325,7 +325,7 @@ def choose_tour_leg_pattern( chunk.log_df(trace_label, "utilities_df", None) if have_trace_targets: - tracing.trace_df( + whale.trace_df( probs, tracing.extend_trace_label(trace_label, "probs"), column_labels=["alternative", "probability"], @@ -357,12 +357,12 @@ def choose_tour_leg_pattern( chunk.log_df(trace_label, "choices", choices) if have_trace_targets: - tracing.trace_df( + whale.trace_df( choices, tracing.extend_trace_label(trace_label, "choices"), columns=[None, PATTERN_ID], ) - tracing.trace_df( + whale.trace_df( rands, tracing.extend_trace_label(trace_label, "rands"), columns=[None, "rand"], @@ -464,15 +464,13 @@ def apply_stage_two_model(whale, omnibus_spec, trips, chunk_size, trace_label): @workflow.step -def trip_departure_choice( - whale: workflow.Whale, trips, trips_merged, skim_dict, chunk_size, trace_hh_id -): +def trip_departure_choice(whale: workflow.Whale, trips, trips_merged, skim_dict): trace_label = "trip_departure_choice" model_settings = whale.filesystem.read_model_settings("trip_departure_choice.yaml") spec = whale.filesystem.read_model_spec(file_name=model_settings["SPECIFICATION"]) - trips_merged_df = trips_merged.to_frame() + trips_merged_df = trips_merged # add tour-based 
chunk_id so we can chunk all trips in tour together tour_ids = trips_merged[TOUR_ID].unique() trips_merged_df["chunk_id"] = reindex( @@ -514,10 +512,10 @@ def trip_departure_choice( ) choices = apply_stage_two_model( - whale, spec, trips_merged_df, chunk_size, trace_label + whale, spec, trips_merged_df, whale.settings.chunk_size, trace_label ) - trips_df = trips.to_frame() + trips_df = trips trip_length = len(trips_df) trips_df = pd.concat([trips_df, choices], axis=1) assert len(trips_df) == trip_length diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 5d9edb739..f5f8d0b80 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -252,7 +252,7 @@ def choose_MAZ_for_TAZ( # write taz choices, pick_counts, probs trace_targets = tracing.trace_targets(whale, taz_sample) - tracing.trace_df( + whale.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -336,7 +336,7 @@ def choose_MAZ_for_TAZ( # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term] maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer="trip_id") trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + whale.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -391,7 +391,7 @@ def choose_MAZ_for_TAZ( if have_trace_targets: taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer="trip_id") trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -417,7 +417,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -433,7 +433,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -447,7 +447,7 @@ def choose_MAZ_for_TAZ( ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -644,7 +644,7 @@ def compute_ood_logsums( settings=whale.settings, ): expressions.annotate_preprocessors( - choosers, locals_dict, od_skims, logsum_settings, trace_label + whale, choosers, locals_dict, od_skims, logsum_settings, trace_label ) logsums = simulate.simple_simulate_logsums( @@ -1372,7 +1372,7 @@ def run_trip_destination( @workflow.step def trip_destination( - whale: workflow.Whale, trips, tours_merged, chunk_size, trace_hh_id + whale: workflow.Whale, trips: pd.DataFrame, tours_merged: pd.DataFrame ): """ Choose a destination for all intermediate trips based on trip purpose. @@ -1388,10 +1388,10 @@ def trip_destination( Parameters ---------- - trips : orca.DataFrameWrapper + trips : DataFrame The trips table. This table is edited in-place to add the trip destinations. - tours_merged : orca.DataFrameWrapper + tours_merged : DataFrame The tours table, with columns merge from persons and households as well. chunk_size : int If non-zero, iterate over trips using this chunk size. 
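The hunks above and below follow the same refactoring pattern applied throughout this series: model steps stop relying on orca-injected table wrappers and separately injected globals (chunk_size, trace_hh_id, module-level tracing.trace_df) and instead take the workflow state object as their first argument, reading settings from it, receiving tables as plain DataFrames, and tracing through it. The following is a minimal sketch of that calling convention only; the step name, table edit, and column are hypothetical and do not correspond to a real model in the codebase.

import logging

import pandas as pd

from activitysim.core import workflow

logger = logging.getLogger(__name__)


@workflow.step
def example_step(whale: workflow.Whale, trips: pd.DataFrame):
    """Hypothetical step illustrating the whale-based calling convention."""
    trace_label = "example_step"

    # settings formerly passed as injected arguments now come from the whale
    logger.info(
        "running %s with chunk_size %s", trace_label, whale.settings.chunk_size
    )

    # tables arrive as plain DataFrames; no orca wrapper and no .to_frame()
    trips["example_flag"] = 1  # hypothetical computed column

    # persist the edited table and trace it through the workflow state
    whale.add_table("trips", trips)
    if whale.settings.trace_hh_id:
        whale.trace_df(trips, label=trace_label, slicer="trip_id")
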
@@ -1409,8 +1409,8 @@ def trip_destination( "fail_some_trips_for_testing", False ) - trips_df = trips.to_frame() - tours_merged_df = tours_merged.to_frame() + trips_df = trips + tours_merged_df = tours_merged if whale.is_table("school_escort_trips"): school_escort_trips = whale.get_dataframe("school_escort_trips") @@ -1439,7 +1439,7 @@ def trip_destination( trips_df, tours_merged_df, estimator=estimator, - chunk_size=chunk_size, + chunk_size=whale.settings.chunk_size, trace_label=trace_label, fail_some_trips_for_testing=fail_some_trips_for_testing, ) @@ -1504,8 +1504,8 @@ def trip_destination( whale.add_table("trips", trips_df) - if trace_hh_id: - tracing.trace_df( + if whale.settings.trace_hh_id: + whale.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/trip_matrices.py b/activitysim/abm/models/trip_matrices.py index 5e02e01fd..783ff08fb 100644 --- a/activitysim/abm/models/trip_matrices.py +++ b/activitysim/abm/models/trip_matrices.py @@ -254,7 +254,7 @@ def annotate_trips( locals_dict.update(constants) expressions.annotate_preprocessors( - trips_df, locals_dict, skims, model_settings, trace_label + whale, trips_df, locals_dict, skims, model_settings, trace_label ) if not np.issubdtype(trips_df["trip_period"].dtype, np.integer): diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index e9cc2ab4e..95e91771b 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -24,9 +24,7 @@ @workflow.step -def trip_mode_choice( - whale: workflow.Whale, trips, network_los, chunk_size, trace_hh_id -): +def trip_mode_choice(whale: workflow.Whale, trips, network_los, trace_hh_id): """ Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip. @@ -43,7 +41,7 @@ def trip_mode_choice( logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "trip_mode" - trips_df = trips.to_frame() + trips_df = trips logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) # give trip mode choice the option to run without calling tours_merged. 
Useful for xborder @@ -203,7 +201,12 @@ def trip_mode_choice( settings=whale.settings, ): expressions.annotate_preprocessors( - trips_segment, locals_dict, skims, model_settings, segment_trace_label + whale, + trips_segment, + locals_dict, + skims, + model_settings, + segment_trace_label, ) if estimator: @@ -214,6 +217,7 @@ def trip_mode_choice( locals_dict["timeframe"] = "trip" choices = mode_choice_simulate( + whale, choosers=trips_segment, spec=simulate.eval_coefficients(whale, model_spec, coefficients, estimator), nest_spec=simulate.eval_nest_coefficients( @@ -221,7 +225,6 @@ def trip_mode_choice( ), skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=segment_trace_label, @@ -231,7 +234,7 @@ def trip_mode_choice( if trace_hh_id: # trace the coefficients - tracing.trace_df( + whale.trace_df( pd.Series(locals_dict), label=tracing.extend_trace_label(segment_trace_label, "constants"), transpose=False, @@ -241,7 +244,7 @@ def trip_mode_choice( # so we can trace with annotations assign_in_place(trips_segment, choices) - tracing.trace_df( + whale.trace_df( trips_segment, label=tracing.extend_trace_label(segment_trace_label, "trip_mode"), slicer="tour_id", @@ -276,7 +279,7 @@ def trip_mode_choice( ) estimator.write_override_choices(choices_df.trip_mode) estimator.end_estimation() - trips_df = trips.to_frame() + trips_df = trips assign_in_place(trips_df, choices_df) if whale.is_table("school_escort_tours") & model_settings.get( @@ -302,7 +305,7 @@ def trip_mode_choice( annotate.annotate_trips(whale, model_settings, trace_label) if trace_hh_id: - tracing.trace_df( + whale.trace_df( trips_df, label=tracing.extend_trace_label(trace_label, "trip_mode"), slicer="trip_id", diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 336f6993c..b2b348807 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -154,10 +154,10 @@ def choose_intermediate_trip_purpose( ) if have_trace_targets: - tracing.trace_df( + whale.trace_df( choices, "%s.choices" % trace_label, columns=[None, "trip_purpose"] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + whale.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) choices = choices.map(pd.Series(purpose_cols)) return choices @@ -324,7 +324,7 @@ def trip_purpose(whale: workflow.Whale, trips, chunk_size, trace_hh_id): whale.add_table("trips", trips_df) if trace_hh_id: - tracing.trace_df( + whale.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index c9ea74060..25b018498 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -235,7 +235,7 @@ def trip_purpose_and_destination( del save_sample_df if trace_hh_id: - tracing.trace_df( + whale.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index f10329692..ce95c00e9 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -157,11 +157,11 @@ def assign_cdap_rank( persons["cdap_rank"] = p["cdap_rank"] # assignment aligns on index values # if DUMP: - # tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label, + # whale.trace_df(persons, '%s.DUMP.cdap_person_array' 
% trace_label, # transpose=False, slicer='NONE') if trace_hh_id: - tracing.trace_df(persons, "%s.cdap_rank" % trace_label) + whale.trace_df(persons, "%s.cdap_rank" % trace_label) return persons["cdap_rank"] @@ -209,7 +209,7 @@ def individual_utilities( indiv_utils[useful_columns] = persons[useful_columns] if trace_hh_id: - tracing.trace_df( + whale.trace_df( indiv_utils, "%s.indiv_utils" % trace_label, column_labels=["activity", "person"], @@ -354,7 +354,7 @@ def build_cdap_spec( # if DUMP: # # dump the interaction_coefficients table because it has been preprocessed - # tracing.trace_df(interaction_coefficients, + # whale.trace_df(interaction_coefficients, # '%s.hhsize%d_interaction_coefficients' % (trace_label, hhsize), # transpose=False, slicer='NONE') @@ -467,7 +467,7 @@ def build_cdap_spec( simulate.uniquify_spec_index(spec) if trace_spec: - tracing.trace_df( + whale.trace_df( spec, "%s.hhsize%d_spec" % (trace_label, hhsize), transpose=False, @@ -480,7 +480,7 @@ def build_cdap_spec( spec[c] = spec[c].map(lambda x: d.get(x, x or 0.0)).fillna(0) if trace_spec: - tracing.trace_df( + whale.trace_df( spec, "%s.hhsize%d_spec_patched" % (trace_label, hhsize), transpose=False, @@ -708,28 +708,28 @@ def household_activity_choices( if trace_hh_id: if hhsize > 1: - tracing.trace_df( + whale.trace_df( choosers, "%s.hhsize%d_choosers" % (trace_label, hhsize), column_labels=["expression", "person"], ) - tracing.trace_df( + whale.trace_df( utils, "%s.hhsize%d_utils" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + whale.trace_df( probs, "%s.hhsize%d_probs" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + whale.trace_df( choices, "%s.hhsize%d_activity_choices" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + whale.trace_df( rands, "%s.hhsize%d_rands" % (trace_label, hhsize), columns=[None, "rand"] ) @@ -778,7 +778,7 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, trace_hh_id, trace_l cdap_indiv_activity_choices = indiv_activity["cdap_activity"] # if DUMP: - # tracing.trace_df(cdap_indiv_activity_choices, + # whale.trace_df(cdap_indiv_activity_choices, # '%s.DUMP.cdap_indiv_activity_choices' % trace_label, # transpose=False, slicer='NONE') @@ -850,31 +850,31 @@ def extra_hh_member_choices( choices = pd.Series(probs.columns[idx_choices].values, index=probs.index) # if DUMP: - # tracing.trace_df(proportions, '%s.DUMP.extra_proportions' % trace_label, + # whale.trace_df(proportions, '%s.DUMP.extra_proportions' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(probs, '%s.DUMP.extra_probs' % trace_label, + # whale.trace_df(probs, '%s.DUMP.extra_probs' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(choices, '%s.DUMP.extra_choices' % trace_label, + # whale.trace_df(choices, '%s.DUMP.extra_choices' % trace_label, # transpose=False, # slicer='NONE') if trace_hh_id: - tracing.trace_df( + whale.trace_df( proportions, "%s.extra_hh_member_choices_proportions" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + whale.trace_df( probs, "%s.extra_hh_member_choices_probs" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + whale.trace_df( choices, "%s.extra_hh_member_choices_choices" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + whale.trace_df( rands, "%s.extra_hh_member_choices_rands" % trace_label, columns=[None, "rand"], @@ -977,9 +977,9 @@ def 
_run_cdap( chunk_sizer.log_df(trace_label, "persons", persons) # if DUMP: - # tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, + # whale.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, + # whale.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, # transpose=False, slicer='NONE') result = persons[["cdap_rank", "cdap_activity"]] @@ -1072,7 +1072,7 @@ def run_cdap( cdap_results = pd.concat(result_list) if trace_hh_id: - tracing.trace_df( + whale.trace_df( cdap_results, label="cdap", columns=["cdap_rank", "cdap_activity"], diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index 22076819c..9711261f6 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -1,9 +1,11 @@ # ActivitySim # See full license in LICENSE.txt. import logging +from typing import Optional import pandas as pd +from activitysim.abm.models.util.estimation import Estimator from activitysim.core import config, expressions, simulate, tracing, workflow """ @@ -16,18 +18,18 @@ def mode_choice_simulate( - choosers, - spec, + whale: workflow.Whale, + choosers: pd.DataFrame, + spec: pd.DataFrame, nest_spec, skims, locals_d, - chunk_size, mode_column_name, logsum_column_name, - trace_label, + trace_label: str, trace_choice_name, trace_column_names=None, - estimator=None, + estimator: Optional[Estimator] = None, ): """ common method for both tour_mode_choice and trip_mode_choice @@ -130,7 +132,7 @@ def run_tour_mode_choice_simulate( choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time]) expressions.annotate_preprocessors( - choosers, locals_dict, skims, model_settings, trace_label + whale, choosers, locals_dict, skims, model_settings, trace_label ) trace_column_names = choosers.index.name @@ -143,12 +145,12 @@ def run_tour_mode_choice_simulate( estimator.write_choosers(choosers) choices = mode_choice_simulate( + whale, choosers=choosers, spec=spec, nest_spec=nest_spec, skims=skims, locals_d=locals_dict, - chunk_size=whale.settings.chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=trace_label, diff --git a/activitysim/abm/models/util/probabilistic_scheduling.py b/activitysim/abm/models/util/probabilistic_scheduling.py index 782884cc9..a8822d9ab 100644 --- a/activitysim/abm/models/util/probabilistic_scheduling.py +++ b/activitysim/abm/models/util/probabilistic_scheduling.py @@ -267,7 +267,7 @@ def make_scheduling_choices( chunk.log_df(trace_label, "choosers", choosers) if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): - tracing.trace_df(choosers, "%s.choosers" % trace_label) + whale.trace_df(choosers, "%s.choosers" % trace_label) # different pre-processing is required based on the scheduling mode chooser_probs = _preprocess_scheduling_probs( @@ -284,7 +284,7 @@ def make_scheduling_choices( chunk.log_df(trace_label, "chooser_probs", chooser_probs) if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): - tracing.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) + whale.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) raw_choices, rands = logit.make_choices( whale, chooser_probs, trace_label=trace_label, trace_choosers=choosers @@ -294,12 +294,12 @@ def make_scheduling_choices( chunk.log_df(trace_label, "rands", rands) if trace_hh_id and tracing.has_trace_targets(whale, 
choosers_df): - tracing.trace_df( + whale.trace_df( raw_choices, "%s.choices" % trace_label, columns=[None, trace_choice_col_name], ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + whale.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # different post-processing is required based on the scheduling mode choices, failed = _postprocess_scheduling_choices( @@ -324,10 +324,10 @@ def make_scheduling_choices( # trace before removing failures if trace_hh_id and tracing.has_trace_targets(whale, choosers_df): - tracing.trace_df( + whale.trace_df( choices, "%s.choices" % trace_label, columns=[None, trace_choice_col_name] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + whale.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # remove any failed choices if failed.any(): diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index dedffbc14..36239c653 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -257,7 +257,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ # write taz choices, pick_counts, probs trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df( + whale.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -327,7 +327,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + whale.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -380,7 +380,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ taz_choices, slicer=CHOOSER_ID ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -406,7 +406,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -422,7 +422,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -436,7 +436,7 @@ def choose_MAZ_for_TAZ(whale: workflow.Whale, taz_sample, MAZ_size_terms, trace_ ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -643,8 +643,8 @@ def run_destination_logsums( logger.info("Running %s with %s rows", trace_label, len(choosers)) - tracing.dump_df(DUMP, persons_merged, trace_label, "persons_merged") - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + whale.dump_df(DUMP, persons_merged, trace_label, "persons_merged") + whale.dump_df(DUMP, choosers, trace_label, "choosers") logsums = logsum.compute_logsums( whale, @@ -727,7 +727,7 @@ def run_destination_simulate( 
destination_size_terms.size_term, destination_sample[alt_dest_col_name] ) - tracing.dump_df(DUMP, destination_sample, trace_label, "alternatives") + whale.dump_df(DUMP, destination_sample, trace_label, "alternatives") constants = config.get_model_constants(model_settings) @@ -748,7 +748,7 @@ def run_destination_simulate( if constants is not None: locals_d.update(constants) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + whale.dump_df(DUMP, choosers, trace_label, "choosers") log_alt_losers = whale.settings.log_alt_losers diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 5a17227f5..6a44a48e3 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -356,7 +356,7 @@ def choose_MAZ_for_TAZ( # write taz choices, pick_counts, probs trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df( + whale.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -436,7 +436,7 @@ def choose_MAZ_for_TAZ( maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + whale.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -491,7 +491,7 @@ def choose_MAZ_for_TAZ( taz_choices, slicer=CHOOSER_ID ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -517,7 +517,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -533,7 +533,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -547,7 +547,7 @@ def choose_MAZ_for_TAZ( ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + whale.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -798,7 +798,7 @@ def run_od_logsums( logger.info("Running %s with %s rows", trace_label, len(choosers)) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + whale.dump_df(DUMP, choosers, trace_label, "choosers") # run trip mode choice to compute tour mode choice logsums if logsum_settings.get("COMPUTE_TRIP_MODE_CHOICE_LOGSUMS", False): @@ -996,7 +996,7 @@ def run_od_simulate( od_sample, lu, left_on=origin_col_name, right_index=True, how="left" ) - tracing.dump_df(DUMP, od_sample, trace_label, "alternatives") + whale.dump_df(DUMP, od_sample, trace_label, "alternatives") constants = config.get_model_constants(model_settings) @@ -1017,7 +1017,7 @@ def run_od_simulate( if constants is not None: locals_d.update(constants) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + whale.dump_df(DUMP, choosers, trace_label, "choosers") choices = interaction_sample_simulate( whale, choosers, diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 379f04a7e..964699b09 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ 
b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -412,7 +412,7 @@ def compute_tour_scheduling_logsums( skims, trace_label, ) - tracing.trace_df( + whale.trace_df( trace_logsums_df, label=tracing.extend_trace_label(trace_label, "representative_logsums"), slicer="NONE", diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index ee81bbaec..de456b274 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -243,4 +243,4 @@ def vehicle_allocation( annotate_vehicle_allocation(whale, model_settings, trace_label) if trace_hh_id: - tracing.trace_df(tours, label="vehicle_allocation", warn_if_empty=True) + whale.trace_df(tours, label="vehicle_allocation", warn_if_empty=True) diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index 8ae33a14b..112b24f82 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -588,4 +588,4 @@ def vehicle_type_choice( ) if trace_hh_id: - tracing.trace_df(vehicles, label="vehicle_type_choice", warn_if_empty=True) + whale.trace_df(vehicles, label="vehicle_type_choice", warn_if_empty=True) diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index 83d31c98d..c1cb879ec 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -174,4 +174,4 @@ def work_from_home(whale: workflow.Whale, persons_merged, persons): tracing.print_summary("work_from_home", persons.work_from_home, value_counts=True) if whale.settings.trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + whale.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index dd8ac88ef..6d14dc92b 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -112,7 +112,7 @@ def households(whale: workflow.Whale): tracing.register_traceable_table(whale, "households", df) if _trace_hh_id: - tracing.trace_df(df, "raw.households", warn_if_empty=True) + whale.trace_df(df, "raw.households", warn_if_empty=True) return df diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index 6ac3e6bb2..5fe4b500f 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -39,7 +39,7 @@ def persons(whale: workflow.Whale): tracing.register_traceable_table(whale, "persons", df) if trace_hh_id: - tracing.trace_df(df, "raw.persons", warn_if_empty=True) + whale.trace_df(df, "raw.persons", warn_if_empty=True) logger.debug(f"{len(df.household_id.unique())} unique household_ids in persons") logger.debug(f"{len(households.index.unique())} unique household_ids in households") @@ -100,12 +100,6 @@ def persons_merged( accessibility: pd.DataFrame, disaggregate_accessibility: pd.DataFrame = None, ): - # land_use = whale.get_dataframe("land_use") - # households = whale.get_dataframe("households") - # accessibility = whale.get_dataframe("accessibility") - # persons = whale.get_dataframe("persons") - # disaggregate_accessibility = whale.get_dataframe("disaggregate_accessibility") - def join(left, right, left_on): intersection = set(left.columns).intersection(right.columns) intersection.discard(left_on) # intersection is ok if it's the join key diff --git a/activitysim/abm/tables/shadow_pricing.py 
b/activitysim/abm/tables/shadow_pricing.py index 862eaa785..070ed2a7b 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -508,7 +508,7 @@ def set_choices(self, choices, segment_ids): self.choices_synced = self.synchronize_choices(choice_merged) - def check_fit(self, iteration): + def check_fit(self, whale: workflow.Whale, iteration): """ Check convergence criteria fit of modeled_size to target desired_size (For multiprocessing, this is global modeled_size summed across processes, diff --git a/activitysim/abm/tables/tours.py b/activitysim/abm/tables/tours.py index a3bd8a811..a81a3f8aa 100644 --- a/activitysim/abm/tables/tours.py +++ b/activitysim/abm/tables/tours.py @@ -2,14 +2,34 @@ # See full license in LICENSE.txt. import logging -from activitysim.core import inject +import pandas as pd + +from activitysim.core import inject, workflow logger = logging.getLogger(__name__) -@inject.table() -def tours_merged(tours, persons_merged): - return inject.merge_tables(tours.name, tables=[tours, persons_merged]) +@workflow.temp_table +def tours_merged( + whale: workflow.Whale, tours: pd.DataFrame, persons_merged: pd.DataFrame +): + # return inject.merge_tables(tours.name, tables=[tours, persons_merged]) + def join(left, right, left_on): + intersection = set(left.columns).intersection(right.columns) + intersection.discard(left_on) # intersection is ok if it's the join key + right = right.drop(intersection, axis=1) + return pd.merge( + left, + right, + left_on=left_on, + right_index=True, + ) + + return join( + tours, + persons_merged, + left_on="person_id", + ) -inject.broadcast("persons_merged", "tours", cast_index=True, onto_on="person_id") +# inject.broadcast("persons_merged", "tours", cast_index=True, onto_on="person_id") diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index 89d4f549c..b8207ab8a 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -422,7 +422,7 @@ def run(args): logger.exception("activitysim run encountered an unrecoverable error") raise - chunk.consolidate_logs() + chunk.consolidate_logs(whale) mem.consolidate_logs() from ..core.flow import TimeLogger diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 314cb9580..c4f310064 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -128,7 +128,7 @@ ledger_lock = threading.Lock() -def chunk_method(): +def chunk_method(whale: workflow.Whale): method = SETTINGS.get("chunk_method") if method is None: method = SETTINGS.setdefault("chunk_method", whale.settings.chunk_method) @@ -138,13 +138,13 @@ def chunk_method(): return method -def chunk_metric(): +def chunk_metric(whale: workflow.Whale): return SETTINGS.setdefault( - "chunk_metric", USS if chunk_method() in USS_CHUNK_METHODS else "rss" + "chunk_metric", USS if chunk_method(whale) in USS_CHUNK_METHODS else "rss" ) -def chunk_training_mode(): +def chunk_training_mode(whale: workflow.Whale): training_mode = SETTINGS.setdefault( "chunk_training_mode", whale.settings.chunk_training_mode ) @@ -169,13 +169,13 @@ def default_initial_rows_per_chunk(): ) -def min_available_chunk_ratio(): +def min_available_chunk_ratio(whale: workflow.Whale): return SETTINGS.setdefault( "min_available_chunk_ratio", whale.settings.min_available_chunk_ratio ) -def keep_chunk_logs(): +def keep_chunk_logs(whale: workflow.Whale): # if we are overwriting MEM_LOG_FILE then presumably we want to delete any subprocess files default = LOG_FILE_NAME == OMNIBUS_LOG_FILE_NAME @@ -194,7 +194,7 @@ def 
get_base_chunk_size(): return CHUNK_SIZERS[0].chunk_size -def overhead_for_chunk_method(overhead, method=None): +def overhead_for_chunk_method(whale: workflow.Whale, overhead, method=None): """ return appropriate overhead for row_size calculation based on current chunk_method @@ -223,7 +223,7 @@ def hybrid(xss, bytes): return hybrid_overhead - method = method or chunk_method() + method = method or chunk_method(whale) if method == HYBRID_RSS: oh = hybrid(overhead[RSS], overhead[BYTES]) @@ -236,14 +236,14 @@ def hybrid(xss, bytes): return oh -def consolidate_logs(): +def consolidate_logs(whale: workflow.Whale): glob_file_name = config.log_file_path(f"*{LOG_FILE_NAME}", prefix=False) glob_files = glob.glob(glob_file_name) if not glob_files: return - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS), ( + assert chunk_training_mode(whale,) not in (MODE_PRODUCTION, MODE_CHUNKLESS), ( f"shouldn't be any chunk log files when chunk_training_mode" f" is {MODE_PRODUCTION} or {MODE_CHUNKLESS}" ) @@ -269,7 +269,7 @@ def consolidate_logs(): len(multi_depth_chunk_tag) == 0 ), f"consolidate_logs multi_depth_chunk_tags \n{multi_depth_chunk_tag.values}" - if not keep_chunk_logs(): + if not keep_chunk_logs(whale): util.delete_files(glob_files, "chunk.consolidate_logs") log_output_path = config.log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) @@ -305,7 +305,7 @@ def consolidate_logs(): num_rows = omnibus_df[C_NUM_ROWS] for m in USS_CHUNK_METHODS: omnibus_df[f"{m}_row_size"] = np.ceil( - overhead_for_chunk_method(omnibus_df, m) / num_rows + overhead_for_chunk_method(whale, omnibus_df, m) / num_rows ).astype(int) omnibus_df = omnibus_df.sort_values(by=C_CHUNK_TAG) @@ -316,7 +316,12 @@ def consolidate_logs(): ) omnibus_df.to_csv(log_dir_output_path, mode="w", index=False) - if (chunk_training_mode() == MODE_RETRAIN) or not _HISTORIAN.have_cached_history: + if ( + chunk_training_mode( + whale, + ) + == MODE_RETRAIN + ) or not _HISTORIAN.have_cached_history: if whale.settings.resume_after: # FIXME logger.warning( @@ -332,7 +337,7 @@ def consolidate_logs(): omnibus_df.to_csv(cache_dir_output_path, mode="w", index=False) -class ChunkHistorian(object): +class ChunkHistorian: """ Utility for estimating row_size """ @@ -342,8 +347,13 @@ def __init__(self): self.have_cached_history = None self.cached_history_df = None - def load_cached_history(self): - if chunk_training_mode() == MODE_RETRAIN: + def load_cached_history(self, whale: workflow.Whale): + if ( + chunk_training_mode( + whale, + ) + == MODE_RETRAIN + ): # don't need cached history if retraining return @@ -374,10 +384,20 @@ def load_cached_history(self): else: self.have_cached_history = False - if chunk_training_mode() == MODE_CHUNKLESS: + if ( + chunk_training_mode( + whale, + ) + == MODE_CHUNKLESS + ): return - if chunk_training_mode() == MODE_PRODUCTION: + if ( + chunk_training_mode( + whale, + ) + == MODE_PRODUCTION + ): # raise RuntimeError(f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}") SETTINGS["chunk_training_mode"] = MODE_RETRAIN @@ -385,12 +405,12 @@ def load_cached_history(self): f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}" ) logger.warning( - f"chunk_training_mode falling back to {chunk_training_mode()}" + f"chunk_training_mode falling back to {chunk_training_mode(whale,)}" ) - def cached_history_for_chunk_tag(self, chunk_tag): + def cached_history_for_chunk_tag(self, whale: workflow.Whale, chunk_tag): history = {} - self.load_cached_history() + 
self.load_cached_history(whale) if self.have_cached_history: try: @@ -416,21 +436,25 @@ def cached_history_for_chunk_tag(self, chunk_tag): return history - def cached_row_size(self, chunk_tag): + def cached_row_size(self, whale: workflow.Whale, chunk_tag): row_size = 0 - cached_history = self.cached_history_for_chunk_tag(chunk_tag) + cached_history = self.cached_history_for_chunk_tag(whale, chunk_tag) if cached_history: cum_overhead = {m: cached_history[m] for m in METRICS} num_rows = cached_history[C_NUM_ROWS] # initial_row_size based on cum_overhead and rows_processed from chunk_cache - row_size = math.ceil(overhead_for_chunk_method(cum_overhead) / num_rows) + row_size = math.ceil( + overhead_for_chunk_method(whale, cum_overhead) / num_rows + ) return row_size - def write_history(self, history, chunk_tag): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + def write_history(self, whale: workflow.Whale, history, chunk_tag): + assert chunk_training_mode( + whale, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) history_df = pd.DataFrame.from_dict(history) @@ -473,8 +497,12 @@ def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom self.hwm_uss = {"value": baseline_uss, "info": f"{trace_label}.init"} self.total_bytes = 0 - def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + def audit( + self, whale: workflow.Whale, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False + ): + assert chunk_training_mode( + whale, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) MAX_OVERDRAFT = 0.2 @@ -490,7 +518,7 @@ def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): f"bytes: {bytes} headroom: {self.headroom} chunk_size: {self.base_chunk_size} {msg}" ) - if chunk_metric() == RSS and rss > mem_panic_threshold: + if chunk_metric(whale) == RSS and rss > mem_panic_threshold: rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) if rss > mem_panic_threshold: logger.warning( @@ -498,7 +526,7 @@ def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): f"rss: {rss} chunk_size: {self.base_chunk_size} {msg}" ) - if chunk_metric() == USS and uss > mem_panic_threshold: + if chunk_metric(whale) == USS and uss > mem_panic_threshold: _, uss = mem.get_rss(force_garbage_collect=True, uss=True) if uss > mem_panic_threshold: logger.warning( @@ -550,7 +578,9 @@ def size_it(df): assert False return elements, bytes - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + assert chunk_training_mode( + whale, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) if df is None: elements, bytes = (0, 0) @@ -581,7 +611,9 @@ def size_it(df): self.total_bytes = sum(self.tables.values()) def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + assert chunk_training_mode( + whale, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) from_rss_monitor = total_bytes is None @@ -599,19 +631,23 @@ def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): # total_bytes high water mark self.hwm_bytes["value"] = total_bytes self.hwm_bytes["info"] = info - self.audit(hwm_trace_label, bytes=total_bytes) + self.audit(whale, hwm_trace_label, bytes=total_bytes) if rss > self.hwm_rss["value"]: # rss high water mark self.hwm_rss["value"] = rss self.hwm_rss["info"] = info - self.audit(hwm_trace_label, rss=rss, from_rss_monitor=from_rss_monitor) + self.audit( + whale, hwm_trace_label, rss=rss, 
from_rss_monitor=from_rss_monitor + ) if uss > self.hwm_uss["value"]: # uss high water mark self.hwm_uss["value"] = uss self.hwm_uss["info"] = info - self.audit(hwm_trace_label, uss=uss, from_rss_monitor=from_rss_monitor) + self.audit( + whale, hwm_trace_label, uss=uss, from_rss_monitor=from_rss_monitor + ) # silently registers global high water mark mem.check_global_hwm(RSS, rss, hwm_trace_label) @@ -634,7 +670,12 @@ def get_hwm_bytes(self): def log_rss(trace_label, force=False): - if chunk_training_mode() == MODE_CHUNKLESS: + if ( + chunk_training_mode( + whale, + ) + == MODE_CHUNKLESS + ): # no memory tracing at all in chunkless mode return @@ -642,7 +683,12 @@ def log_rss(trace_label, force=False): hwm_trace_label = f"{trace_label}.log_rss" - if chunk_training_mode() == MODE_PRODUCTION: + if ( + chunk_training_mode( + whale, + ) + == MODE_PRODUCTION + ): # FIXME - this trace_memory_info call slows things down a lot so it is turned off for now # trace_ticks = 0 if force else mem.MEM_TRACE_TICK_LEN # mem.trace_memory_info(hwm_trace_label, trace_ticks=trace_ticks) @@ -657,7 +703,9 @@ def log_rss(trace_label, force=False): def log_df(trace_label, table_name, df): - if chunk_training_mode() in (MODE_PRODUCTION, MODE_CHUNKLESS): + if chunk_training_mode( + whale, + ) in (MODE_PRODUCTION, MODE_CHUNKLESS): return assert len(CHUNK_LEDGERS) > 0, f"log_df called without current chunker." @@ -707,7 +755,7 @@ def __init__( self.chunk_training_mode = chunk_training_mode if self.chunk_training_mode != MODE_CHUNKLESS: - if chunk_metric() == USS: + if chunk_metric(whale) == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) else: self.rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) @@ -738,7 +786,7 @@ def __init__( self.num_choosers = num_choosers self.rows_processed = 0 - min_chunk_ratio = min_available_chunk_ratio() + min_chunk_ratio = min_available_chunk_ratio(whale) assert ( 0 <= min_chunk_ratio <= 1 ), f"min_chunk_ratio setting {min_chunk_ratio} is not in range [0..1]" @@ -754,7 +802,9 @@ def __init__( # if production mode, to reduce volatility, initialize cum_overhead and cum_rows from cache if self.chunk_training_mode in [MODE_ADAPTIVE, MODE_PRODUCTION]: - cached_history = _HISTORIAN.cached_history_for_chunk_tag(self.chunk_tag) + cached_history = _HISTORIAN.cached_history_for_chunk_tag( + whale, self.chunk_tag + ) if cached_history: self.cum_overhead = {m: cached_history[m] for m in METRICS} self.cum_rows = cached_history[C_NUM_ROWS] @@ -772,7 +822,7 @@ def __init__( # need base_chunk_size to calc headroom self.headroom = self.available_headroom( - self.uss if chunk_metric() == USS else self.rss + self.uss if chunk_metric(whale) == USS else self.rss ) def close(self): @@ -782,7 +832,7 @@ def close(self): if ((self.depth == 1) or WRITE_SUBCHUNK_HISTORY) and ( self.chunk_training_mode not in (MODE_PRODUCTION, MODE_CHUNKLESS) ): - _HISTORIAN.write_history(self.history, self.chunk_tag) + _HISTORIAN.write_history(whale, self.history, self.chunk_tag) _chunk_sizer = CHUNK_SIZERS.pop() assert _chunk_sizer == self @@ -807,7 +857,7 @@ def available_headroom(self, xss): def initial_rows_per_chunk(self): # whatever the TRAINING_MODE, use cache to determine initial_row_size # (presumably preferable to default_initial_rows_per_chunk) - self.initial_row_size = _HISTORIAN.cached_row_size(self.chunk_tag) + self.initial_row_size = _HISTORIAN.cached_row_size(whale, self.chunk_tag) if self.chunk_size == 0: rows_per_chunk = self.num_choosers @@ -871,14 +921,14 @@ def 
adaptive_rows_per_chunk(self, i):
         prev_uss = self.uss

         if self.chunk_training_mode != MODE_PRODUCTION:
-            if chunk_metric() == USS:
+            if chunk_metric(whale) == USS:
                 self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True)
             else:
                 self.rss, _ = mem.get_rss(force_garbage_collect=True, uss=False)
                 self.uss = 0

         self.headroom = self.available_headroom(
-            self.uss if chunk_metric() == USS else self.rss
+            self.uss if chunk_metric(whale) == USS else self.rss
         )

         rows_remaining = self.num_choosers - prev_rows_processed
@@ -900,7 +950,7 @@ def adaptive_rows_per_chunk(self, i):
             self.cum_overhead[m] += overhead[m]

         observed_row_size = prev_cum_rows and math.ceil(
-            overhead_for_chunk_method(self.cum_overhead) / prev_cum_rows
+            overhead_for_chunk_method(whale, self.cum_overhead) / prev_cum_rows
         )

         # rows_per_chunk is closest number of chooser rows to achieve chunk_size without exceeding it
@@ -931,7 +981,7 @@ def adaptive_rows_per_chunk(self, i):

         # diagnostics not reported by ChunkHistorian
-        if chunk_metric() == USS:
+        if chunk_metric(whale) == USS:
             self.history.setdefault("prev_uss", []).append(prev_uss)
             self.history.setdefault("cur_uss", []).append(self.uss)
         else:
@@ -1088,7 +1138,9 @@ def chunk_log(trace_label, chunk_tag=None, base=False, settings=None):

     # avoids breaking the assertion below.
     if settings is None:
-        _chunk_training_mode = chunk_training_mode()
+        _chunk_training_mode = chunk_training_mode(
+            whale,
+        )
     else:
         _chunk_training_mode = settings.chunk_training_mode

@@ -1179,7 +1231,12 @@ def adaptive_chunked_choosers(

         offset += rows_per_chunk

-        if chunk_training_mode() != MODE_CHUNKLESS:
+        if (
+            chunk_training_mode(
+                whale,
+            )
+            != MODE_CHUNKLESS
+        ):
             (
                 rows_per_chunk,
                 estimated_number_of_chunks,
@@ -1316,7 +1373,12 @@ def adaptive_chunked_choosers_and_alts(
         offset += rows_per_chunk
         alt_offset = alt_end

-        if chunk_training_mode() != MODE_CHUNKLESS:
+        if (
+            chunk_training_mode(
+                whale,
+            )
+            != MODE_CHUNKLESS
+        ):
             (
                 rows_per_chunk,
                 estimated_number_of_chunks,
@@ -1376,7 +1438,12 @@ def adaptive_chunked_choosers_by_chunk_id(

         offset += rows_per_chunk

-        if chunk_training_mode() != MODE_CHUNKLESS:
+        if (
+            chunk_training_mode(
+                whale,
+            )
+            != MODE_CHUNKLESS
+        ):
             (
                 rows_per_chunk,
                 estimated_number_of_chunks,
diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py
index 3ab0eb5c5..0ec3c9049 100644
--- a/activitysim/core/expressions.py
+++ b/activitysim/core/expressions.py
@@ -119,7 +119,7 @@ def compute_columns(whale, df, model_settings, locals_dict={}, trace_label=None)
     )

     if trace_results is not None:
-        tracing.trace_df(trace_results, label=trace_label, slicer="NONE")
+        whale.trace_df(trace_results, label=trace_label, slicer="NONE")

         if trace_assigned_locals:
             tracing.write_csv(
@@ -154,7 +154,9 @@ def assign_columns(

 # ##################################################################################################


-def annotate_preprocessors(df, locals_dict, skims, model_settings, trace_label):
+def annotate_preprocessors(
+    whale: workflow.Whale, df, locals_dict, skims, model_settings, trace_label
+):
     locals_d = {}
     locals_d.update(locals_dict)
@@ -170,6 +172,7 @@ def annotate_preprocessors(df, locals_dict, skims, model_settings, trace_label):

     for model_settings in preprocessor_settings:
         results = compute_columns(
+            whale,
             df=df,
             model_settings=model_settings,
             locals_dict=locals_d,
diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py
index eec7e5c49..2f1501b47 100644
--- a/activitysim/core/interaction_sample.py
+++ b/activitysim/core/interaction_sample.py
@@ -186,8 +186,8 @@ def _interaction_sample(
     assert num_choosers > 0

     if have_trace_targets:
-        tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
-        tracing.trace_df(
+        whale.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
+        whale.trace_df(
             alternatives,
             tracing.extend_trace_label(trace_label, "alternatives"),
             slicer="NONE",
@@ -262,7 +262,7 @@ def _interaction_sample(
             interaction_df, choosers, alternative_count
         )

-        tracing.trace_df(
+        whale.trace_df(
             interaction_df[trace_rows],
             tracing.extend_trace_label(trace_label, "interaction_df"),
             slicer="NONE",
@@ -352,7 +352,7 @@ def _interaction_sample(

     if have_trace_targets and trace_rows is not None:
         try:
-            tracing.trace_df(
+            whale.trace_df(
                 interaction_utilities[trace_rows],
                 tracing.extend_trace_label(trace_label, "interaction_utilities"),
                 slicer="NONE",
@@ -361,7 +361,7 @@ def _interaction_sample(
         except ValueError:
             pass

-    tracing.dump_df(DUMP, interaction_utilities, trace_label, "interaction_utilities")
+    whale.dump_df(DUMP, interaction_utilities, trace_label, "interaction_utilities")

     # reshape utilities (one utility column and one row per row in interaction_utilities)
     # to a dataframe with one row per chooser and one column per alternative
@@ -375,13 +375,13 @@ def _interaction_sample(
     chunk_sizer.log_df(trace_label, "interaction_utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             utilities,
             tracing.extend_trace_label(trace_label, "utils"),
             column_labels=["alternative", "utility"],
         )

-    tracing.dump_df(DUMP, utilities, trace_label, "utilities")
+    whale.dump_df(DUMP, utilities, trace_label, "utilities")

     # convert to probabilities (utilities exponentiated and normalized to probs)
     # probs is same shape as utilities, one row per chooser and one column for alternative
@@ -397,7 +397,7 @@ def _interaction_sample(
     chunk_sizer.log_df(trace_label, "utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             probs,
             tracing.extend_trace_label(trace_label, "probs"),
             column_labels=["alternative", "probability"],
@@ -463,10 +463,10 @@ def _interaction_sample(
     # set index after groupby so we can trace on it
     choices_df.set_index(choosers.index.name, inplace=True)

-    tracing.dump_df(DUMP, choices_df, trace_label, "choices_df")
+    whale.dump_df(DUMP, choices_df, trace_label, "choices_df")

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             choices_df,
             tracing.extend_trace_label(trace_label, "sampled_alternatives"),
             transpose=False,
diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py
index c6045872c..ea985fe04 100644
--- a/activitysim/core/interaction_sample_simulate.py
+++ b/activitysim/core/interaction_sample_simulate.py
@@ -101,8 +101,8 @@ def _interaction_sample_simulate(
     have_trace_targets = tracing.has_trace_targets(whale, choosers)

     if have_trace_targets:
-        tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
-        tracing.trace_df(
+        whale.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
+        whale.trace_df(
             alternatives,
             tracing.extend_trace_label(trace_label, "alternatives"),
             transpose=False,
@@ -142,7 +142,7 @@ def _interaction_sample_simulate(
     if have_trace_targets:
         trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, choosers)

-        tracing.trace_df(
+        whale.trace_df(
             interaction_df,
             tracing.extend_trace_label(trace_label, "interaction_df"),
             transpose=False,
@@ -183,7 +183,7 @@ def _interaction_sample_simulate(
             tracing.extend_trace_label(trace_label, "eval"),
         )

-        tracing.trace_df(
+        whale.trace_df(
             interaction_utilities,
             tracing.extend_trace_label(trace_label, "interaction_utilities"),
             transpose=False,
@@ -234,7 +234,7 @@ def _interaction_sample_simulate(
     chunk_sizer.log_df(trace_label, "padded_utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             utilities_df,
             tracing.extend_trace_label(trace_label, "utilities"),
             column_labels=["alternative", "utility"],
@@ -260,7 +260,7 @@ def _interaction_sample_simulate(
     chunk_sizer.log_df(trace_label, "utilities_df", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             probs,
             tracing.extend_trace_label(trace_label, "probs"),
             column_labels=["alternative", "probability"],
@@ -309,18 +309,18 @@ def _interaction_sample_simulate(
         choices.loc[zero_probs] = zero_prob_choice_val

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             choices,
             tracing.extend_trace_label(trace_label, "choices"),
             columns=[None, trace_choice_name],
         )
-        tracing.trace_df(
+        whale.trace_df(
             rands,
             tracing.extend_trace_label(trace_label, "rands"),
             columns=[None, "rand"],
         )
         if want_logsums:
-            tracing.trace_df(
+            whale.trace_df(
                 logsums,
                 tracing.extend_trace_label(trace_label, "logsum"),
                 columns=[None, "logsum"],
diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py
index 3a1450813..3e35a2a64 100644
--- a/activitysim/core/interaction_simulate.py
+++ b/activitysim/core/interaction_simulate.py
@@ -5,11 +5,12 @@
 from builtins import zip
 from collections import OrderedDict
 from datetime import timedelta
+from typing import Mapping

 import numpy as np
 import pandas as pd

-from . import chunk, config, logit, simulate, tracing
+from . import chunk, config, logit, simulate, tracing, workflow

 logger = logging.getLogger(__name__)

@@ -582,11 +583,12 @@ def to_series(x):


 def _interaction_simulate(
-    choosers,
-    alternatives,
-    spec,
+    whale: workflow.Whale,
+    choosers: pd.DataFrame,
+    alternatives: pd.DataFrame,
+    spec: pd.DataFrame,
     skims=None,
-    locals_d=None,
+    locals_d: Mapping = None,
     sample_size=None,
     trace_label=None,
     trace_choice_name=None,
@@ -646,8 +648,8 @@ def _interaction_simulate(
     have_trace_targets = tracing.has_trace_targets(whale, choosers)

     if have_trace_targets:
-        tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
-        tracing.trace_df(
+        whale.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers"))
+        whale.trace_df(
             alternatives,
             tracing.extend_trace_label(trace_label, "alternatives"),
             slicer="NONE",
@@ -752,7 +754,7 @@ def _interaction_simulate(
             interaction_df, choosers, sample_size
         )

-        tracing.trace_df(
+        whale.trace_df(
             interaction_df[trace_rows],
             tracing.extend_trace_label(trace_label, "interaction_df"),
             slicer="NONE",
@@ -787,7 +789,7 @@ def _interaction_simulate(
             tracing.extend_trace_label(trace_label, "eval"),
         )

-        tracing.trace_df(
+        whale.trace_df(
             interaction_utilities[trace_rows],
             tracing.extend_trace_label(trace_label, "interaction_utils"),
             slicer="NONE",
@@ -803,13 +805,13 @@ def _interaction_simulate(
     chunk_sizer.log_df(trace_label, "utilities", utilities)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             utilities,
             tracing.extend_trace_label(trace_label, "utils"),
             column_labels=["alternative", "utility"],
         )

-    tracing.dump_df(DUMP, utilities, trace_label, "utilities")
+    whale.dump_df(DUMP, utilities, trace_label, "utilities")

     # convert to probabilities (utilities exponentiated and normalized to probs)
     # probs is same shape as utilities, one row per chooser and one column for alternative
@@ -822,7 +824,7 @@ def _interaction_simulate(
     chunk_sizer.log_df(trace_label, "utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             probs,
             tracing.extend_trace_label(trace_label, "probs"),
             column_labels=["alternative", "probability"],
@@ -850,12 +852,12 @@ def _interaction_simulate(
     chunk_sizer.log_df(trace_label, "choices", choices)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             choices,
             tracing.extend_trace_label(trace_label, "choices"),
             columns=[None, trace_choice_name],
         )
-        tracing.trace_df(
+        whale.trace_df(
             rands,
             tracing.extend_trace_label(trace_label, "rands"),
             columns=[None, "rand"],
@@ -941,6 +943,7 @@ def interaction_simulate(
     ) in chunk.adaptive_chunked_choosers(whale, choosers, trace_label):

         choices = _interaction_simulate(
+            whale,
             chooser_chunk,
             alternatives,
             spec,
diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py
index 6f283ef7d..6de35de91 100644
--- a/activitysim/core/pathbuilder.py
+++ b/activitysim/core/pathbuilder.py
@@ -114,7 +114,7 @@ def __init__(self, network_los):

     def trace_df(self, df, trace_label, extension):
         assert len(df) > 0
-        tracing.trace_df(
+        whale.trace_df(
             df,
             label=tracing.extend_trace_label(trace_label, extension),
             slicer="NONE",
diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py
index ca848ac54..6a64e5a99 100644
--- a/activitysim/core/simulate.py
+++ b/activitysim/core/simulate.py
@@ -682,14 +682,14 @@ def eval_utilities(
             expression_values_df = None

         if expression_values_sh is not None:
-            tracing.trace_df(
+            whale.trace_df(
                 expression_values_sh,
                 tracing.extend_trace_label(trace_label, "expression_values_sh"),
                 slicer=None,
                 transpose=False,
             )
         if expression_values_df is not None:
-            tracing.trace_df(
+            whale.trace_df(
                 expression_values_df,
                 tracing.extend_trace_label(trace_label, "expression_values"),
                 slicer=None,
@@ -700,7 +700,7 @@ def eval_utilities(

             for c in spec.columns:
                 name = f"expression_value_{c}"
-                tracing.trace_df(
+                whale.trace_df(
                     expression_values_df.multiply(spec[c].values, axis=0),
                     tracing.extend_trace_label(trace_label, name),
                     slicer=None,
@@ -1137,7 +1137,7 @@ def eval_mnl(
     have_trace_targets = tracing.has_trace_targets(whale, choosers)

     if have_trace_targets:
-        tracing.trace_df(choosers, "%s.choosers" % trace_label)
+        whale.trace_df(choosers, "%s.choosers" % trace_label)

     utilities = eval_utilities(
         whale,
@@ -1154,7 +1154,7 @@ def eval_mnl(
     chunk_sizer.log_df(trace_label, "utilities", utilities)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             utilities,
             "%s.utilities" % trace_label,
             column_labels=["alternative", "utility"],
@@ -1170,7 +1170,7 @@ def eval_mnl(

     if have_trace_targets:
         # report these now in case make_choices throws error on bad_choices
-        tracing.trace_df(
+        whale.trace_df(
             probs,
             "%s.probs" % trace_label,
             column_labels=["alternative", "probability"],
@@ -1185,10 +1185,10 @@ def eval_mnl(
     chunk_sizer.log_df(trace_label, "probs", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             choices, "%s.choices" % trace_label, columns=[None, trace_choice_name]
         )
-        tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"])
+        whale.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"])

     return choices

@@ -1253,7 +1253,7 @@ def eval_nl(
     logit.validate_nest_spec(nest_spec, trace_label)

     if have_trace_targets:
-        tracing.trace_df(choosers, "%s.choosers" % trace_label)
+        whale.trace_df(choosers, "%s.choosers" % trace_label)

     choosers, spec_sh = _preprocess_tvpb_logsums_on_choosers(choosers, spec, locals_d)
@@ -1273,7 +1273,7 @@ def eval_nl(
     chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             raw_utilities,
             "%s.raw_utilities" % trace_label,
             column_labels=["alternative", "utility"],
@@ -1287,7 +1287,7 @@ def eval_nl(
     chunk_sizer.log_df(trace_label, "raw_utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             nested_exp_utilities,
             "%s.nested_exp_utilities" % trace_label,
             column_labels=["alternative", "utility"],
@@ -1308,7 +1308,7 @@ def eval_nl(
     chunk_sizer.log_df(trace_label, "nested_exp_utilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             nested_probabilities,
             "%s.nested_probabilities" % trace_label,
             column_labels=["alternative", "probability"],
@@ -1324,7 +1324,7 @@ def eval_nl(
     chunk_sizer.log_df(trace_label, "nested_probabilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             base_probabilities,
             "%s.base_probabilities" % trace_label,
             column_labels=["alternative", "probability"],
@@ -1360,12 +1360,12 @@ def eval_nl(
     chunk_sizer.log_df(trace_label, "base_probabilities", None)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             choices, "%s.choices" % trace_label, columns=[None, trace_choice_name]
         )
-        tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"])
+        whale.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"])

         if want_logsums:
-            tracing.trace_df(
+            whale.trace_df(
                 logsums, "%s.logsums" % trace_label, columns=[None, "logsum"]
             )
@@ -1630,7 +1630,7 @@ def eval_mnl_logsums(

     # trace choosers
     if have_trace_targets:
-        tracing.trace_df(choosers, "%s.choosers" % trace_label)
+        whale.trace_df(choosers, "%s.choosers" % trace_label)

     utilities = eval_utilities(
         whale,
@@ -1644,7 +1644,7 @@ def eval_mnl_logsums(
     chunk_sizer.log_df(trace_label, "utilities", utilities)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             utilities,
             "%s.raw_utilities" % trace_label,
             column_labels=["alternative", "utility"],
@@ -1658,7 +1658,7 @@ def eval_mnl_logsums(

     # trace utilities
     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"]
         )

@@ -1772,7 +1772,7 @@ def eval_nl_logsums(

     # trace choosers
     if have_trace_targets:
-        tracing.trace_df(choosers, "%s.choosers" % trace_label)
+        whale.trace_df(choosers, "%s.choosers" % trace_label)

     raw_utilities = eval_utilities(
         whale,
@@ -1787,7 +1787,7 @@ def eval_nl_logsums(
     chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities)

     if have_trace_targets:
-        tracing.trace_df(
+        whale.trace_df(
             raw_utilities,
             "%s.raw_utilities" % trace_label,
             column_labels=["alternative", "utility"],
@@ -1808,12 +1808,12 @@ def eval_nl_logsums(

     if have_trace_targets:
         # add logsum to nested_exp_utilities for tracing
         nested_exp_utilities["logsum"] = logsums
-        tracing.trace_df(
+        whale.trace_df(
             nested_exp_utilities,
             "%s.nested_exp_utilities" % trace_label,
             column_labels=["alternative", "utility"],
         )
-        tracing.trace_df(
+        whale.trace_df(
             logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"]
         )
diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py
index 08b1a4a79..2f384c1d4 100644
--- a/activitysim/core/tracing.py
+++ b/activitysim/core/tracing.py
@@ -9,14 +9,13 @@
 import time
 from builtins import next, range
 from collections import OrderedDict
+from typing import Optional

 import numpy as np
 import pandas as pd
 import yaml

-from activitysim.core import workflow
-
-from . import config
+from activitysim.core import config, workflow

 # Configurations
 ASIM_LOGGER = "activitysim"
@@ -673,19 +672,25 @@ def trace_id_for_chooser(id, choosers):
     return hh_id, column_name


-def dump_df(dump_switch, df, trace_label, fname):
+def dump_df(whale: workflow.Whale, dump_switch, df, trace_label, fname):
     if dump_switch:
         trace_label = extend_trace_label(trace_label, "DUMP.%s" % fname)
         trace_df(
-            df, trace_label, index_label=df.index.name, slicer="NONE", transpose=False
+            whale,
+            df,
+            trace_label,
+            index_label=df.index.name,
+            slicer="NONE",
+            transpose=False,
         )


 def trace_df(
+    whale: workflow.Whale,
     df: pd.DataFrame,
     label: str,
     slicer=None,
-    columns=None,
+    columns: Optional[list[str]] = None,
     index_label=None,
     column_labels=None,
     transpose=True,
@@ -696,6 +701,7 @@ def trace_df(

     Parameters
     ----------
+    whale: workflow.Whale
     df: pandas.DataFrame
         traced dataframe
     label: str
@@ -718,7 +724,7 @@ def trace_df(
     Nothing
     """

-    target_ids, column = get_trace_target(df, slicer)
+    target_ids, column = get_trace_target(whale, df, slicer)

     if target_ids is not None:
         df = slice_ids(df, target_ids, column)
diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py
index 856b78bf0..3ce0cbe38 100644
--- a/activitysim/core/workflow/state.py
+++ b/activitysim/core/workflow/state.py
@@ -5,7 +5,7 @@
 import os
 from builtins import map, next
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional

 import pandas as pd
 import xarray as xr
@@ -254,11 +254,37 @@ def get(self, key, default: Any = NO_DEFAULT):
         return result

     def set(self, key, value):
+        """
+        Set a new value for a key in the context.
+
+        Also removes from the context all other keys predicated on this key.
+        They can be regenerated later (from fresh inputs) if needed.
+
+        Parameters
+        ----------
+        key : str
+        """
         self.context[key] = value
         for i in self._PREDICATES.get(key, []):
             if i in self.context:
                 logger.critical(f"update of {key} clears cached {i}")
-                del self.context[i]
+                self.drop(i)
+
+    def drop(self, key):
+        """
+        Remove a key from the context.
+
+        Also removes from the context all other keys predicated on this key.
+
+        Parameters
+        ----------
+        key : str
+        """
+        del self.context[key]
+        for i in self._PREDICATES.get(key, []):
+            if i in self.context:
+                logger.critical(f"dropping {key} clears cached {i}")
+                self.drop(i)

     def extract(self, func):
         return func(self)
@@ -1114,7 +1140,6 @@ def extend_table(self, table_name, df, axis=0):
         return df

     def drop_table(self, table_name):
-
         if self.is_table(table_name):
             logger.debug("drop_table dropping orca table '%s'" % table_name)
             self.context.pop(table_name, None)
@@ -1214,3 +1239,55 @@ def get_output_file_path(self, file_name: str) -> Path:
         if prefix:
             file_name = "%s-%s" % (prefix, file_name)
         return self.filesystem.get_output_dir().joinpath(file_name)
+
+    def trace_df(
+        self,
+        df: pd.DataFrame,
+        label: str,
+        slicer=None,
+        columns: Optional[list[str]] = None,
+        index_label=None,
+        column_labels=None,
+        transpose=True,
+        warn_if_empty=False,
+    ):
+        """
+        Slice dataframe by traced household or person id dataframe and write to CSV
+
+        Parameters
+        ----------
+        df: pandas.DataFrame
+            traced dataframe
+        label: str
+            tracer name
+        slicer: Object
+            slicer for subsetting
+        columns: list
+            columns to write
+        index_label: str
+            index name
+        column_labels: [str, str]
+            labels for columns in csv
+        transpose: boolean
+            whether to transpose file for legibility
+        warn_if_empty: boolean
+            write warning if sliced df is empty
+        """
+        from activitysim.core.tracing import trace_df
+
+        return trace_df(
+            self,
+            df,
+            label,
+            slicer=slicer,
+            columns=columns,
+            index_label=index_label,
+            column_labels=column_labels,
+            transpose=transpose,
+            warn_if_empty=warn_if_empty,
+        )
+
+    def dump_df(self, dump_switch, df, trace_label, fname):
+        from activitysim.core.tracing import dump_df
+
+        return dump_df(self, dump_switch, df, trace_label, fname)
diff --git a/docs/howitworks.rst b/docs/howitworks.rst
index 91046053c..65e712729 100644
--- a/docs/howitworks.rst
+++ b/docs/howitworks.rst
@@ -2,7 +2,7 @@
 How the System Works
 ====================

-This page describes how the software works, how multiprocessing works, and the primary example model data schema. The code snippets below may not exactly match the latest version of the software, but they are close enough to illustrate how the system works.
+This page describes how the software works, how multiprocessing works, and the primary example model data schema. The code snippets below may not exactly match the latest version of the software, but they are close enough to illustrate how the system works.

 .. _how_the_system_works:

@@ -209,7 +209,7 @@ as well. The various calls also setup logging, tracing, stable random number ma

     if trace_hh_id:
         tracing.register_traceable_table('persons', df)
-        tracing.trace_df(df, "raw.persons", warn_if_empty=True)
+        whale.trace_df(df, "raw.persons", warn_if_empty=True)

     return df

@@ -1355,7 +1355,7 @@ Skims are named ___