Refactor initialization for simpler maintenance (#730)
* Split format_data into functions

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>

* Reorder format_data

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>

* Split out format_data_to_series from format_data

* Fix unused import

* Avoid copying data

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>

* Add function doc-strings

* Update release notes

* Apply suggestions from code review

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Update pyam/utils.py

---------

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>
Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
3 people authored Feb 24, 2023
1 parent 7a97516 commit 128fb19
Showing 3 changed files with 86 additions and 58 deletions.
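
For context, the code path touched here is `IamDataFrame` initialization from pandas objects. Below is a minimal, hypothetical usage sketch (not part of this commit; the column layout follows the IAMC convention used throughout pyam) of the path exercised by the refactored `format_data`:

```python
import pandas as pd
import pyam

# A pandas.Series with a complete IAMC-style index; per the pyam/utils.py diff
# below, the refactored format_data() renames an unnamed series to "value" and
# resets the index instead of copying the data first.
index = pd.MultiIndex.from_tuples(
    [
        ("model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 2005),
        ("model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 2010),
    ],
    names=["model", "scenario", "region", "variable", "unit", "year"],
)
df = pyam.IamDataFrame(pd.Series([1.0, 6.0], index=index))
```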
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
@@ -1,5 +1,6 @@
# Next Release

- [#730](https://github.com/IAMconsortium/pyam/pull/730) Refactor initialization code
- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
- [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute

4 changes: 2 additions & 2 deletions pyam/core.py
@@ -162,8 +162,8 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
_data = read_file(data, index=index, **kwargs)

# cast data from pandas
elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
_data = format_data(data.copy(), index=index, **kwargs)
elif isinstance(data, (pd.DataFrame, pd.Series)):
_data = format_data(data, index=index, **kwargs)

# unsupported `data` args
elif islistable(data):
139 changes: 83 additions & 56 deletions pyam/utils.py
@@ -12,11 +12,6 @@
import pandas as pd
from collections.abc import Iterable

try:
import seaborn as sns
except ImportError:
pass

logger = logging.getLogger(__name__)

# common indices
@@ -181,14 +176,10 @@ def read_file(path, *args, **kwargs):
return format_data(read_pandas(path, *args, **kwargs), **format_kwargs)


def format_data(df, index, **kwargs):
"""Convert a pandas.Dataframe or pandas.Series to the required format"""
if isinstance(df, pd.Series):
df.name = df.name or "value"
df = df.to_frame()
def _convert_r_columns(df):
"""Check and convert R-style year columns"""

# check for R-style year columns, converting where necessary
def convert_r_columns(c):
def strip_R_integer_prefix(c):
try:
first = c[0]
second = c[1:]
@@ -204,7 +195,11 @@ def convert_r_columns(c):
pass
return c

df.columns = df.columns.map(convert_r_columns)
return df.set_axis(df.columns.map(strip_R_integer_prefix), axis="columns")


def _knead_data(df, **kwargs):
"""Replace, rename and concat according to user arguments"""

# if `value` is given but not `variable`,
# melt value columns and use column name as `variable`
Expand Down Expand Up @@ -236,26 +231,26 @@ def convert_r_columns(c):
else:
raise ValueError(f"Invalid argument for casting `{col}: {value}`")

# all lower case
str_cols = [c for c in df.columns if isstr(c)]
df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

if "notes" in df.columns: # this came from the database
logger.info("Ignoring notes column in dataframe")
df.drop(columns="notes", inplace=True)
col = df.columns[0] # first column has database copyright notice
df = df[~df[col].str.contains("database", case=False)]
if "scenario" in df.columns and "model" not in df.columns:
# model and scenario are jammed together in RCP data
scen = df["scenario"]
df.loc[:, "model"] = scen.apply(lambda s: s.split("-")[0].strip())
df.loc[:, "scenario"] = scen.apply(
lambda s: "-".join(s.split("-")[1:]).strip()
)
return df


def _format_from_legacy_database(df):
"""Process data from legacy databases (SSP and earlier)"""

logger.info("Ignoring notes column in `data`")
df.drop(columns="notes", inplace=True)
col = df.columns[0] # first column has database copyright notice
df = df[~df[col].str.contains("database", case=False)]
if "scenario" in df.columns and "model" not in df.columns:
# model and scenario are jammed together in RCP data
parts = df["scenario"].str.split("-", n=1, expand=True)
df = df.assign(model=parts[0].str.strip(), scenario=parts[1].str.strip())

# reset the index if meaningful entries are included there
if not list(df.index.names) == [None]:
df.reset_index(inplace=True)
return df


def _intuit_column_groups(df, index):
"""Check and categorise columns in dataframe"""

# check that there is no column in the timeseries data with reserved names
conflict_cols = [i for i in df.columns if i in ILLEGAL_COLS]
@@ -276,7 +271,6 @@ def convert_r_columns(c):

# check whether data in wide format (standard IAMC) or long format (`value` column)
if "value" in df.columns:

# check if time column is given as `year` (int) or `time` (datetime)
if "year" in df.columns and "time" not in df.columns:
time_col = "year"
@@ -289,19 +283,8 @@ def convert_r_columns(c):
for c in df.columns
if c not in index + REQUIRED_COLS + [time_col, "value"]
]

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to pd.Series
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value
df.dropna(inplace=True)

data_cols = []
else:

# if in wide format, check if columns are years (int) or datetime
cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
year_cols, time_cols, extra_cols = [], [], []
@@ -323,23 +306,67 @@ def convert_r_columns(c):

if year_cols and not time_cols:
time_col = "year"
melt_cols = sorted(year_cols)
data_cols = sorted(year_cols)
else:
time_col = "time"
melt_cols = sorted(year_cols) + sorted(time_cols)
if not melt_cols:
data_cols = sorted(year_cols) + sorted(time_cols)
if not data_cols:
raise ValueError("Missing time domain")

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""
return time_col, extra_cols, data_cols


def _format_data_to_series(df, index):
"""Convert a long or wide pandas dataframe to a series with the required columns"""

time_col, extra_cols, data_cols = _intuit_column_groups(df, index)

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

idx_order = index + REQUIRED_COLS + [time_col] + extra_cols

if data_cols:
# wide format
df = (
df.set_index(index + REQUIRED_COLS + extra_cols)
.rename_axis(columns=time_col)
.stack(dropna=True)
.rename("value")
.reorder_levels(idx_order)
)
else:
# long format
df = df.set_index(idx_order)["value"].dropna()

return df, time_col, extra_cols


def format_data(df, index, **kwargs):
"""Convert a pandas.Dataframe or pandas.Series to the required format"""

if isinstance(df, pd.Series):
if not df.name:
df = df.rename("value")
df = df.reset_index()
elif not list(df.index.names) == [None]:
# reset the index if meaningful entries are included there
df = df.reset_index()

df = _convert_r_columns(df)

if kwargs:
df = _knead_data(df, **kwargs)

# cast all column names to lower case
df.rename(columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True)

if "notes" in df.columns: # this came from a legacy database (SSP or earlier)
df = _format_from_legacy_database(df)

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
# replace missing units by an empty string for user-friendly filtering
df = df.assign(unit=df["unit"].fillna(""))

# cast to long format, set
df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
df = df.stack(dropna=True)
df.name = "value"
df.index.names = df.index.names[:-1] + [time_col]
df, time_col, extra_cols = _format_data_to_series(df, index)

# cast value column to numeric
try:
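
As a usage note (hypothetical example, not part of the commit): the wide-to-long cast that previously lived inline in `format_data` now sits in `_format_data_to_series` and `_intuit_column_groups`. Assuming `REQUIRED_COLS` is `["region", "variable", "unit"]`, as used elsewhere in pyam, the new helper reshapes a wide IAMC table like this:

```python
import pandas as pd
from pyam.utils import _format_data_to_series  # added in this refactoring

# A wide-format (IAMC-style) table: the integer year columns are detected as
# the time domain by _intuit_column_groups, then stacked into a long pd.Series.
wide = pd.DataFrame(
    [
        ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 1.0, 6.0],
        ["model_a", "scen_a", "World", "Primary Energy|Coal", "EJ/yr", 0.5, 3.0],
    ],
    columns=["model", "scenario", "region", "variable", "unit", 2005, 2010],
)

series, time_col, extra_cols = _format_data_to_series(wide, index=["model", "scenario"])
# series: pd.Series named "value", indexed by
#   model / scenario / region / variable / unit / year
# time_col == "year", extra_cols == []
```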
