Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue/empty string as nan in attribute #1045

Merged
merged 6 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 100 additions & 15 deletions TM1py/Services/CellService.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from concurrent.futures.thread import ThreadPoolExecutor
from contextlib import suppress
from io import StringIO
from typing import List, Union, Dict, Iterable, Tuple, Optional
from typing import List, Union, Dict, Iterable, Tuple, Optional, Any

import ijson
from mdxpy import MdxHierarchySet, MdxBuilder, Member, MdxTuple
Expand Down Expand Up @@ -2240,21 +2240,24 @@ def execute_mdx_csv(self, mdx: Union[str, MdxBuilder], top: int = None, skip: in
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
csv_dialect=csv_dialect, line_separator=line_separator, value_separator=value_separator,
sandbox_name=sandbox_name, include_attributes=include_attributes, mdx_headers=mdx_headers, **kwargs)
sandbox_name=sandbox_name, include_attributes=include_attributes, mdx_headers=mdx_headers,
**kwargs)

return self.extract_cellset_csv(
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
csv_dialect=csv_dialect, line_separator=line_separator, value_separator=value_separator,
sandbox_name=sandbox_name, include_attributes=include_attributes,
use_compact_json=use_compact_json, mdx_headers=mdx_headers, **kwargs)
use_compact_json=use_compact_json, mdx_headers=mdx_headers,
**kwargs)

def execute_view_csv(self, cube_name: str, view_name: str, private: bool = False, top: int = None, skip: int = None,
skip_zeros: bool = True, skip_consolidated_cells: bool = False,
skip_rule_derived_cells: bool = False, csv_dialect: 'csv.Dialect' = None,
line_separator: str = "\r\n", value_separator: str = ",", sandbox_name: str = None,
use_iterative_json: bool = False, use_compact_json: bool = False, use_blob: bool = False,
arranged_axes: Tuple[List, List, List] = None, mdx_headers: bool = False, **kwargs) -> str:
arranged_axes: Tuple[List, List, List] = None, mdx_headers: bool = False,
**kwargs) -> str:
""" Optimized for performance. Get csv string of coordinates and values.

:param cube_name: String, name of the cube
Expand All @@ -2278,7 +2281,7 @@ def execute_view_csv(self, cube_name: str, view_name: str, private: bool = False
Allows function to skip retrieval of cellset composition.
E.g.: arranged_axes=(["Year"], ["Region","Product"], ["Period", "Version"])
:param mdx_headers: boolean, fully qualified hierarchy name as header instead of simple dimension name
:return: String
:return: dict, String
"""
if use_blob:
if use_iterative_json:
Expand Down Expand Up @@ -2360,6 +2363,10 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
sandbox_name: str = None, include_attributes: bool = False,
use_iterative_json: bool = False, use_compact_json: bool = False,
use_blob: bool = False, shaped: bool = False, mdx_headers: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
**kwargs) -> 'pd.DataFrame':
""" Optimized for performance. Get Pandas DataFrame from MDX Query.

Expand All @@ -2383,13 +2390,28 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
:param use_blob: Has better performance on datasets > 1M cells and lower memory footprint in any case.
:param shaped: preserve shape of view/mdx in data frame
:param mdx_headers: boolean, fully qualified hierarchy name as header instead of simple dimension name
:param fillna_numeric_attributes: boolean, fills empty numerical attributes with fillna_numeric_attributes_value
:param fillna_string_attributes: boolean, fills empty string attributes with fillna_string_attributes_value
:param fillna_numeric_attributes_value: Any, value with which to replace na if fillna_numeric_attributes is True
:param fillna_string_attributes_value: Any, value with which to replace na if fillna_string_attributes is True
:return: Pandas Dataframe
"""
if (fillna_numeric_attributes or fillna_string_attributes) and not include_attributes:
raise ValueError('Include attributes must be True if fillna_numeric or fillna_string is True.')

# necessary to assure column order in line with cube view
if shaped:
skip_zeros = False

if use_blob:
if any([
fillna_numeric_attributes,
fillna_numeric_attributes_value,
fillna_string_attributes,
fillna_string_attributes_value]
):
raise ValueError("fillna attributes' feature must not be used with use_blob as True")

raw_csv = self.execute_mdx_csv(
mdx=mdx,
top=top,
Expand All @@ -2412,7 +2434,12 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
skip_rule_derived_cells=skip_rule_derived_cells,
sandbox_name=sandbox_name, include_attributes=include_attributes,
use_iterative_json=use_iterative_json, use_compact_json=use_compact_json,
shaped=shaped, mdx_headers=mdx_headers, **kwargs)
shaped=shaped, mdx_headers=mdx_headers,
fillna_numeric_attributes=fillna_numeric_attributes,
fillna_numeric_attributes_value=fillna_numeric_attributes_value,
fillna_string_attributes=fillna_string_attributes,
fillna_string_attributes_value=fillna_string_attributes_value,
**kwargs)

@require_pandas
def execute_mdx_dataframe_async(self, mdx_list: List[Union[str, MdxBuilder]], max_workers: int = 8,
Expand Down Expand Up @@ -3158,7 +3185,7 @@ def extract_cellset_raw(
skip_contexts=skip_contexts,
include_hierarchies=include_hierarchies,
sandbox_name=sandbox_name,
**kwargs)
**{**kwargs, 'delete_cellset': False})
cells = self.extract_cellset_cells_raw(cellset_id=cellset_id,
cell_properties=cell_properties,
top=top,
Expand All @@ -3173,6 +3200,7 @@ def extract_cellset_raw(
# Combine metadata and cells back into a single object
return {**metadata, **cells}

@tidy_cellset
def extract_cellset_metadata_raw(
self,
cellset_id: str,
Expand All @@ -3183,6 +3211,7 @@ def extract_cellset_metadata_raw(
skip_contexts: bool = False,
include_hierarchies: bool = False,
sandbox_name: str = None,
delete_cellset: bool = False,
**kwargs):

# select Name property if member_properties is None or empty.
Expand Down Expand Up @@ -3618,6 +3647,7 @@ def extract_cellset_composition(
url = "/Cellsets('{}')?$expand=" \
"Cube($select=Name)," \
"Axes($expand=Hierarchies($select=UniqueName))".format(cellset_id)

url = add_url_parameters(url, **{"!sandbox": sandbox_name})
response = self._rest.GET(url=url, **kwargs)
response_json = response.json()
Expand Down Expand Up @@ -3686,15 +3716,13 @@ def extract_cellset_csv(
:param mdx_headers: boolean. Fully qualified hierarchy name as header instead of simple dimension name
:return: Raw format from TM1.
"""
if 'delete_cellset' in kwargs:
delete_cellset = kwargs.pop('delete_cellset')
else:
delete_cellset = True
delete_cellset = kwargs.pop('delete_cellset', True)

_, _, rows, columns = self.extract_cellset_composition(
cube, _, rows, columns = self.extract_cellset_composition(
cellset_id,
delete_cellset=False,
sandbox_name=sandbox_name, **kwargs)
sandbox_name=sandbox_name,
**kwargs)

cellset_dict = self.extract_cellset_raw(
cellset_id,
Expand Down Expand Up @@ -3804,6 +3832,7 @@ def extract_cellset_csv_iter_json(
'Axes.item.Ordinal']

attributes_prefixes = set()
attributes_by_dimension = None
if include_attributes:
attributes_by_dimension = self._get_attributes_by_dimension(cube)
for _, attributes in attributes_by_dimension.items():
Expand Down Expand Up @@ -3903,6 +3932,10 @@ def extract_cellset_dataframe(
use_compact_json: bool = False,
shaped: bool = False,
mdx_headers: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
**kwargs) -> 'pd.DataFrame':
""" Build pandas data frame from cellset_id

Expand Down Expand Up @@ -3934,9 +3967,61 @@ def extract_cellset_dataframe(
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
value_separator='~', sandbox_name=sandbox_name, include_attributes=include_attributes,
use_compact_json=use_compact_json, mdx_headers=mdx_headers, **kwargs)
use_compact_json=use_compact_json, mdx_headers=mdx_headers,
# don't delete the cellset if attribute types must be retrieved later
# NOTE(review): the conditions here and below repeat fillna_string_attributes twice;
# `fillna_numeric_attributes or fillna_string_attributes` appears intended — confirm
delete_cellset=not any([fillna_string_attributes, fillna_string_attributes]), **kwargs)

attribute_types_by_dimension = None
if fillna_string_attributes or fillna_string_attributes:
attribute_types_by_dimension = self._extract_attribute_types_by_dimension(
cellset_id=cellset_id,
sandbox_name=sandbox_name,
delete_cellset=True,
**kwargs)

return build_dataframe_from_csv(raw_csv, sep="~", shaped=shaped,
fillna_numeric_attributes=fillna_numeric_attributes,
fillna_string_attributes=fillna_string_attributes,
fillna_numeric_attributes_value=fillna_numeric_attributes_value,
fillna_string_attributes_value=fillna_string_attributes_value,
attribute_types_by_dimension=attribute_types_by_dimension, **kwargs)

def _extract_attribute_types_by_dimension(self, cellset_id: str, sandbox_name: str, delete_cellset: bool, **kwargs):
attribute_types_by_dimension = {}

return build_dataframe_from_csv(raw_csv, sep="~", shaped=shaped, **kwargs)
_, _, rows, columns = self.extract_cellset_composition(
cellset_id,
delete_cellset=False,
sandbox_name=sandbox_name, **kwargs)

metadata = self.extract_cellset_metadata_raw(
cellset_id=cellset_id,
elem_properties=['Name'],
member_properties=['Name', 'Attributes'],
top=1,
skip=0,
skip_contexts=True,
include_hierarchies=False,
sandbox_name=sandbox_name,
delete_cellset=delete_cellset,
**kwargs)
# gets the attribute names from the first member from the first tuple of each axis.
attributes_by_dimension = dict(zip(
rows + columns,
[list(member['Attributes'].keys()) for axes in metadata['Axes'][::-1] for member in
axes['Tuples'][0]['Members']]))
element_service = self.get_element_service()
for dimension in rows + columns:
attribute_types_by_dimension[dimension] = element_service.get_element_types(
'}ElementAttributes_' + dimension.split('].[')[0][1:],
'}ElementAttributes_' + dimension.split('].[')[0][1:])

attribute_types_by_dimension[dimension] = {
attribute_name: attribute_type for attribute_name, attribute_type in
attribute_types_by_dimension[dimension].items()
if attribute_name in attributes_by_dimension[dimension]}

return attribute_types_by_dimension

@tidy_cellset
@require_pandas
Expand Down
31 changes: 30 additions & 1 deletion TM1py/Utils/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,13 @@ def build_csv_from_cellset_dict(
return csv_content.getvalue().strip()


def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False, **kwargs) -> 'pd.DataFrame':
def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
attribute_types_by_dimension: Dict[str, Dict[str, str]] | None = None,
**kwargs) -> 'pd.DataFrame':
if not raw_csv:
return pd.DataFrame()

Expand All @@ -542,11 +548,34 @@ def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False, **kwargs) -
kwargs['dtype'] = {'Value': None, **{col: str for col in range(999)}}
try:
df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)

except ValueError:
# retry with dtype 'str' for results with a mixed value column
kwargs['dtype'] = {'Value': str, **{col: str for col in range(999)}}
df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)

if fillna_numeric_attributes:
fill_numeric_bool_list = [attr_type.lower() == 'numeric' for dimension, attributes in
attribute_types_by_dimension.items()
for attr_type in [dimension] + list(attributes.values())]
fill_numeric_bool_list += [False] # for the value column
df = df.apply(
lambda col:
col.fillna(fillna_numeric_attributes_value) if fill_numeric_bool_list[
list(df.columns.values).index(col.name)] else col,
axis=0)

if fillna_string_attributes:
fill_string_bool_list = [attr_type.lower() == 'string' for dimension, attributes in
attribute_types_by_dimension.items()
for attr_type in [dimension] + list(attributes.values())]
fill_string_bool_list += [False] # for the value column
df = df.apply(
lambda col:
col.fillna(fillna_string_attributes_value) if fill_string_bool_list[
list(df.columns.values).index(col.name)] else col,
axis=0)

if not shaped:
return df

Expand Down
Loading