Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue/empty string as nan in attribute #1045

Merged
merged 6 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 100 additions & 15 deletions TM1py/Services/CellService.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from concurrent.futures.thread import ThreadPoolExecutor
from contextlib import suppress
from io import StringIO
from typing import List, Union, Dict, Iterable, Tuple, Optional
from typing import List, Union, Dict, Iterable, Tuple, Optional, Any

import ijson
from mdxpy import MdxHierarchySet, MdxBuilder, Member, MdxTuple
Expand Down Expand Up @@ -2240,21 +2240,24 @@ def execute_mdx_csv(self, mdx: Union[str, MdxBuilder], top: int = None, skip: in
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
csv_dialect=csv_dialect, line_separator=line_separator, value_separator=value_separator,
sandbox_name=sandbox_name, include_attributes=include_attributes, mdx_headers=mdx_headers, **kwargs)
sandbox_name=sandbox_name, include_attributes=include_attributes, mdx_headers=mdx_headers,
**kwargs)

return self.extract_cellset_csv(
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
csv_dialect=csv_dialect, line_separator=line_separator, value_separator=value_separator,
sandbox_name=sandbox_name, include_attributes=include_attributes,
use_compact_json=use_compact_json, mdx_headers=mdx_headers, **kwargs)
use_compact_json=use_compact_json, mdx_headers=mdx_headers,
**kwargs)

def execute_view_csv(self, cube_name: str, view_name: str, private: bool = False, top: int = None, skip: int = None,
skip_zeros: bool = True, skip_consolidated_cells: bool = False,
skip_rule_derived_cells: bool = False, csv_dialect: 'csv.Dialect' = None,
line_separator: str = "\r\n", value_separator: str = ",", sandbox_name: str = None,
use_iterative_json: bool = False, use_compact_json: bool = False, use_blob: bool = False,
arranged_axes: Tuple[List, List, List] = None, mdx_headers: bool = False, **kwargs) -> str:
arranged_axes: Tuple[List, List, List] = None, mdx_headers: bool = False,
**kwargs) -> str:
""" Optimized for performance. Get csv string of coordinates and values.

:param cube_name: String, name of the cube
Expand All @@ -2278,7 +2281,7 @@ def execute_view_csv(self, cube_name: str, view_name: str, private: bool = False
Allows function to skip retrieval of cellset composition.
E.g.: arranged_axes=(["Year"], ["Region","Product"], ["Period", "Version"])
:param mdx_headers: boolean, fully qualified hierarchy name as header instead of simple dimension name
:return: String
:return: dict, String
"""
if use_blob:
if use_iterative_json:
Expand Down Expand Up @@ -2360,6 +2363,10 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
sandbox_name: str = None, include_attributes: bool = False,
use_iterative_json: bool = False, use_compact_json: bool = False,
use_blob: bool = False, shaped: bool = False, mdx_headers: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
**kwargs) -> 'pd.DataFrame':
""" Optimized for performance. Get Pandas DataFrame from MDX Query.

Expand All @@ -2383,13 +2390,28 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
:param use_blob: Has better performance on datasets > 1M cells and lower memory footprint in any case.
:param shaped: preserve shape of view/mdx in data frame
:param mdx_headers: boolean, fully qualified hierarchy name as header instead of simple dimension name
:param fillna_numeric_attributes: boolean, fills empty numerical attributes with fillna_numeric_attributes_value
:param fillna_string_attributes: boolean, fills empty string attributes with fillna_string_attributes_value
:param fillna_numeric_attributes_value: Any, value with which to replace na if fillna_numeric_attributes is True
:param fillna_string_attributes_value: Any, value with which to replace na if fillna_string_attributes is True
:return: Pandas Dataframe
"""
if (fillna_numeric_attributes or fillna_string_attributes) and not include_attributes:
raise ValueError('Include attributes must be True if fillna_numeric or fillna_string is True.')

# necessary to assure column order in line with cube view
if shaped:
skip_zeros = False

if use_blob:
if any([
fillna_numeric_attributes,
fillna_numeric_attributes_value,
fillna_string_attributes,
fillna_string_attributes_value]
):
raise ValueError("fillna attributes' feature must not be used with use_blob as True")

raw_csv = self.execute_mdx_csv(
mdx=mdx,
top=top,
Expand All @@ -2412,7 +2434,12 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
skip_rule_derived_cells=skip_rule_derived_cells,
sandbox_name=sandbox_name, include_attributes=include_attributes,
use_iterative_json=use_iterative_json, use_compact_json=use_compact_json,
shaped=shaped, mdx_headers=mdx_headers, **kwargs)
shaped=shaped, mdx_headers=mdx_headers,
fillna_numeric_attributes=fillna_numeric_attributes,
fillna_numeric_attributes_value=fillna_numeric_attributes_value,
fillna_string_attributes=fillna_string_attributes,
fillna_string_attributes_value=fillna_string_attributes_value,
**kwargs)

@require_pandas
def execute_mdx_dataframe_async(self, mdx_list: List[Union[str, MdxBuilder]], max_workers: int = 8,
Expand Down Expand Up @@ -3158,7 +3185,7 @@ def extract_cellset_raw(
skip_contexts=skip_contexts,
include_hierarchies=include_hierarchies,
sandbox_name=sandbox_name,
**kwargs)
**{**kwargs, 'delete_cellset': False})
cells = self.extract_cellset_cells_raw(cellset_id=cellset_id,
cell_properties=cell_properties,
top=top,
Expand All @@ -3173,6 +3200,7 @@ def extract_cellset_raw(
# Combine metadata and cells back into a single object
return {**metadata, **cells}

@tidy_cellset
def extract_cellset_metadata_raw(
self,
cellset_id: str,
Expand All @@ -3183,6 +3211,7 @@ def extract_cellset_metadata_raw(
skip_contexts: bool = False,
include_hierarchies: bool = False,
sandbox_name: str = None,
delete_cellset: bool = False,
**kwargs):

# select Name property if member_properties is None or empty.
Expand Down Expand Up @@ -3618,6 +3647,7 @@ def extract_cellset_composition(
url = "/Cellsets('{}')?$expand=" \
"Cube($select=Name)," \
"Axes($expand=Hierarchies($select=UniqueName))".format(cellset_id)

url = add_url_parameters(url, **{"!sandbox": sandbox_name})
response = self._rest.GET(url=url, **kwargs)
response_json = response.json()
Expand Down Expand Up @@ -3686,15 +3716,13 @@ def extract_cellset_csv(
:param mdx_headers: boolean. Fully qualified hierarchy name as header instead of simple dimension name
:return: Raw format from TM1.
"""
if 'delete_cellset' in kwargs:
delete_cellset = kwargs.pop('delete_cellset')
else:
delete_cellset = True
delete_cellset = kwargs.pop('delete_cellset', True)

_, _, rows, columns = self.extract_cellset_composition(
cube, _, rows, columns = self.extract_cellset_composition(
cellset_id,
delete_cellset=False,
sandbox_name=sandbox_name, **kwargs)
sandbox_name=sandbox_name,
**kwargs)

cellset_dict = self.extract_cellset_raw(
cellset_id,
Expand Down Expand Up @@ -3804,6 +3832,7 @@ def extract_cellset_csv_iter_json(
'Axes.item.Ordinal']

attributes_prefixes = set()
attributes_by_dimension = None
if include_attributes:
attributes_by_dimension = self._get_attributes_by_dimension(cube)
for _, attributes in attributes_by_dimension.items():
Expand Down Expand Up @@ -3903,6 +3932,10 @@ def extract_cellset_dataframe(
use_compact_json: bool = False,
shaped: bool = False,
mdx_headers: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
**kwargs) -> 'pd.DataFrame':
""" Build pandas data frame from cellset_id

Expand Down Expand Up @@ -3934,9 +3967,61 @@ def extract_cellset_dataframe(
cellset_id=cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_rule_derived_cells=skip_rule_derived_cells, skip_consolidated_cells=skip_consolidated_cells,
value_separator='~', sandbox_name=sandbox_name, include_attributes=include_attributes,
use_compact_json=use_compact_json, mdx_headers=mdx_headers, **kwargs)
use_compact_json=use_compact_json, mdx_headers=mdx_headers,
# don't delete the cellset if attribute types must be retrieved later
# NOTE(review): the conditions here and below repeat fillna_string_attributes twice;
# `fillna_numeric_attributes or fillna_string_attributes` appears intended — confirm
delete_cellset=not any([fillna_string_attributes, fillna_string_attributes]), **kwargs)

attribute_types_by_dimension = None
if fillna_string_attributes or fillna_string_attributes:
attribute_types_by_dimension = self._extract_attribute_types_by_dimension(
cellset_id=cellset_id,
sandbox_name=sandbox_name,
delete_cellset=True,
**kwargs)

return build_dataframe_from_csv(raw_csv, sep="~", shaped=shaped,
fillna_numeric_attributes=fillna_numeric_attributes,
fillna_string_attributes=fillna_string_attributes,
fillna_numeric_attributes_value=fillna_numeric_attributes_value,
fillna_string_attributes_value=fillna_string_attributes_value,
attribute_types_by_dimension=attribute_types_by_dimension, **kwargs)

def _extract_attribute_types_by_dimension(self, cellset_id: str, sandbox_name: str, delete_cellset: bool, **kwargs):
attribute_types_by_dimension = {}

return build_dataframe_from_csv(raw_csv, sep="~", shaped=shaped, **kwargs)
_, _, rows, columns = self.extract_cellset_composition(
cellset_id,
delete_cellset=False,
sandbox_name=sandbox_name, **kwargs)

metadata = self.extract_cellset_metadata_raw(
cellset_id=cellset_id,
elem_properties=['Name'],
member_properties=['Name', 'Attributes'],
top=1,
skip=0,
skip_contexts=True,
include_hierarchies=False,
sandbox_name=sandbox_name,
delete_cellset=delete_cellset,
**kwargs)
# gets the attribute names from the first member from the first tuple of each axis.
attributes_by_dimension = dict(zip(
rows + columns,
[list(member['Attributes'].keys()) for axes in metadata['Axes'][::-1] for member in
axes['Tuples'][0]['Members']]))
element_service = self.get_element_service()
for dimension in rows + columns:
attribute_types_by_dimension[dimension] = element_service.get_element_types(
'}ElementAttributes_' + dimension.split('].[')[0][1:],
'}ElementAttributes_' + dimension.split('].[')[0][1:])

attribute_types_by_dimension[dimension] = {
attribute_name: attribute_type for attribute_name, attribute_type in
attribute_types_by_dimension[dimension].items()
if attribute_name in attributes_by_dimension[dimension]}

return attribute_types_by_dimension

@tidy_cellset
@require_pandas
Expand Down
31 changes: 30 additions & 1 deletion TM1py/Utils/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,13 @@ def build_csv_from_cellset_dict(
return csv_content.getvalue().strip()


def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False, **kwargs) -> 'pd.DataFrame':
def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False,
fillna_numeric_attributes: bool = False,
fillna_numeric_attributes_value: Any = 0,
fillna_string_attributes: bool = False,
fillna_string_attributes_value: Any = '',
attribute_types_by_dimension: Dict[str, Dict[str, str]] | None = None,
**kwargs) -> 'pd.DataFrame':
if not raw_csv:
return pd.DataFrame()

Expand All @@ -542,11 +548,34 @@ def build_dataframe_from_csv(raw_csv, sep='~', shaped: bool = False, **kwargs) -
kwargs['dtype'] = {'Value': None, **{col: str for col in range(999)}}
try:
df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)

except ValueError:
# retry with dtype 'str' for results with a mixed value column
kwargs['dtype'] = {'Value': str, **{col: str for col in range(999)}}
df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)

if fillna_numeric_attributes:
fill_numeric_bool_list = [attr_type.lower() == 'numeric' for dimension, attributes in
attribute_types_by_dimension.items()
for attr_type in [dimension] + list(attributes.values())]
fill_numeric_bool_list += [False] # for the value column
df = df.apply(
lambda col:
col.fillna(fillna_numeric_attributes_value) if fill_numeric_bool_list[
list(df.columns.values).index(col.name)] else col,
axis=0)

if fillna_string_attributes:
fill_string_bool_list = [attr_type.lower() == 'string' for dimension, attributes in
attribute_types_by_dimension.items()
for attr_type in [dimension] + list(attributes.values())]
fill_string_bool_list += [False] # for the value column
df = df.apply(
lambda col:
col.fillna(fillna_string_attributes_value) if fill_string_bool_list[
list(df.columns.values).index(col.name)] else col,
axis=0)

if not shaped:
return df

Expand Down
Loading