Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce shaped arg to execute_mdx_dataframe #893

Merged
merged 2 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 108 additions & 19 deletions TM1py/Services/CellService.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
abbreviate_mdx, build_csv_from_cellset_dict, require_version, require_pandas, build_cellset_from_pandas_dataframe, \
case_and_space_insensitive_equals, get_cube, resembles_mdx, require_admin, extract_compact_json_cellset, \
cell_is_updateable, build_mdx_from_cellset, build_mdx_and_values_from_cellset, \
dimension_names_from_element_unique_names, frame_to_significant_digits, build_dataframe_from_csv
dimension_names_from_element_unique_names, frame_to_significant_digits, build_dataframe_from_csv, \
drop_dimension_properties

try:
import pandas as pd
Expand Down Expand Up @@ -1945,7 +1946,7 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
skip_consolidated_cells: bool = False, skip_rule_derived_cells: bool = False,
sandbox_name: str = None, include_attributes: bool = False,
use_iterative_json: bool = False, use_compact_json: bool = False,
use_blob: bool = False, **kwargs) -> 'pd.DataFrame':
use_blob: bool = False, shaped: bool = False, **kwargs) -> 'pd.DataFrame':
""" Optimized for performance. Get Pandas DataFrame from MDX Query.

Takes all arguments from the pandas.read_csv method:
Expand All @@ -1963,6 +1964,7 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
Comes at a cost of 3-5% performance.
:param use_compact_json: bool
:param use_blob: Has better performance on datasets > 1M cells and lower memory footprint in any case.
:param shaped: preserve shape of view/mdx in data frame
:return: Pandas Dataframe
"""
if use_blob:
Expand All @@ -1978,49 +1980,126 @@ def execute_mdx_dataframe(self, mdx: Union[str, MdxBuilder], top: int = None, sk
line_separator="\r\n",
value_separator="~",
use_blob=use_blob)
return build_dataframe_from_csv(raw_csv, sep='~', **kwargs)

return build_dataframe_from_csv(raw_csv, sep='~', skip_zeros=skip_zeros, shaped=shaped, **kwargs)

cellset_id = self.create_cellset(mdx, sandbox_name=sandbox_name, **kwargs)
return self.extract_cellset_dataframe(cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_consolidated_cells=skip_consolidated_cells,
skip_rule_derived_cells=skip_rule_derived_cells,
sandbox_name=sandbox_name, include_attributes=include_attributes,
use_iterative_json=use_iterative_json, use_compact_json=use_compact_json,
shaped=shaped,
**kwargs)

@require_pandas
def execute_mdx_dataframe_shaped(self, mdx: str, sandbox_name: str = None, display_attribute: bool = False,
                                 use_iterative_json: bool = False, use_blob: bool = False,
                                 **kwargs) -> 'pd.DataFrame':
    """ Retrieves data from cube in the shape of the query.
    Dimensions on rows can be stacked. One dimension must be placed on columns. Title selections are ignored.

    :param mdx: a valid MDX query
    :param sandbox_name: str
    :param display_attribute: bool, show element name or first attribute from MDX PROPERTIES clause
    :param use_iterative_json: bool, iterative JSON parsing; lower memory footprint
    :param use_blob: bool, per SOURCE: better performance on large datasets, lower memory footprint
    :param kwargs: passed through to the underlying extraction call
    :return: pandas.DataFrame shaped like the MDX query
    :raises ValueError: on unsupported argument combinations
    """
    # attributes can only be resolved via the cellset-based (default) retrieval path
    if display_attribute and any([use_blob, use_iterative_json]):
        raise ValueError("When 'use_blob' or 'use_iterative_json' is True, 'display_attribute' must be False")

    # default case: create a cellset and extract it in shaped form
    if not any([use_blob, use_iterative_json]):
        cellset_id = self.create_cellset(
            mdx=mdx,
            sandbox_name=sandbox_name)
        return self.extract_cellset_dataframe_shaped(
            cellset_id=cellset_id,
            delete_cellset=True,
            sandbox_name=sandbox_name,
            display_attribute=display_attribute,
            **kwargs)

    # the two accelerated retrieval strategies are mutually exclusive
    if all([use_blob, use_iterative_json]):
        raise ValueError("'use_blob' and 'use_iterative_json' must not be used together")

    # ijson approach
    if use_iterative_json:
        return self.execute_mdx_dataframe(
            mdx=mdx,
            shaped=True,
            sandbox_name=sandbox_name,
            use_iterative_json=use_iterative_json,
            use_blob=False,
            **kwargs)

    # blob approach
    return self.execute_mdx_dataframe(
        mdx=mdx,
        shaped=True,
        sandbox_name=sandbox_name,
        use_blob=True,
        **kwargs)

@require_pandas
def execute_view_dataframe_shaped(self, cube_name: str, view_name: str, private: bool = False,
                                  sandbox_name: str = None, use_iterative_json: bool = False,
                                  use_blob: bool = False, **kwargs) -> 'pd.DataFrame':
    """ Retrieves data from cube in the shape of the query.
    Dimensions on rows can be stacked. One dimension must be placed on columns. Title selections are ignored.

    :param cube_name: name of the cube
    :param view_name: name of the view
    :param private: bool, whether the view is private
    :param sandbox_name: str
    :param use_iterative_json: bool, iterative JSON parsing; lower memory footprint
    :param use_blob: bool, per SOURCE: better performance on large datasets; requires a public view
    :param kwargs: passed through to the underlying extraction call
    :return: pandas.DataFrame shaped like the view
    :raises ValueError: on unsupported argument combinations
    """
    # default approach: create a cellset from the view and extract it in shaped form
    if not any([use_blob, use_iterative_json]):
        cellset_id = self.create_cellset_from_view(
            cube_name=cube_name,
            view_name=view_name,
            private=private,
            sandbox_name=sandbox_name)
        return self.extract_cellset_dataframe_shaped(
            cellset_id=cellset_id,
            delete_cellset=True,
            sandbox_name=sandbox_name,
            **kwargs)

    # the two accelerated retrieval strategies are mutually exclusive
    if all([use_blob, use_iterative_json]):
        raise ValueError("'use_blob' and 'use_iterative_json' must not be used together")

    # ijson approach
    if use_iterative_json:
        return self.execute_view_dataframe(
            cube_name=cube_name,
            view_name=view_name,
            private=private,
            shaped=True,
            sandbox_name=sandbox_name,
            use_iterative_json=use_iterative_json,
            use_blob=False,
            **kwargs)

    # blob approach only works against public views
    if private:
        raise ValueError("view must be public when 'use_blob' argument is True")

    return self.execute_view_dataframe(
        cube_name=cube_name,
        view_name=view_name,
        private=private,
        shaped=True,
        sandbox_name=sandbox_name,
        use_blob=True,
        **kwargs)

@require_pandas
def execute_view_dataframe_pivot(self, cube_name: str, view_name: str, private: bool = False, dropna: bool = False,
Expand Down Expand Up @@ -2108,7 +2187,8 @@ def execute_view_elements_value_dict(self, cube_name: str, view_name: str, priva
def execute_view_dataframe(self, cube_name: str, view_name: str, private: bool = False, top: int = None,
skip: int = None, skip_zeros: bool = True, skip_consolidated_cells: bool = False,
skip_rule_derived_cells: bool = False, sandbox_name: str = None,
use_iterative_json: bool = False, use_blob: bool = False, **kwargs) -> 'pd.DataFrame':
use_iterative_json: bool = False, use_blob: bool = False, shaped: bool = False,
**kwargs) -> 'pd.DataFrame':
""" Optimized for performance. Get Pandas DataFrame from an existing Cube View
Context dimensions are omitted in the resulting Dataframe !
Cells with Zero/null are omitted !
Expand Down Expand Up @@ -2143,16 +2223,17 @@ def execute_view_dataframe(self, cube_name: str, view_name: str, private: bool =
use_iterative_json=use_iterative_json,
line_separator="\r\n",
value_separator="~",
use_blob=True)
return build_dataframe_from_csv(raw_csv, sep='~', **kwargs)
use_blob=True,
**kwargs)
return build_dataframe_from_csv(raw_csv, sep='~', skip_zeros=skip_zeros, shaped=shaped, **kwargs)

cellset_id = self.create_cellset_from_view(cube_name=cube_name, view_name=view_name, private=private,
sandbox_name=sandbox_name, **kwargs)
return self.extract_cellset_dataframe(cellset_id, top=top, skip=skip, skip_zeros=skip_zeros,
skip_consolidated_cells=skip_consolidated_cells,
skip_rule_derived_cells=skip_rule_derived_cells,
sandbox_name=sandbox_name, use_iterative_json=use_iterative_json,
**kwargs)
shaped=shaped, **kwargs)

def execute_view_cellcount(self, cube_name: str, view_name: str, private: bool = False, sandbox_name: str = None,
**kwargs) -> int:
Expand Down Expand Up @@ -3081,6 +3162,7 @@ def extract_cellset_dataframe(
include_attributes: bool = False,
use_iterative_json: bool = False,
use_compact_json: bool = False,
shaped: bool = False,
**kwargs) -> 'pd.DataFrame':
""" Build pandas data frame from cellset_id

Expand Down Expand Up @@ -3113,7 +3195,7 @@ def extract_cellset_dataframe(
value_separator='~', sandbox_name=sandbox_name, include_attributes=include_attributes,
use_compact_json=use_compact_json, **kwargs)

return build_dataframe_from_csv(raw_csv, **kwargs)
return build_dataframe_from_csv(raw_csv, sep="~", skip_zeros=skip_zeros, shaped=shaped, **kwargs)

@tidy_cellset
@require_pandas
Expand Down Expand Up @@ -3508,7 +3590,7 @@ def _execute_view_csv_use_blob(self, cube_name: str, view_name: str, top: int, s
# alter native-view to assure only one element per dimension in title
native_view = view_service.get_native_view(cube_name=cube_name, view_name=view_name, private=False)
for title in native_view.titles:
title.subset = AnonymousSubset(
title._subset = AnonymousSubset(
dimension_name=title.dimension_name,
elements=[title.selected])
native_view.name = view_name = unique_name
Expand Down Expand Up @@ -3624,11 +3706,18 @@ def _execute_mdx_csv_use_blob(self, mdx: Union[str, MdxBuilder], top: int, skip:
in columns + rows]

unique_name = self.suggest_unique_object_name()

# dimension properties must be skipped as they produce extra variables in TI data source
# and tear up the variable definition
if isinstance(mdx, MdxBuilder):
mdx = mdx.to_mdx(skip_dimension_properties=True)
else:
mdx = drop_dimension_properties(mdx)

view = MDXView(
cube_name=cube,
view_name=unique_name,
# dimension properties must be skipped as they produce an extra variable in TI data source
MDX=mdx.to_mdx(skip_dimension_properties=True) if isinstance(mdx, MdxBuilder) else mdx)
MDX=mdx)

file_name = f"{unique_name}.csv"
if include_headers:
Expand Down
16 changes: 9 additions & 7 deletions TM1py/Services/ElementService.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def get_elements_dataframe(self, dimension_name: str = None, hierarchy_name: str

if not isinstance(elements, str):
if isinstance(elements, Iterable):
elements = "{" + ",".join(f"[{dimension_name}].[{hierarchy_name}].[{member}]" for member in elements) + "}"
elements = "{" + ",".join(
f"[{dimension_name}].[{hierarchy_name}].[{member}]" for member in elements) + "}"
else:
raise ValueError("Argument 'element_selection' must be None or str")

Expand Down Expand Up @@ -534,10 +535,11 @@ def get_levels_count(self, dimension_name: str, hierarchy_name: str, **kwargs) -
def get_element_types(self, dimension_name: str, hierarchy_name: str,
skip_consolidations: bool = False, **kwargs) -> CaseAndSpaceInsensitiveDict:
url = format_url(
"/api/v1/Dimensions('{}')/Hierarchies('{}')/Elements?$select=Name,Type{}",
"/api/v1/Dimensions('{}')/Hierarchies('{}')/Elements?$select=Name,Type",
dimension_name,
hierarchy_name,
"&$filter=Type ne 3" if skip_consolidations else "")
hierarchy_name)
if skip_consolidations:
url += "&$filter=Type ne 3"
response = self._rest.GET(url, **kwargs)

result = CaseAndSpaceInsensitiveDict()
Expand All @@ -548,9 +550,9 @@ def get_element_types(self, dimension_name: str, hierarchy_name: str,
def get_element_types_from_all_hierarchies(
self, dimension_name: str, skip_consolidations: bool = False, **kwargs) -> CaseAndSpaceInsensitiveDict:
url = format_url(
"/api/v1/Dimensions('{}')?$expand=Hierarchies($select=Elements;$expand=Elements($select=Name,Type{}",
dimension_name,
";$filter=Type ne 3))" if skip_consolidations else "))")
"/api/v1/Dimensions('{}')?$expand=Hierarchies($select=Elements;$expand=Elements($select=Name,Type",
dimension_name)
url += ";$filter=Type ne 3))" if skip_consolidations else "))"
response = self._rest.GET(url, **kwargs)

result = CaseAndSpaceInsensitiveDict()
Expand Down
11 changes: 9 additions & 2 deletions TM1py/Services/PowerBiService.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,15 @@ def execute_mdx(self, mdx, **kwargs) -> 'pd.DataFrame':
return self.cells.execute_mdx_dataframe_shaped(mdx, **kwargs)

@require_pandas
def execute_view(self, cube_name: str, view_name: str, private: bool, use_iterative_json: bool = False,
                 use_blob: bool = False, **kwargs) -> 'pd.DataFrame':
    """ Execute a cube view and return a shaped DataFrame (e.g. for Power BI consumption).

    :param cube_name: name of the cube
    :param view_name: name of the view
    :param private: bool, whether the view is private
    :param use_iterative_json: bool, forwarded to the cell service
    :param use_blob: bool, forwarded to the cell service
    :param kwargs: passed through to execute_view_dataframe_shaped
    :return: pandas.DataFrame shaped like the view
    """
    # delegate to the cell service; it validates the argument combinations
    return self.cells.execute_view_dataframe_shaped(
        cube_name,
        view_name,
        private,
        use_iterative_json=use_iterative_json,
        use_blob=use_blob,
        **kwargs)

@require_pandas
def get_member_properties(self, dimension_name: str = None, hierarchy_name: str = None,
Expand Down
33 changes: 30 additions & 3 deletions TM1py/Utils/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,16 +425,35 @@ def build_csv_from_cellset_dict(
return csv_content.getvalue().strip()


def build_dataframe_from_csv(raw_csv, sep='~', skip_zeros: bool = True, shaped: bool = False,
                             **kwargs) -> 'pd.DataFrame':
    """ Build a pandas DataFrame from a raw CSV string produced by a cellset export.

    :param raw_csv: str, CSV text; first row is the header. Empty input yields an empty frame.
    :param sep: column separator, '~' by default
    :param skip_zeros: bool, when shaped, drop all-NaN pivot columns (pivot_table dropna)
    :param shaped: bool, pivot the frame so the last dimension of the column selection
        spreads across columns (shape of the view/MDX)
    :param kwargs: forwarded to pandas.read_csv (a caller-supplied 'dtype' wins)
    :return: pandas.DataFrame
    """
    if not raw_csv:
        return pd.DataFrame()

    # make sure all element names are strings and values column is derived from data;
    # positional keys 0..998 force every dimension column to str, 'Value' stays inferred
    if 'dtype' not in kwargs:
        kwargs['dtype'] = {'Value': None, **{col: str for col in range(999)}}
    try:
        df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)
    except ValueError:
        # retry with dtype 'str' for results with a mixed value column
        kwargs['dtype'] = {'Value': str, **{col: str for col in range(999)}}
        df = pd.read_csv(StringIO(raw_csv), sep=sep, na_values=["", None], keep_default_na=False, **kwargs)

    if not shaped:
        return df

    # due to csv creation logic, last column is bottom dimension from the column selection
    df = df.pivot_table(
        index=tuple(df.columns[:-2]),
        aggfunc="sum",
        columns=df.columns[-2],
        values=df.columns[-1],
        dropna=skip_zeros,
        sort=False).reset_index()

    # drop title on index
    return df.rename_axis(None, axis=1)


def _build_csv_line_items_from_axis_tuple(members: Dict, include_attributes: bool = False) -> List[str]:
Expand Down Expand Up @@ -1188,6 +1207,14 @@ def frame_to_significant_digits(x, digits=15):
return str(round(x, digits)).replace('e+', 'E')


def drop_dimension_properties(mdx: str) -> str:
    """ Strip DIMENSION PROPERTIES / PROPERTIES clauses from an MDX statement.

    Used before feeding MDX to a TI data source, where dimension properties would
    produce extra variables and break the variable definition.

    :param mdx: MDX statement as string
    :return: MDX with any (DIMENSION) PROPERTIES clause removed, the 'ON' keyword kept
    """
    # (?s) lets '.*?' cross line breaks so multi-line property lists are matched;
    # trailing \b keeps the match from consuming identifiers that merely start with 'ON'
    pattern = re.compile(r"(?i)(?s)DIMENSION\s+PROPERTIES\s+.*?\s+ON\b")
    mdx = pattern.sub(" ON", mdx)

    # second pass covers the short form without the DIMENSION keyword
    pattern = re.compile(r"(?i)(?s)\s+PROPERTIES\s+.*?\s+ON\b")
    return pattern.sub(" ON", mdx)


class HTTPAdapterWithSocketOptions(HTTPAdapter):
def __init__(self, *args, **kwargs):
self.socket_options = kwargs.pop("socket_options", None)
Expand Down
Loading