Skip to content

Commit

Permalink
Handle duplicates in DataFrame-to-dict transformation
Browse files Browse the repository at this point in the history
duplicates in string values are accepted (last one wins)
duplicates in numeric values are aggregated
  • Loading branch information
MariusWirtz committed Apr 11, 2022
1 parent c90b1ba commit 7ae5439
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 16 deletions.
6 changes: 4 additions & 2 deletions TM1py/Services/CellService.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,8 @@ def write_dataframe(self, cube_name: str, data: 'pd.DataFrame', dimensions: Iter
increment: bool = False, deactivate_transaction_log: bool = False,
reactivate_transaction_log: bool = False, sandbox_name: str = None,
use_ti: bool = False, use_changeset: bool = False, precision: int = 8,
skip_non_updateable: bool = False, measure_dimension_elements: Dict = None, **kwargs) -> str:
skip_non_updateable: bool = False, measure_dimension_elements: Dict = None,
sum_numeric_duplicates: bool =True, **kwargs) -> str:
"""
Function expects same shape as `execute_mdx_dataframe` returns.
Column order must match dimensions in the target cube with an additional column for the values.
Expand All @@ -424,6 +425,7 @@ def write_dataframe(self, cube_name: str, data: 'pd.DataFrame', dimensions: Iter
:param measure_dimension_elements: dictionary of measure elements and their types to improve
performance when `use_ti` is `True`.
When all written values are numeric you can pass a default dict with default key 'Numeric'
:sum_numeric_duplicates: Aggregate numerical values for duplicated intersections
:return: changeset or None
"""
if not isinstance(data, pd.DataFrame):
Expand All @@ -435,7 +437,7 @@ def write_dataframe(self, cube_name: str, data: 'pd.DataFrame', dimensions: Iter
if not len(data.columns) == len(dimensions) + 1:
raise ValueError("Number of columns in 'data' DataFrame must be number of dimensions in cube + 1")

cells = build_cellset_from_pandas_dataframe(data)
cells = build_cellset_from_pandas_dataframe(data, sum_numeric_duplicates=sum_numeric_duplicates)

return self.write(cube_name=cube_name,
cellset_as_dict=cells,
Expand Down
29 changes: 23 additions & 6 deletions TM1py/Utils/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

try:
import pandas as pd
import numpy as np

_has_pandas = True
except ImportError:
Expand Down Expand Up @@ -687,26 +688,42 @@ def build_pandas_dataframe_from_cellset(cellset: Dict, multiindex: bool = True,


@require_pandas
def build_cellset_from_pandas_dataframe(
        df: 'pd.DataFrame',
        sum_numeric_duplicates: bool = True) -> 'CaseAndSpaceInsensitiveTuplesDict':
    """ Build a cellset dict from a pandas DataFrame.

    Duplicate intersections are handled as follows: when `sum_numeric_duplicates`
    is True, numeric values on the same intersection are aggregated (summed);
    string values on a duplicated intersection are never aggregated — the last
    one wins, through plain dict construction at the end.

    :param df: a pandas DataFrame, with dimension-column mapping in correct order,
        as created in build_pandas_dataframe_from_cellset. The last column holds the cell values.
    :param sum_numeric_duplicates: aggregate numeric values for duplicated intersections
    :return: a CaseAndSpaceInsensitiveTuplesDict
    """
    if isinstance(df.index, pd.MultiIndex):
        # rebind rather than reset_index(inplace=True) so the caller's DataFrame is not mutated
        df = df.reset_index()

    # handle duplicate intersections
    if sum_numeric_duplicates:
        value_header = df.columns[-1]
        dimension_headers = df.columns[:-1]

        if pd.api.types.is_numeric_dtype(df[value_header]):
            # homogeneous numeric value column: aggregate all rows
            df = aggregate_duplicate_intersections(df, dimension_headers, value_header)
        else:
            # mixed (object) value column: aggregate only the numeric rows,
            # keep the string rows untouched
            # NOTE(review): np.isreal may report True for some non-numeric inputs
            # depending on the numpy version — confirm against the supported numpy range
            filter_mask = df[value_header].apply(np.isreal)
            df_numeric = df[filter_mask]
            df_string = df[~filter_mask]
            df_numeric = aggregate_duplicate_intersections(df_numeric, dimension_headers, value_header)
            df = pd.concat([df_numeric, df_string])

    cellset = CaseAndSpaceInsensitiveTuplesDict(
        dict(zip(df.iloc[:, :-1].itertuples(index=False, name=None), df.iloc[:, -1].values)))
    return cellset


def aggregate_duplicate_intersections(df, dimension_headers, value_header):
    """ Sum the value column over rows that share the same dimension intersection.

    :param df: pandas DataFrame with dimension columns followed by one value column
    :param dimension_headers: names of the dimension (key) columns to group by
    :param value_header: name of the value column to aggregate
    :return: DataFrame with one row per unique intersection; original column order preserved
    """
    grouped_values = df.groupby(list(dimension_headers))[value_header].sum()
    return grouped_values.reset_index()


def lower_and_drop_spaces(item: str) -> str:
    """ Return *item* lowercased with all space characters removed. """
    without_spaces = item.replace(" ", "")
    return without_spaces.lower()

Expand Down
52 changes: 44 additions & 8 deletions Tests/CellService_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ def setUpClass(cls):

# Build Dimensions
for dimension_name in cls.dimension_names:
elements = [Element('Element {}'.format(str(j)), 'Numeric') for j in range(1, 1001)]
elements = [Element('Element {}'.format(str(j)), 'Numeric')
for j
in range(1, 1001)]

element_attributes = [ElementAttribute("Attr1", "String"),
ElementAttribute("Attr2", "Numeric"),
ElementAttribute("Attr3", "Numeric")]
Expand Down Expand Up @@ -177,21 +180,25 @@ def tearDown(cls):

@classmethod
def build_string_cube(cls):
if cls.tm1.cubes.exists(cls.string_cube_name):
cls.tm1.cubes.delete(cls.string_cube_name)

for d, dimension_name in enumerate(cls.string_dimension_names, start=1):
dimension = Dimension(dimension_name)
hierarchy = Hierarchy(dimension_name, dimension_name)
for i in range(1, 5, 1):
element_name = "d" + str(d) + "e" + str(i)
hierarchy.add_element(element_name=element_name, element_type="String")
dimension.add_hierarchy(hierarchy)
if not cls.tm1.dimensions.exists(dimension.name):
cls.tm1.dimensions.update_or_create(dimension)
cls.tm1.dimensions.update_or_create(dimension)

cube = Cube(name=cls.string_cube_name, dimensions=cls.string_dimension_names)
if not cls.tm1.cubes.exists(cube.name):
cls.tm1.cubes.update_or_create(cube)
# zero out cube
cls.tm1.processes.execute_ti_code("CubeClearData('" + cls.string_cube_name + "');")
cls.tm1.elements.add_elements(
dimension_name=cube.dimensions[-1],
hierarchy_name=cube.dimensions[-1],
elements=[Element("n1", "Numeric")])

cls.tm1.cubes.update_or_create(cube)

@classmethod
def remove_string_cube(cls):
Expand Down Expand Up @@ -723,7 +730,7 @@ def test_write_dataframe(self):
self.assertEqual(list(df["Value"]), values)

@skip_if_no_pandas
def test_write_dataframe_duplicate_entries(self):
def test_write_dataframe_duplicate_numeric_entries(self):
df = pd.DataFrame({
self.dimension_names[0]: ["element 1", "element 1", "element 1"],
self.dimension_names[1]: ["element 1", "element 1", "element 1"],
Expand All @@ -742,6 +749,35 @@ def test_write_dataframe_duplicate_entries(self):

self.assertEqual([6], values)

@skip_if_no_pandas
def test_write_dataframe_duplicate_numeric_and_string_entries(self):
# Mixed-value DataFrame with duplicated intersections:
# - "d3e2" appears twice with string values -> last one ("text3") must win
# - "n1" appears twice with numeric values (3.0 + 4.0) -> must be summed to 7
df = pd.DataFrame({
self.string_dimension_names[0]: ["d1e1", "d1e1", "d1e1", "d1e1", "d1e1"],
self.string_dimension_names[1]: ["d2e1", "d2e1", "d2e1", "d2e1", "d2e1"],
self.string_dimension_names[2]: ["d3e1", "d3e2", "d3e2", "n1", "n1"],
"Value": ["text1", "text2", "text3", 3.0, 4.0]})
self.tm1.cubes.cells.write_dataframe(self.string_cube_name, df)

# verify the numeric duplicates were aggregated: expect 3.0 + 4.0 = 7 on "n1"
query = MdxBuilder.from_cube(self.string_cube_name)
query = query.add_hierarchy_set_to_column_axis(
MdxHierarchySet.member(Member.of(self.string_dimension_names[0], "d1e1")))
query = query.add_hierarchy_set_to_row_axis(MdxHierarchySet.members([
Member.of(self.string_dimension_names[1], "d2e1")]))
query = query.add_member_to_where(Member.of(self.string_dimension_names[2], "n1"))
values = self.tm1.cubes.cells.execute_mdx_values(query.to_mdx())
self.assertEqual([7], values)

# verify string handling: "d3e1" keeps its single value, and for the
# duplicated "d3e2" the last written value ("text3") wins
query = MdxBuilder.from_cube(self.string_cube_name)
query = query.add_hierarchy_set_to_column_axis(
MdxHierarchySet.members([
Member.of(self.string_dimension_names[2], "d3e1"),
Member.of(self.string_dimension_names[2], "d3e2")]))
query = query.add_hierarchy_set_to_row_axis(MdxHierarchySet.members([
Member.of(self.string_dimension_names[1], "d2e1")]))
query = query.add_member_to_where(Member.of(self.string_dimension_names[0], "d1e1"))
values = self.tm1.cubes.cells.execute_mdx_values(query.to_mdx())
self.assertEqual(["text1", "text3"], values)

@skip_if_no_pandas
def test_write_dataframe_error(self):
df = pd.DataFrame({
Expand Down

0 comments on commit 7ae5439

Please sign in to comment.