Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Profiler utils update #1092

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,10 @@ def _get_skewness(
):
return

+if self._greater_than_64_bit and type(df_series) is pd.Series:
+    df_series = df_series.to_numpy(dtype=float)
+else:
+    df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_skewness = profiler_utils.biased_skew(df_series)
subset_properties["biased_skewness"] = batch_biased_skewness
batch_count = subset_properties["match_count"]
Expand Down Expand Up @@ -1968,6 +1972,10 @@ def _get_kurtosis(
):
return

+if self._greater_than_64_bit and type(df_series) is pd.Series:
+    df_series = df_series.to_numpy(dtype=float)
+else:
+    df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_kurtosis = profiler_utils.biased_kurt(df_series)
subset_properties["biased_kurtosis"] = batch_biased_kurtosis
batch_count = subset_properties["match_count"]
Expand Down
10 changes: 7 additions & 3 deletions dataprofiler/profilers/profiler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
)

import numpy as np
+import polars as pl
import psutil
import scipy
-from pandas import DataFrame, Series
+from pandas import DataFrame
+from polars import Series

from ..labelers.data_labelers import DataLabeler

Expand Down Expand Up @@ -320,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict:
return merged_dict


-def biased_skew(df_series: Series) -> np.float64:
+def biased_skew(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for skewness of the given data.

Expand Down Expand Up @@ -358,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64:
return skew


-def biased_kurt(df_series: Series) -> np.float64:
+def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for kurtosis of the given data.

Expand Down Expand Up @@ -675,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo
:type unit: string
:return: memory size of the input data
"""
+if type(data) is DataFrame:
+    data = pl.from_pandas(data)
unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3)
if unit not in unit_map:
raise ValueError(
Expand Down