diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index 74c24e21..7fe05aee 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -1924,6 +1924,10 @@ def _get_skewness(
         ):
             return

+        if self._greater_than_64_bit and type(df_series) is pd.Series:
+            df_series = df_series.to_numpy(dtype=float)
+        else:
+            df_series = pl.from_pandas(df_series, nan_to_null=False)
         batch_biased_skewness = profiler_utils.biased_skew(df_series)
         subset_properties["biased_skewness"] = batch_biased_skewness
         batch_count = subset_properties["match_count"]
@@ -1968,6 +1972,10 @@ def _get_kurtosis(
         ):
             return

+        if self._greater_than_64_bit and type(df_series) is pd.Series:
+            df_series = df_series.to_numpy(dtype=float)
+        else:
+            df_series = pl.from_pandas(df_series, nan_to_null=False)
         batch_biased_kurtosis = profiler_utils.biased_kurt(df_series)
         subset_properties["biased_kurtosis"] = batch_biased_kurtosis
         batch_count = subset_properties["match_count"]
diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index e38e1b04..a81dca7a 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -26,9 +26,11 @@
 )

 import numpy as np
+import polars as pl
 import psutil
 import scipy
-from pandas import DataFrame, Series
+from pandas import DataFrame
+from polars import Series

 from ..labelers.data_labelers import DataLabeler

@@ -320,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict:
     return merged_dict


-def biased_skew(df_series: Series) -> np.float64:
+def biased_skew(df_series: Series | np.ndarray) -> np.float64:
     """
     Calculate the biased estimator for skewness of the given data.

@@ -358,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64:
     return skew


-def biased_kurt(df_series: Series) -> np.float64:
+def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
     """
     Calculate the biased estimator for kurtosis of the given data.

@@ -675,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo
     :type unit: string
     :return: memory size of the input data
     """
+    if type(data) is DataFrame:
+        data = pl.from_pandas(data)
     unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3)
     if unit not in unit_map:
         raise ValueError(
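A minimal, standalone sketch of the conversion pattern these hunks introduce before `biased_skew` / `biased_kurt` are called. It is not part of the patch: the sample series and the local `greater_than_64_bit` flag are illustrative stand-ins for the profiler's internal `self._greater_than_64_bit` state.

```python
import numpy as np
import pandas as pd
import polars as pl

# Illustrative input; in the profiler this is the matched column data.
df_series = pd.Series([1.2, 3.4, np.nan, 5.6])

# Stand-in for self._greater_than_64_bit (values that overflow 64-bit dtypes).
greater_than_64_bit = df_series.dtype == object

if greater_than_64_bit and type(df_series) is pd.Series:
    # Oversized values stay in a plain float NumPy array.
    data = df_series.to_numpy(dtype=float)
else:
    # nan_to_null=False keeps NaN as NaN rather than converting it to a Polars null.
    data = pl.from_pandas(df_series, nan_to_null=False)

print(type(data))  # polars.Series here; np.ndarray on the >64-bit path
```

Either branch yields an input the updated `biased_skew(df_series: Series | np.ndarray)` and `biased_kurt(df_series: Series | np.ndarray)` signatures accept.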