From 2777631d471a6ad5a012ae0f2e2f25e4f24958eb Mon Sep 17 00:00:00 2001 From: rugk Date: Mon, 21 Oct 2024 23:27:03 +0200 Subject: [PATCH 1/6] docs: Fix broken links in Readme (and pypi) On https://pypi.org/project/ydata-profiling/ and on GitHub these links are broken. Important: I could not find a replacement for the "Cloud services" link, which is **also** broken. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3c0d70051..498ae4318 100644 --- a/README.md +++ b/README.md @@ -237,11 +237,11 @@ To maximize its usefulness in real world contexts, `ydata-profiling` has a set o | Integration type | Description | |---|---| | [Other DataFrame libraries](https://docs.profiling.ydata.ai/latest/integrations/other_dataframe_libraries) | How to compute the profiling of data stored in libraries other than pandas | -| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | +| [Great Expectations](https://docs.profiling.ydata.ai/latest/integrations/great_expectations/) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | | [Interactive applications](https://docs.profiling.ydata.ai/latest/integrations/interactive_applications) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | | [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | | [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `ydata-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), 
[Google Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | -| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `ydata-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | +| [IDEs](https://docs.profiling.ydata.ai/latest/integrations/ides/) | Using `ydata-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | ## 🙋 Support Need help? Want to share a perspective? Report a bug? Ideas for collaborations? Reach out via the following channels: From e178f111894677b7c43d5075b32216c205a7497a Mon Sep 17 00:00:00 2001 From: alexbarros Date: Fri, 25 Oct 2024 08:42:38 -0300 Subject: [PATCH 2/6] feat: add adfuller stationary test parameters to config --- src/ydata_profiling/config.py | 4 +++- .../model/pandas/describe_timeseries_pandas.py | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 3fdcef09c..5b0139338 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -1,7 +1,7 @@ """Configuration for the package.""" from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Optional, Tuple, Union import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr @@ -112,6 +112,8 @@ class TimeseriesVars(BaseModel): lags: List[int] = [1, 7, 12, 24, 30] significance: float = 0.05 pacf_acf_lag: int = 100 + autolag: Optional[Literal["AIC", "BIC", "t-stat"]] = "AIC" + maxlag: Optional[int] = None class Univariate(BaseModel): diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py 
b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py index 5ffe99a9f..0f2ca93a5 100644 --- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py @@ -16,12 +16,15 @@ def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float]: - significance_threshold = config.vars.timeseries.significance - # make sure the data has no missing values - adfuller_test = adfuller(series.dropna()) + adfuller_test = adfuller( + series.dropna(), + autolag=config.vars.timeseries.autolag, + maxlag=config.vars.timeseries.maxlag + ) p_value = adfuller_test[1] + significance_threshold = config.vars.timeseries.significance return p_value < significance_threshold, p_value From ab82c229792a711827317399b63d94579af67505 Mon Sep 17 00:00:00 2001 From: alexbarros Date: Fri, 25 Oct 2024 09:13:23 -0300 Subject: [PATCH 3/6] fix: Literal not supported in python 3.7 --- src/ydata_profiling/config.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 5b0139338..363a67a8c 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -1,11 +1,17 @@ """Configuration for the package.""" from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr +try: + # typing only available in python 3.8+ + from typing import Literal + AutoLagType = Literal["AIC", "BIC", "t-stat"] +except: + AutoLagType = str def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: """ @@ -112,7 +118,7 @@ class TimeseriesVars(BaseModel): lags: List[int] = [1, 7, 12, 24, 30] significance: float = 0.05 pacf_acf_lag: int = 100 - autolag: Optional[Literal["AIC", "BIC", "t-stat"]] = "AIC" + autolag: Optional[AutoLagType] = "AIC" 
maxlag: Optional[int] = None From 1bd3ac3b3a3acb57ae2763a45796185002f3357e Mon Sep 17 00:00:00 2001 From: alexbarros Date: Fri, 25 Oct 2024 09:13:47 -0300 Subject: [PATCH 4/6] fix: linting issue --- src/ydata_profiling/model/pandas/describe_timeseries_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py index 0f2ca93a5..7db4d56f3 100644 --- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py @@ -20,7 +20,7 @@ def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float] adfuller_test = adfuller( series.dropna(), autolag=config.vars.timeseries.autolag, - maxlag=config.vars.timeseries.maxlag + maxlag=config.vars.timeseries.maxlag, ) p_value = adfuller_test[1] From 4c99b30a22661054a5eb7b8f5b821f2da14f0e52 Mon Sep 17 00:00:00 2001 From: alexbarros Date: Fri, 25 Oct 2024 09:54:04 -0300 Subject: [PATCH 5/6] fix: mypy does not support dynamic types --- src/ydata_profiling/config.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 363a67a8c..6d87d57df 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,12 +6,6 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr -try: - # typing only available in python 3.8+ - from typing import Literal - AutoLagType = Literal["AIC", "BIC", "t-stat"] -except: - AutoLagType = str def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: """ @@ -118,7 +112,7 @@ class TimeseriesVars(BaseModel): lags: List[int] = [1, 7, 12, 24, 30] significance: float = 0.05 pacf_acf_lag: int = 100 - autolag: Optional[AutoLagType] = "AIC" + autolag: Optional[str] = "AIC" maxlag: Optional[int] = None From 62884d7bd3162a31df78e553bac50811f504909e Mon Sep 17 
00:00:00 2001 From: quant12345 Date: Tue, 16 Jul 2024 20:29:23 +0500 Subject: [PATCH 6/6] fix: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) --- src/ydata_profiling/model/typeset.py | 8 ++++---- src/ydata_profiling/model/typeset_relations.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py index 65f0cd2d6..a8e5b9f2e 100644 --- a/src/ydata_profiling/model/typeset.py +++ b/src/ydata_profiling/model/typeset.py @@ -122,7 +122,7 @@ def get_relations() -> Sequence[TypeRelation]: @series_handle_nulls def contains_op(series: pd.Series, state: dict) -> bool: return ( - not pdt.is_categorical_dtype(series) + not isinstance(series.dtype, pd.CategoricalDtype) and pdt.is_string_dtype(series) and series_is_string(series, state) ) @@ -205,9 +205,9 @@ def get_relations() -> Sequence[TypeRelation]: @series_not_empty @series_handle_nulls def contains_op(series: pd.Series, state: dict) -> bool: - is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype( - series - ) + is_valid_dtype = isinstance( + series.dtype, pd.CategoricalDtype + ) and not pdt.is_bool_dtype(series) if is_valid_dtype: return True return False diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py index 1a32526b0..beac15b51 100644 --- a/src/ydata_profiling/model/typeset_relations.py +++ b/src/ydata_profiling/model/typeset_relations.py @@ -31,7 +31,7 @@ def string_is_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> bool: def tester(s: pd.Series, state: dict) -> bool: return s.str.lower().isin(k.keys()).all() - if pdt.is_categorical_dtype(series): + if isinstance(series.dtype, pd.CategoricalDtype): return False return tester(series, state)