diff --git a/README.md b/README.md index 3c0d70051..498ae4318 100644 --- a/README.md +++ b/README.md @@ -237,11 +237,11 @@ To maximize its usefulness in real world contexts, `ydata-profiling` has a set o | Integration type | Description | |---|---| | [Other DataFrame libraries](https://docs.profiling.ydata.ai/latest/integrations/other_dataframe_libraries) | How to compute the profiling of data stored in libraries other than pandas | -| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | +| [Great Expectations](https://docs.profiling.ydata.ai/latest/integrations/great_expectations/) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | | [Interactive applications](https://docs.profiling.ydata.ai/latest/integrations/interactive_applications) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | | [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | | [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `ydata-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), [Google Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | -| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `ydata-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | +| [IDEs](https://docs.profiling.ydata.ai/latest/integrations/ides/) | Using `ydata-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | ## 🙋 Support Need help? Want to share a perspective? Report a bug? Ideas for collaborations? Reach out via the following channels: diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 3fdcef09c..6d87d57df 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -112,6 +112,8 @@ class TimeseriesVars(BaseModel): lags: List[int] = [1, 7, 12, 24, 30] significance: float = 0.05 pacf_acf_lag: int = 100 + autolag: Optional[str] = "AIC" + maxlag: Optional[int] = None class Univariate(BaseModel): diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py index 5ffe99a9f..7db4d56f3 100644 --- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py @@ -16,12 +16,15 @@ def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float]: - significance_threshold = config.vars.timeseries.significance - # make sure the data has no missing values - adfuller_test = adfuller(series.dropna()) + adfuller_test = adfuller( + series.dropna(), + autolag=config.vars.timeseries.autolag, + maxlag=config.vars.timeseries.maxlag, + ) p_value = adfuller_test[1] + significance_threshold = config.vars.timeseries.significance return p_value < significance_threshold, p_value diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py index 65f0cd2d6..a8e5b9f2e 100644 --- a/src/ydata_profiling/model/typeset.py +++ b/src/ydata_profiling/model/typeset.py @@ -122,7 +122,7 @@ def get_relations() -> Sequence[TypeRelation]: @series_handle_nulls def contains_op(series: pd.Series, state: dict) -> bool: return ( - not pdt.is_categorical_dtype(series) + not isinstance(series.dtype, pd.CategoricalDtype) and pdt.is_string_dtype(series) and series_is_string(series, state) ) @@ -205,9 +205,9 @@ def get_relations() -> Sequence[TypeRelation]: @series_not_empty @series_handle_nulls def contains_op(series: pd.Series, state: dict) -> bool: - is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype( - series - ) + is_valid_dtype = isinstance( + series.dtype, pd.CategoricalDtype + ) and not pdt.is_bool_dtype(series) if is_valid_dtype: return True return False diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py index 1a32526b0..beac15b51 100644 --- a/src/ydata_profiling/model/typeset_relations.py +++ b/src/ydata_profiling/model/typeset_relations.py @@ -31,7 +31,7 @@ def string_is_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> bool: def tester(s: pd.Series, state: dict) -> bool: return s.str.lower().isin(k.keys()).all() - if pdt.is_categorical_dtype(series): + if isinstance(series.dtype, pd.CategoricalDtype): return False return tester(series, state)