From 7431fd548f94e6884b0893074a000f9af1f1bbdb Mon Sep 17 00:00:00 2001 From: Vasco Ramos Date: Wed, 8 Mar 2023 14:45:59 +0000 Subject: [PATCH 1/4] chore(actions): fix docs publishing ci --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2045056a0..e18bb1e16 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -117,7 +117,6 @@ jobs: - name: Install the package run: | make install - make install-spark-ci - name: Update examples run: make examples From 1cdb1c689cfb2243cfa797dc738716aa45c11032 Mon Sep 17 00:00:00 2001 From: Keagan O'Donoghue <56366381+k3agan@users.noreply.github.com> Date: Fri, 10 Mar 2023 01:17:48 +0000 Subject: [PATCH 2/4] docs: Updated 179 broken links --- .github/ISSUE_TEMPLATE/bug_report_form.yaml | 6 +- CONTRIBUTING.md | 100 ++++++++------- README.md | 115 +++++++++--------- .../source/pages/getting_started/overview.rst | 6 +- .../pages/getting_started/quickstart.rst | 2 +- .../pages/integrations/cloud_services.rst | 2 +- .../source/pages/integrations/data_apps.rst | 2 +- .../pages/integrations/great_expectations.rst | 4 +- .../source/pages/integrations/pipelines.rst | 2 +- docsrc/source/pages/integrations/pyspark.rst | 2 +- .../2020-04-14-release-v2-6-0.rst | 2 +- docsrc/source/pages/reference/changelog.rst | 2 +- .../pages/reference/changelog/v2_10_1.rst | 12 +- .../pages/reference/changelog/v2_11_0.rst | 4 +- .../pages/reference/changelog/v2_12_0.rst | 8 +- .../pages/reference/changelog/v2_7_0.rst | 22 ++-- .../pages/reference/changelog/v2_8_0.rst | 8 +- .../pages/reference/changelog/v2_9_0rc1.rst | 14 +-- .../pages/reference/changelog/v3_0_0.rst | 4 +- .../pages/reference/changelog/v3_2_0.rst | 6 +- .../pages/reference/changelog/v3_3_0.rst | 4 +- .../pages/reference/changelog/v3_3_1.rst | 4 +- .../pages/reference/changelog/v3_4_0.rst | 40 +++--- .../pages/reference/changelog/v3_5_0.md | 24 ++-- 
.../pages/reference/changelog/v3_5_1.md | 3 +- .../pages/reference/changelog/v3_6_0.md | 54 ++++---- .../pages/reference/changelog/v3_6_1.md | 5 +- .../pages/reference/changelog/v3_6_2.md | 9 +- .../pages/reference/changelog/v3_6_3.md | 5 +- docsrc/source/pages/reference/history.rst | 2 +- .../pages/support_contrib/common_issues.rst | 18 +-- .../contribution_guidelines.rst | 2 +- .../support_contrib/help_troubleshoot.rst | 2 +- docsrc/source/pages/use_cases/big_data.rst | 6 +- 34 files changed, 247 insertions(+), 254 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report_form.yaml b/.github/ISSUE_TEMPLATE/bug_report_form.yaml index 8d6703e21..b822bde3c 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_form.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report_form.yaml @@ -22,7 +22,7 @@ body: id: expected-behavior attributes: label: Expected Behaviour - description: Tell us what should happen + description: Tell us what should happen placeholder: ex. the report contained B validations: required: true @@ -84,12 +84,12 @@ body: label: Checklist description: | Please complete the checklist below to ensure the bug report is helpful and can be addressed effectively - + Tips: - Help for writing better bug reports is available in the [documentation](https://pandas-profiling.ydata.ai/docs/master/pages/support_contrib/help_troubleshoot.html). - If the description consists of multiple non-related bugs, you are encouraged to create separate issues. options: - - label: There is not yet another bug report for this issue in the [issue tracker](https://github.com/ydataai/pandas-profiling/issues) + - label: There is not yet another bug report for this issue in the [issue tracker](https://github.com/ydataai/ydata-profiling/issues) required: true - label: The problem is reproducible from this bug report. [This guide](http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) can help to craft a minimal bug report. 
required: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f7d2a44f1..c66025387 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,102 +1,100 @@ ## How to contribute to Pandas-Profiling -Pandas-profiling aims to ease exploratory data analysis for structured datasets, including time-series. +Pandas-profiling aims to ease exploratory data analysis for structured datasets, including time-series. Our focus is to provide users with useful and robust statistics for such datasets encountered in industry, academia and elsewhere. Pandas-profiling is open-source and stimulates contributions from passionate community users. - #### Themes to contribute + In line with our aim, we identify the following themes: -- **Exploratory data analysis**: +- **Exploratory data analysis**: The core of the package is a dataset summarization by its main characteristics, which is complemented with warnings on data issues and visualisations. - _Suggestions for contribution_: + _Suggestions for contribution_: Extend the support of more data types (think of paths, location or GPS coordinates and ordinal data types), - text data (e.g. encoding, vocabulary size, spelling errors, language detection), - time series analysis, + text data (e.g. encoding, vocabulary size, spelling errors, language detection), + time series analysis, or even images (e.g. dimensions, EXIF). - - _Related_: [#7][i7], [#129][i129], [#190][i190], [#204][i204] or [create one](https://github.com/ydataai/pandas-profiling/issues/new/choose). -- **Stability, Performance and Restricted environment compatibility:** + _Related_: [#7][i7], [#129][i129], [#190][i190], [#204][i204] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). + +- **Stability, Performance and Restricted environment compatibility:** Data exploration takes place in all kinds of conditions, on the latest machine learning platforms with enormous dataset to managed environments in large corporations. 
`pandas-profiling` helps analysts, researchers and engineers alike in these cases. We do this by fixing bugs, improving performance on big datasets and adding environment compatibility. - - _Suggestions for contribution (Performance)_: - Perform concurrency analysis or profile execution times and leverage the gained insights for improved performance (e.g. multiprocessing, cython, numba) or test the performance of `pandas-profiling` with [big data sets](https://www.stats.govt.nz/large-datasets/csv-files-for-download/) and corresponding commonly used data formats (such as parquet). - - _Suggestions for contribution (Stability)_: - Either review the code and add tests or watch the [issues page](https://github.com/ydataai/pandas-profiling/issues) and [Stackoverflow tag](https://stackoverflow.com/questions/tagged/pandas-profiling) to find current issues. - - _Related_: [#98][i98], [#122][i122] or [create one](https://github.com/ydataai/pandas-profiling/issues/new/choose). - -- **Interaction, presentation and user experience**: + + _Suggestions for contribution (Performance)_: + Perform concurrency analysis or profile execution times and leverage the gained insights for improved performance (e.g. multiprocessing, cython, numba) or test the performance of `pandas-profiling` with [big data sets](https://www.stats.govt.nz/large-datasets/csv-files-for-download/) and corresponding commonly used data formats (such as parquet). + + _Suggestions for contribution (Stability)_: + Either review the code and add tests or watch the [issues page](https://github.com/ydataai/ydata-profiling/issues) and [Stackoverflow tag](https://stackoverflow.com/questions/tagged/pandas-profiling) to find current issues. + + _Related_: [#98][i98], [#122][i122] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). + +- **Interaction, presentation and user experience**: As `pandas-profiling` eases exploratory data analysis, working with the package should reflect that. 
Interaction and user experience plays a central role in working with the package. Working on interactive and static features is possible through the modular nature of the package: the user can configure which features to use. _Suggestions for contribution (interactivity)_: - Interactivity allows for more user friendly applications, including but not limited to on demand analysis (don't compute what you don't want to see) and interactive histograms and correlations. - This is ideal for smaller datasets, where we can compute this on-the-fly. + Interactivity allows for more user friendly applications, including but not limited to on demand analysis (don't compute what you don't want to see) and interactive histograms and correlations. + This is ideal for smaller datasets, where we can compute this on-the-fly. `ipywidgets` would be a great place to start (e.g. [widget based view](https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html)). _Suggestions for contribution (presentation)_: Other forms of distribution than HTML (for example PDF or packaged as an GUI application via [PyQt](https://riverbankcomputing.com/software/pyqt/intro)) Users should be able to share reports (improve size of labels in graph, add explanations to correlation matrices and allow for styling/branding). - _Related_: [#161][i161], [#175][i175], [#191][i191] or [create one](https://github.com/ydataai/pandas-profiling/issues/new/choose). + _Related_: [#161][i161], [#175][i175], [#191][i191] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). -- **Community**: +- **Community**: The success of this package demonstrates the power of sharing and working together. You are welcome as part of this community. - + _Suggestions for contribution_: Share with us if this package is of value to you, let us know [in our community](https://discord.com/invite/mw7xjJ7b7s). We are interested in how you use `pandas-profiling` in your work. 
- - _Related_: [#87][i87] or [create one](https://github.com/ydataai/pandas-profiling/issues/new/choose). -- **Machine learning:** + _Related_: [#87][i87] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). + +- **Machine learning:** `pandas-profiling` is not a machine learning package, even though many of our users use EDA as a step prior to developing their models. Our focus lies in the exploratory data analysis. Any functionality that enables machine learning applications by more effective data profiling, is welcome. - _Related_: [#124][i124], [#173][i173], [#198][i198] or [create one](https://github.com/ydataai/pandas-profiling/issues/new/choose). + _Related_: [#124][i124], [#173][i173], [#198][i198] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). #### **Did you find a bug?** -* **Ensure the bug was not already reported** by searching on Github under [Issues](https://github.com/ydataai/pandas-profiling/issues). +- **Ensure the bug was not already reported** by searching on Github under [Issues](https://github.com/ydataai/ydata-profiling/issues). -* If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ydataai/pandas-profiling/issues/new/choose). -If possible, use the relevant bug report templates to create the issue. +- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ydataai/ydata-profiling/issues/new/choose). + If possible, use the relevant bug report templates to create the issue. #### **Did you write a patch that fixes a bug?** -* Open a new Github pull request with the patch. - -* Ensure the PR description clearly describes the problem and solution. -Include the relevant issue number if applicable. +- Open a new Github pull request with the patch. +- Ensure the PR description clearly describes the problem and solution. + Include the relevant issue number if applicable. 
#### Acknowledgements We would like to thank everyone who has helped getting us to where we are now. -See the [Contributor Graph](https://github.com/ydataai/pandas-profiling/graphs/contributors) - -[i7]: https://github.com/ydataai/pandas-profiling/issues/7 -[i129]: https://github.com/ydataai/pandas-profiling/issues/129 -[i190]: https://github.com/ydataai/pandas-profiling/issues/190 -[i204]: https://github.com/ydataai/pandas-profiling/issues/204 -[i98]: https://github.com/ydataai/pandas-profiling/issues/98 -[i122]: https://github.com/ydataai/pandas-profiling/issues/122 -[i124]: https://github.com/ydataai/pandas-profiling/issues/24 -[i173]: https://github.com/ydataai/pandas-profiling/issues/173 -[i198]: https://github.com/ydataai/pandas-profiling/issues/198 -[i87]: https://github.com/ydataai/pandas-profiling/issues/87 -[i161]: https://github.com/ydataai/pandas-profiling/issues/161 -[i175]: https://github.com/ydataai/pandas-profiling/issues/175 -[i191]: https://github.com/ydataai/pandas-profiling/issues/191 - +See the [Contributor Graph](https://github.com/ydataai/ydata-profiling/graphs/contributors) + +[i7]: https://github.com/ydataai/ydata-profiling/issues/7 +[i129]: https://github.com/ydataai/ydata-profiling/issues/129 +[i190]: https://github.com/ydataai/ydata-profiling/issues/190 +[i204]: https://github.com/ydataai/ydata-profiling/issues/204 +[i98]: https://github.com/ydataai/ydata-profiling/issues/98 +[i122]: https://github.com/ydataai/ydata-profiling/issues/122 +[i124]: https://github.com/ydataai/ydata-profiling/issues/124 +[i173]: https://github.com/ydataai/ydata-profiling/issues/173 +[i198]: https://github.com/ydataai/ydata-profiling/issues/198 +[i87]: https://github.com/ydataai/ydata-profiling/issues/87 +[i161]: https://github.com/ydataai/ydata-profiling/issues/161 +[i175]: https://github.com/ydataai/ydata-profiling/issues/175 +[i191]: https://github.com/ydataai/ydata-profiling/issues/191 diff --git a/README.md b/README.md index 0659234ba..03a2c3b9e 100644 ---
a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # ydata-profiling -[![Build Status](https://github.com/ydataai/pandas-profiling/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/ydataai/pandas-profiling/actions/workflows/tests.yml) +[![Build Status](https://github.com/ydataai/ydata-profiling/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/ydataai/ydata-profiling/actions/workflows/tests.yml) [![PyPI download month](https://img.shields.io/pypi/dm/ydata-profiling.svg)](https://pypi.python.org/pypi/ydata-profiling/) [![](https://pepy.tech/badge/pandas-profiling)](https://pypi.org/project/ydata-profiling/) [![Code Coverage](https://codecov.io/gh/ydataai/pandas-profiling/branch/master/graph/badge.svg?token=gMptB4YUnF)](https://codecov.io/gh/ydataai/pandas-profiling) -[![Release Version](https://img.shields.io/github/release/ydataai/pandas-profiling.svg)](https://github.com/ydataai/pandas-profiling/releases) +[![Release Version](https://img.shields.io/github/release/ydataai/pandas-profiling.svg)](https://github.com/ydataai/ydata-profiling/releases) [![Python Version](https://img.shields.io/pypi/pyversions/ydata-profiling)](https://pypi.org/project/ydata-profiling/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) @@ -27,21 +27,21 @@ `ydata-profiling` primary goal is to provide a one-line Exploratory Data Analysis (EDA) experience in a consistent and fast solution. Like pandas `df.describe()` function, that is so handy, ydata-profiling delivers an extended analysis of a DataFrame while allowing the data analysis to be exported in different formats such as **html** and **json**. -The package outputs a simple and digested analysis of a dataset, including **time-series** and **text**. +The package outputs a simple and digested analysis of a dataset, including **time-series** and **text**. -### 🎊 New year, new face, more functionalities! 
+### 🎊 New year, new face, more functionalities! -Thank you for using and following ``pandas-profiling`` developments. Yet, we have a new exciting feature - we are now thrilled to announce +Thank you for using and following `pandas-profiling` developments. Yet, we have a new exciting feature - we are now thrilled to announce that Spark is now part of the Data Profiling family from version 4.0.0 onwards - -With its introduction, there was also the need for a new naming, one that will allow to decouple the concept of profiling from the Pandas Dataframes - `ydata-profiling`! - + +With its introduction, there was also the need for a new naming, one that will allow to decouple the concept of profiling from the Pandas Dataframes - `ydata-profiling`! + But fear not, `pip install pandas-profiling` will still be a valid for a while, and we will keep investing in growing the best open-source for data profiling, so you can use it for even more use cases. ## Key features -- **Type inference**: automatic detection of columns' data types (*Categorical*, *Numerical*, *Date*, etc.) -- **Warnings**: A summary of the problems/challenges in the data that you might need to work on (*missing data*, *inaccuracies*, *skewness*, etc.) +- **Type inference**: automatic detection of columns' data types (_Categorical_, _Numerical_, _Date_, etc.) +- **Warnings**: A summary of the problems/challenges in the data that you might need to work on (_missing data_, _inaccuracies_, _skewness_, etc.) - **Univariate analysis**: including descriptive statistics (mean, median, mode, etc) and informative visualizations such as distribution histograms - **Multivariate analysis**: including correlations, a detailed analysis of missing data, duplicate rows, and visual support for variables pairwise interaction - **Time-Series**: including different statistical information relative to time dependent data such as auto-correlation and seasonality, along ACF and PACF plots. 
@@ -58,7 +58,7 @@ The report contains three additional sections: ### 🎁 Latest features -- Want to scale? Check the latest release with ⭐⚡[Spark support](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pypspark.html)! +- Want to scale? Check the latest release with ⭐⚡[Spark support](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pyspark.html)! - Looking for how you can do an EDA for Time-Series 🕛 ? Check [this blogpost](https://towardsdatascience.com/how-to-do-an-eda-for-time-series-cbb92b3b1913). - You want to compare 2 datasets and get a report? Check [this blogpost](https://medium.com/towards-artificial-intelligence/how-to-compare-2-dataset-with-pandas-profiling-2ae3a9d7695e) @@ -68,16 +68,17 @@ Spark support has been released, but we are always looking for an extra pair of [Check current work in progress!](https://github.com/ydataai/ydata-profiling/projects/3). ## 📝 Use cases + YData-profiling can be used to deliver a variety of different use-case.
The documentation includes guides, tips and tricks for tackling them: -| Use case | Description | -|----------|---------------------------------------------------------------------------------------------| -| [Comparing datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/comparing_datasets.html ) | Comparing multiple version of the same dataset | -| [Profiling a Time-Series dataset](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/time_series_datasets.html) | Generating a report for a time-series dataset with a single line of code | -|[Profiling large datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/big_data.html ) | Tips on how to prepare data and configure `ydata-profiling` for working with large datasets | -| [Handling sensitive data](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/sensitive_data.html ) | Generating reports which are mindful about sensitive data in the input dataset | -| [Dataset metadata and data dictionaries](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/metadata.html) | Complementing the report with dataset details and column-specific data dictionaries | -| [Customizing the report's appearance](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/custom_report_appearance.html ) | Changing the appearance of the report's page and of the contained visualizations | +| Use case | Description | +| --------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| [Comparing datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/comparing_datasets.html) | Comparing multiple version of the same dataset | +| [Profiling a Time-Series dataset](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/time_series_datasets.html) | Generating a report for a time-series dataset with a single 
line of code | +| [Profiling large datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/big_data.html) | Tips on how to prepare data and configure `ydata-profiling` for working with large datasets | +| [Handling sensitive data](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/sensitive_data.html) | Generating reports which are mindful about sensitive data in the input dataset | +| [Dataset metadata and data dictionaries](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/metadata.html) | Complementing the report with dataset details and column-specific data dictionaries | +| [Customizing the report's appearance](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/custom_report_appearance.html) | Changing the appearance of the report's page and of the contained visualizations | ## ▢️ Quickstart @@ -149,25 +150,27 @@ Additional details on the CLI are available [on the documentation](https://ydata The following example reports showcase the potentialities of the package across a wide range of dataset and data types: -* [Census Income](https://ydata-profiling.ydata.ai/examples/master/census/census_report.html) (US Adult Census data relating income with other demographic properties) -* [NASA Meteorites](https://ydata-profiling.ydata.ai/examples/master/meteorites/meteorites_report.html) (comprehensive set of meteorite landing - object properties and locations) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/meteorites/meteorites_cloud.ipynb) 
[![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Fmeteorites%2Fmeteorites%5Fcloud.ipynb) -* [Titanic](https://ydata-profiling.ydata.ai/examples/master/titanic/titanic_report.html) (the "Wonderwall" of datasets) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/titanic/titanic_cloud.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Ftitanic%2Ftitanic%5Fcloud.ipynb) -* [NZA](https://ydata-profiling.ydata.ai/examples/master/nza/nza_report.html) (open data from the Dutch Healthcare Authority) -* [Stata Auto](https://ydata-profiling.ydata.ai/examples/master/stata_auto/stata_auto_report.html) (1978 Automobile data) -* [Colors](https://ydata-profiling.ydata.ai/examples/master/colors/colors_report.html) (a simple colors dataset) -* [Vektis](https://ydata-profiling.ydata.ai/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data) -* [UCI Bank Dataset](https://ydata-profiling.ydata.ai/examples/master/bank_marketing_data/uci_bank_marketing_report.html) (marketing dataset from a bank) -* [Russian Vocabulary](https://ydata-profiling.ydata.ai/examples/master/features/russian_vocabulary.html) (100 most common Russian words, showcasing unicode text analysis) -* [Website Inaccessibility](https://ydata-profiling.ydata.ai/examples/master/features/website_inaccessibility_report.html) (website accessibility analysis, showcasing support for URL data) 
-* [Orange prices](https://ydata-profiling.ydata.ai/examples/master/features/united_report.html) and -* [Coal prices](https://ydata-profiling.ydata.ai/examples/master/features/flatly_report.html) (simple pricing evolution datasets, showcasing the theming options) -* [USA Air Quality](https://github.com/ydataai/pandas-profiling/tree/master/examples/usaairquality) (Time-series air quality dataset EDA example) -* [HCC](https://github.com/ydataai/pandas-profiling/tree/master/examples/hcc) (Open dataset from healthcare, showcasing compare between two sets of data, before and after preprocessing) +- [Census Income](https://ydata-profiling.ydata.ai/examples/master/census/census_report.html) (US Adult Census data relating income with other demographic properties) +- [NASA Meteorites](https://ydata-profiling.ydata.ai/examples/master/meteorites/meteorites_report.html) (comprehensive set of meteorite landing - object properties and locations) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/meteorites/meteorites_cloud.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Fmeteorites%2Fmeteorites%5Fcloud.ipynb) +- [Titanic](https://ydata-profiling.ydata.ai/examples/master/titanic/titanic_report.html) (the "Wonderwall" of datasets) [![Open In 
Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/titanic/titanic_cloud.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Ftitanic%2Ftitanic%5Fcloud.ipynb) +- [NZA](https://ydata-profiling.ydata.ai/examples/master/nza/nza_report.html) (open data from the Dutch Healthcare Authority) +- [Stata Auto](https://ydata-profiling.ydata.ai/examples/master/stata_auto/stata_auto_report.html) (1978 Automobile data) +- [Colors](https://ydata-profiling.ydata.ai/examples/master/colors/colors_report.html) (a simple colors dataset) +- [Vektis](https://ydata-profiling.ydata.ai/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data) +- [UCI Bank Dataset](https://ydata-profiling.ydata.ai/examples/master/bank_marketing_data/uci_bank_marketing_report.html) (marketing dataset from a bank) +- [Russian Vocabulary](https://ydata-profiling.ydata.ai/examples/master/features/russian_vocabulary.html) (100 most common Russian words, showcasing unicode text analysis) +- [Website Inaccessibility](https://ydata-profiling.ydata.ai/examples/master/features/website_inaccessibility_report.html) (website accessibility analysis, showcasing support for URL data) +- [Orange prices](https://ydata-profiling.ydata.ai/examples/master/features/united_report.html) and +- [Coal prices](https://ydata-profiling.ydata.ai/examples/master/features/flatly_report.html) (simple pricing evolution datasets, showcasing the theming options) +- [USA Air Quality](https://github.com/ydataai/ydata-profiling/tree/master/examples/usaairquality) (Time-series air quality dataset EDA example) +- 
[HCC](https://github.com/ydataai/ydata-profiling/tree/master/examples/hcc) (Open dataset from healthcare, showcasing compare between two sets of data, before and after preprocessing) ## πŸ› οΈ Installation + Additional details, including information about widget support, are available [on the documentation](https://ydata-profiling.ydata.ai/docs/master/pages/getting_started/installation.html). ### Using pip + [![PyPi Downloads](https://pepy.tech/badge/ydata-profiling)](https://pepy.tech/project/ydata-profiling) [![PyPi Monthly Downloads](https://pepy.tech/badge/pandas-profiling/month)](https://pepy.tech/project/ydata-profiling/month) [![PyPi Version](https://badge.fury.io/py/ydata-profiling.svg)](https://pypi.org/project/ydata-profiling/) @@ -182,9 +185,9 @@ pip install -U ydata-profiling The package declares "extras", sets of additional dependencies. -* `[notebook]`: support for rendering the report in Jupyter notebook widgets. -* `[unicode]`: support for more detailed Unicode analysis, at the expense of additional disk space. -* `[pyspark]`: support for pyspark for big dataset analysis +- `[notebook]`: support for rendering the report in Jupyter notebook widgets. +- `[unicode]`: support for more detailed Unicode analysis, at the expense of additional disk space. +- `[pyspark]`: support for pyspark for big dataset analysis Install these with e.g. @@ -192,11 +195,10 @@ Install these with e.g. 
pip install -U ydata-profiling[notebook,unicode,pyspark] ``` - ### Using conda -[![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) -[![Conda Version](https://img.shields.io/conda/vn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) +[![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) +[![Conda Version](https://img.shields.io/conda/vn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) You can install using the `conda` package manager by running: @@ -206,7 +208,7 @@ conda install -c conda-forge ydata-profiling ### From source (development) -Download the source code by cloning the repository or click on [Download ZIP](https://github.com/ydataai/pandas-profiling/archive/master.zip) to download the latest stable version. +Download the source code by cloning the repository or click on [Download ZIP](https://github.com/ydataai/ydata-profiling/archive/master.zip) to download the latest stable version. Install it by navigating to the proper directory and running: @@ -214,31 +216,32 @@ Install it by navigating to the proper directory and running: pip install -e . ``` -The profiling report is written in HTML and CSS, which means a modern browser is required. +The profiling report is written in HTML and CSS, which means a modern browser is required. You need [Python 3](https://python3statement.org/) to run the package. 
Other dependencies can be found in the requirements files: -| Filename | Requirements| -|----------|-------------| -| [requirements.txt](https://github.com/ydataai/pandas-profiling/blob/master/requirements.txt) | Package requirements| -| [requirements-dev.txt](https://github.com/ydataai/pandas-profiling/blob/master/requirements-dev.txt) | Requirements for development| -| [requirements-test.txt](https://github.com/ydataai/pandas-profiling/blob/master/requirements-test.txt) | Requirements for testing| -| [setup.py](https://github.com/ydataai/pandas-profiling/blob/master/setup.py) | Requirements for widgets etc. | +| Filename | Requirements | +| ----------------------------------------------------------------------------------------------------- | ----------------------------- | +| [requirements.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements.txt) | Package requirements | +| [requirements-dev.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-dev.txt) | Requirements for development | +| [requirements-test.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-test.txt) | Requirements for testing | +| [setup.py](https://github.com/ydataai/ydata-profiling/blob/master/setup.py) | Requirements for widgets etc. 
| ## 🔗 Integrations -To maximize its usefulness in real world contexts, `pandas-profiling` has a set of implicit and explicit integrations with a variety of other actors in the Data Science ecosystem: +To maximize its usefulness in real world contexts, `pandas-profiling` has a set of implicit and explicit integrations with a variety of other actors in the Data Science ecosystem: -| Integration type | Description | -|---|---| -| [Other DataFrame libraries](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/other_dataframe_libraries.html) | How to compute the profiling of data stored in libraries other than pandas | -| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | -| [Interactive applications](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/data_apps.html) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | -| [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | -| [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `pandas-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), [Google Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | -| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `pandas-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | +| 
Integration type | Description | +| --------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Other DataFrame libraries](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/other_dataframe_libraries.html) | How to compute the profiling of data stored in libraries other than pandas | +| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | +| [Interactive applications](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/data_apps.html) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | +| [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | +| [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `pandas-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), [Google Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | +| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `pandas-profiling` directly from integrated development environments such as 
[PyCharm](https://www.jetbrains.com/pycharm/) | ## 🙋 Support + Need help? Want to share a perspective? Report a bug? Ideas for collaborations? Reach out via the following channels: - [Stack Overflow](https://stackoverflow.com/questions/tagged/pandas-profiling+or+ydata-profiling): ideal for asking questions on how to use the package diff --git a/docsrc/source/pages/getting_started/overview.rst b/docsrc/source/pages/getting_started/overview.rst index 86c250ca6..323e11fe2 100644 --- a/docsrc/source/pages/getting_started/overview.rst +++ b/docsrc/source/pages/getting_started/overview.rst @@ -5,9 +5,9 @@ Overview .. image:: https://ydataai.github.io/pandas-profiling/docs/assets/logo_header.png :alt: Pandas Profiling Logo Header -.. image:: https://github.com/ydataai/pandas-profiling/actions/workflows/tests.yml/badge.svg?branch=master +.. image:: https://github.com/ydataai/ydata-profiling/actions/workflows/tests.yml/badge.svg?branch=master :alt: Build Status - :target: https://github.com/ydataai/pandas-profiling/actions/workflows/tests.yml + :target: https://github.com/ydataai/ydata-profiling/actions/workflows/tests.yml .. image:: https://codecov.io/gh/ydataai/pandas-profiling/branch/master/graph/badge.svg?token=gMptB4YUnF :alt: Code Coverage @@ -15,7 +15,7 @@ Overview .. image:: https://img.shields.io/github/release/pandas-profiling/pandas-profiling.svg :alt: Release Version - :target: https://github.com/ydataai/pandas-profiling/releases + :target: https://github.com/ydataai/ydata-profiling/releases .. 
image:: https://img.shields.io/pypi/pyversions/pandas-profiling :alt: Python Version diff --git a/docsrc/source/pages/getting_started/quickstart.rst b/docsrc/source/pages/getting_started/quickstart.rst index a2d447c80..c708385bc 100644 --- a/docsrc/source/pages/getting_started/quickstart.rst +++ b/docsrc/source/pages/getting_started/quickstart.rst @@ -87,7 +87,7 @@ Information about all available options and arguments can be viewed through the Deeper profiling ---------------- -The contents, behaviour and appearance of the report are easily customizable. The example code below loads the `explorative configuration file `_, +The contents, behaviour and appearance of the report are easily customizable. The example code below loads the `explorative configuration file `_, which includes many features for text analysis (length distribution, word distribution and character/unicode information), files (file size, creation time) and images (dimensions, EXIF information). The exact settings used in this explorative configuration file can be compared with the `default configuration file `_. diff --git a/docsrc/source/pages/integrations/cloud_services.rst b/docsrc/source/pages/integrations/cloud_services.rst index be28ab82a..8eeeef4d6 100644 --- a/docsrc/source/pages/integrations/cloud_services.rst +++ b/docsrc/source/pages/integrations/cloud_services.rst @@ -21,4 +21,4 @@ The Google Cloud Platform documentation features an article that uses ``ydata-pr Kaggle ------ -``ydata-profiling`` is available in `Kaggle notebooks `_ by default, as it is included in the `standard Kaggle image `_. +``ydata-profiling`` is available in `Kaggle notebooks `_ by default, as it is included in the `standard Kaggle image `_. 
diff --git a/docsrc/source/pages/integrations/data_apps.rst b/docsrc/source/pages/integrations/data_apps.rst index 8436045c1..d17406c01 100644 --- a/docsrc/source/pages/integrations/data_apps.rst +++ b/docsrc/source/pages/integrations/data_apps.rst @@ -39,7 +39,7 @@ You can install the `pandas-profiling component `_ is a Python framework for building machine learning & data science web apps, built on top of Plotly.js, React and Flask. It is commonly used for interactive data exploration, precisely where ``ydata-profiling`` also focuses. Inline access to the insights provided by ``pandas-profiling`` can help guide the exploratory work allowed by Dash. To integrate a Profiling Report inside a Dash app, two options exist: +`Dash `_ is a Python framework for building machine learning & data science web apps, built on top of Plotly.js, React and Flask. It is commonly used for interactive data exploration, precisely where ``ydata-profiling`` also focuses. Inline access to the insights provided by ``pandas-profiling`` can help guide the exploratory work allowed by Dash. To integrate a Profiling Report inside a Dash app, two options exist: Load HTML version of report as an asset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docsrc/source/pages/integrations/great_expectations.rst b/docsrc/source/pages/integrations/great_expectations.rst index b0b366d5b..d834bea94 100644 --- a/docsrc/source/pages/integrations/great_expectations.rst +++ b/docsrc/source/pages/integrations/great_expectations.rst @@ -75,7 +75,7 @@ You can also configure each feature individually in the function call: handler=handler, ) -See `the Great Expectations Examples `_ for complete examples. +See `the Great Expectations Examples `_ for complete examples. Included Expectation types @@ -108,7 +108,7 @@ The ``to_expectation_suite`` method returns a default set of Expectations if ``p * ``expect_file_to_exist`` -The default logic is straight forward and can be found in `expectation_algorithms.py `_. 
+The default logic is straight forward and can be found in `expectation_algorithms.py `_. Rolling your own Expectation Generation Logic --------------------------------------------- diff --git a/docsrc/source/pages/integrations/pipelines.rst b/docsrc/source/pages/integrations/pipelines.rst index b4590d981..708a255e4 100644 --- a/docsrc/source/pages/integrations/pipelines.rst +++ b/docsrc/source/pages/integrations/pipelines.rst @@ -7,7 +7,7 @@ With Python, command-line and Jupyter interfaces, ``pandas-profiling`` integrate Airflow ------- -Integration with Airflow can be easily achieved through the `BashOperator `_ or the `PythonOperator `_. +Integration with Airflow can be easily achieved through the `BashOperator `_ or the `PythonOperator `_. .. code-block:: python diff --git a/docsrc/source/pages/integrations/pyspark.rst b/docsrc/source/pages/integrations/pyspark.rst index 26b5939ff..a9f5ffb71 100644 --- a/docsrc/source/pages/integrations/pyspark.rst +++ b/docsrc/source/pages/integrations/pyspark.rst @@ -91,6 +91,6 @@ ydata-profiling in Databricks Yes! We have fantastic new coming with a full tutorial on how you can use ydata-profiling in Databricks Notebooks. -The notebook example can be found `here `_. +The notebook example can be found `here `_. Stay tuned - we are going to update the documentation soon! \ No newline at end of file diff --git a/docsrc/source/pages/reference/announcements/2020-04-14-release-v2-6-0.rst b/docsrc/source/pages/reference/announcements/2020-04-14-release-v2-6-0.rst index e6047a320..9cf564b56 100644 --- a/docsrc/source/pages/reference/announcements/2020-04-14-release-v2-6-0.rst +++ b/docsrc/source/pages/reference/announcements/2020-04-14-release-v2-6-0.rst @@ -32,6 +32,6 @@ Therefore, we welcome you to support the project through GitHub! 
Find more information here: - `Sponsor the project on GitHub `_ -- `Read the release notes v2.6.0 `_ +- `Read the release notes v2.6.0 `_ April 14, 2020 💘 \ No newline at end of file diff --git a/docsrc/source/pages/reference/changelog.rst b/docsrc/source/pages/reference/changelog.rst index 7abce534c..b69865bc7 100644 --- a/docsrc/source/pages/reference/changelog.rst +++ b/docsrc/source/pages/reference/changelog.rst @@ -58,4 +58,4 @@ Changelog Prior to v2.7.0 --------------- -Previously, there was no explicit changelog. However, changes were included in the release description on GitHub, which you can find `in this page `_. +Previously, there was no explicit changelog. However, changes were included in the release description on GitHub, which you can find `in this page `_. diff --git a/docsrc/source/pages/reference/changelog/v2_10_1.rst b/docsrc/source/pages/reference/changelog/v2_10_1.rst index 819e73aa3..dd73150aa 100644 --- a/docsrc/source/pages/reference/changelog/v2_10_1.rst +++ b/docsrc/source/pages/reference/changelog/v2_10_1.rst @@ -3,13 +3,13 @@ Changelog v2.10.1 🐛 Bug fixes ^^^^^^^^^^^^ -- Fixed recursion error for NaN values `[683] `_ and `[671] `_ -- Fixed error for empty dataframe `[664] `_ -- Fixed Jupyter notebook widget string rendering issue `[668] `_ -- Fixed histogram of string length with NaNs `[642] `_ and `[613] `_ -- Fixed slugify logic for interaction columns `[663] `_ +- Fixed recursion error for NaN values `[683] `_ and `[671] `_ +- Fixed error for empty dataframe `[664] `_ +- Fixed Jupyter notebook widget string rendering issue `[668] `_ +- Fixed histogram of string length with NaNs `[642] `_ and `[613] `_ +- Fixed slugify logic for interaction columns `[663] `_ 📖 Documentation ^^^^^^^^^^^^^^^^ -- Update Slack community link on readme `[673] `_ +- Update Slack community link on readme `[673] `_ - Include recent contributions to the "Resources" page. 
\ No newline at end of file diff --git a/docsrc/source/pages/reference/changelog/v2_11_0.rst b/docsrc/source/pages/reference/changelog/v2_11_0.rst index 6e915ebd5..590ef2a8b 100644 --- a/docsrc/source/pages/reference/changelog/v2_11_0.rst +++ b/docsrc/source/pages/reference/changelog/v2_11_0.rst @@ -3,8 +3,8 @@ Changelog v2.11.0 🎉 Features ^^^^^^^^^^^ -- Great Expectations integration `[430] `_ `docs `_ (thanks @spbail, @talagluck and the Great Expectations team). -- Introduced the ``infer_dtypes`` parameter to control automatic inference of data types `[676] `_ (thanks @mohith7548 and @ieaves). +- Great Expectations integration `[430] `_ `docs `_ (thanks @spbail, @talagluck and the Great Expectations team). +- Introduced the ``infer_dtypes`` parameter to control automatic inference of data types `[676] `_ (thanks @mohith7548 and @ieaves). - Improved JSON representation for pd.Series, pd.DataFrame, numpy data and Samples. 🚨 Breaking changes diff --git a/docsrc/source/pages/reference/changelog/v2_12_0.rst b/docsrc/source/pages/reference/changelog/v2_12_0.rst index 60437603b..c0bb78c3b 100644 --- a/docsrc/source/pages/reference/changelog/v2_12_0.rst +++ b/docsrc/source/pages/reference/changelog/v2_12_0.rst @@ -3,14 +3,14 @@ Changelog v2.12.0 🎉 Features ^^^^^^^^^^^ -- Add the number and the percentage of negative values for numerical variables `[695] `_ (contributed by @gverbock) +- Add the number and the percentage of negative values for numerical variables `[695] `_ (contributed by @gverbock) - Enable setting of typeset/summarizer (contributed by @ieaves) -- Allow empty data frames `[678] `_ (contributed by @spbail, @fwd2020-c) +- Allow empty data frames `[678] `_ (contributed by @spbail, @fwd2020-c) 🐛 Bug fixes ^^^^^^^^^^^^ -- Patch args for great_expectations datetime profiler `[727] `_ (contributed by @jstammers) -- Negative exponent formatting `[723] `_ (reported by @rdpapworth) +- Patch args for great_expectations datetime profiler `[727] `_ 
(contributed by @jstammers) +- Negative exponent formatting `[723] `_ (reported by @rdpapworth) πŸ“– Documentation ^^^^^^^^^^^^^^^^ diff --git a/docsrc/source/pages/reference/changelog/v2_7_0.rst b/docsrc/source/pages/reference/changelog/v2_7_0.rst index ccf5007bd..6178c2d19 100644 --- a/docsrc/source/pages/reference/changelog/v2_7_0.rst +++ b/docsrc/source/pages/reference/changelog/v2_7_0.rst @@ -4,35 +4,35 @@ Changelog v2.7.0 πŸŽ‰ Features ^^^^^^^^^^^ -- Reports are built in phases, see issue for details (`#421 `_) +- Reports are built in phases, see issue for details (`#421 `_) - The most occurring duplicates rows are included in the report. - ProfileReports can now be saved to and loaded from disk (for caching). - Explicit analysis duration is added to the reproduction section of the report. - **Doc**: this version introduces documentation powered by Sphinx. The previously used pdoc3 has been adequate initially, however misses functionality and extensibility. -- **Doc**: Dedicated page for large datasets is created (`#420 `_). -- **Doc**: The installation instructions have been extended, installation via conda would default to 1.4.1 (`#449 `_, `#448 `_). +- **Doc**: Dedicated page for large datasets is created (`#420 `_). +- **Doc**: The installation instructions have been extended, installation via conda would default to 1.4.1 (`#449 `_, `#448 `_). - **CI**: Linting, building the documentation and examples and uploading the package to PyPi have been automated using git flow and Github Actions. πŸ› Bug fixes ^^^^^^^^^^^^ -- warnings were not shown in the "warnings" tab, but were at variable level (`#389 `_). -- The "median absolute deviation" is now reported instead of the "mean absolute deviation" (`#453 `_). +- warnings were not shown in the "warnings" tab, but were at variable level (`#389 `_). +- The "median absolute deviation" is now reported instead of the "mean absolute deviation" (`#453 `_). 
- Several style-related fixes for Jupyter lab and notebooks (tables, warnings, wide images). -- ``pd.NAN`` introduced in ``pandas`` 1 now supported (`#437 `_). -- The logic for calculating infinite values is now correct (`#397 `_). +- ``pd.NAN`` introduced in ``pandas`` 1 now supported (`#437 `_). +- The logic for calculating infinite values is now correct (`#397 `_). 👷‍♂️ Internal Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The number of progress bars is reduced. The progress bars are now grouped by build phase (e.g. describing dataset, building report structure, rendering report, exporting to file). -- The progress bars provide more information about the current step to the user `#434 `_). -- Invalid correlations coefficients do not cause it to drop the complete variable anymore, instead the plot now propagates the NaN (`#417 `_). +- The progress bars provide more information about the current step to the user `#434 `_). +- Invalid correlations coefficients do not cause it to drop the complete variable anymore, instead the plot now propagates the NaN (`#417 `_). - **Performance**: type inference test now short-circuit, as ``visions`` does by default. - **Performance**: the numerical summary is optimized to use ``numpy`` directly, instead of slower methods provided by ``pandas``. -- **Config**: dynamic histogram bins are now disabled by default default for better default computational performance (`#441 `_). +- **Config**: dynamic histogram bins are now disabled by default default for better default computational performance (`#441 `_). - **Config**: type inference to warning when date variables are processed as categorical is set to False by default for being a bottleneck for larger datasets. -- **Warn**: the user is warned that the ``to_widgets`` does not work in Google Colab, which doesn't support ``ipywidgets`` properly (`#462 `_). 
+- **Warn**: the user is warned that the ``to_widgets`` does not work in Google Colab, which doesn't support ``ipywidgets`` properly (`#462 `_). - **Cln**: Moved ProfileReport out of ``__init__`` to it's own class file. - **Cln**: removed the ``output_file`` parameter form examples. - **Cln**: the HTML representation of the footer and wrapper are moved out of ProfileReport to the report structure. diff --git a/docsrc/source/pages/reference/changelog/v2_8_0.rst b/docsrc/source/pages/reference/changelog/v2_8_0.rst index cc8498d10..d16d19c30 100644 --- a/docsrc/source/pages/reference/changelog/v2_8_0.rst +++ b/docsrc/source/pages/reference/changelog/v2_8_0.rst @@ -5,18 +5,18 @@ Changelog v2.8.0 ^^^^^^^^^^^ - Expanded the Unicode analysis capabilities: next to the most occurring unicode scripts, categories and blocks, it's now possible to inspect the most frequent characters for each of them. - ProfileReport.set_variable now accepts nested parameters such as ``report.set_variable("variables.descriptions", {"var1": "Identifier"})``. -- Ability to have descriptions of the variables alongside the descriptive statistics (`#232 `_, `#402 `_). +- Ability to have descriptions of the variables alongside the descriptive statistics (`#232 `_, `#402 `_). - **Config**: Introducing config shorthands. - **Config**: ``plot.scatter_threshold`` allows for configuration above what value scatter plots are replace with hexbin plots. -- **Config**: ``html.inline`` allows for rendering assets as vector images to package export as folder and file (similar to exporting a website). (`#452 `_). -- It's now possible to specify which interactions to compute to filter out un-needed interactions between columns (`#451 `_). +- **Config**: ``html.inline`` allows for rendering assets as vector images to package export as folder and file (similar to exporting a website). (`#452 `_). 
+- It's now possible to specify which interactions to compute to filter out un-needed interactions between columns (`#451 `_). - When the ``output_file`` is omitted in the CLI, it uses the ``input_file`` with HTML extensions. This can be useful when profiling of a complete directory from the command line, e.g. ``find . -type f -name "*.csv" -exec pandas_profiling {} \;``. - **Config**: Split the ``vars.cat.check_composition`` in ``vars.cat.unicode`` and ``vars.cat.length`` for more control on the summaries. - **Config**: Included a new configuration sample file ``config_explorative.yml``, including ``Text`` (length distribution, unicode information), ``File`` (file size, creation time), ``Image`` (dimensions, exif information). πŸ› Bug fixes ^^^^^^^^^^^^ -- Resolved color ValueError on Mac (`#464 `_). +- Resolved color ValueError on Mac (`#464 `_). - **Style**: too many interactions overflowed tabs. Now they elegantly turn into a select control. - Unique variables are always uniform and have high cardinality, hence we can remove the redundant labels. - The counts for unicode properties were based on unique characters, instead of following the original frequency distribution. diff --git a/docsrc/source/pages/reference/changelog/v2_9_0rc1.rst b/docsrc/source/pages/reference/changelog/v2_9_0rc1.rst index 0d7355c96..7c5777e19 100644 --- a/docsrc/source/pages/reference/changelog/v2_9_0rc1.rst +++ b/docsrc/source/pages/reference/changelog/v2_9_0rc1.rst @@ -3,28 +3,28 @@ Changelog v2.9.0rc1 πŸŽ‰ Features ^^^^^^^^^^^ -- Working with sensitive data: Introduced ``sensitive=True`` option to mask non-aggregated data (such as samples, duplicates, frequency tables for categorical columns) [`#503 `_]. +- Working with sensitive data: Introduced ``sensitive=True`` option to mask non-aggregated data (such as samples, duplicates, frequency tables for categorical columns) [`#503 `_]. - The sample section can be parametrized with a custom sample (for instance mock data). 
-- Introduce shorthands for groups of parameters for styles and explorative mode [`#499 `_]. +- Introduce shorthands for groups of parameters for styles and explorative mode [`#499 `_]. - Metadata of a dataset can be added to the report (see documentation). - Numeric columns now report monotonicity information. - A pie chart can be generated for boolean and (low) categorical columns. πŸ› Bug fixes ^^^^^^^^^^^^ -- NaT in date columns were interpreted as a date in 1680 by histograms [`#507 `_]. -- ValueError: ('widget type not understood', 'select') [`#493 `_]. -- Fixed regression in working with pandas' nullable integers [`#502 `_]. +- NaT in date columns were interpreted as a date in 1680 by histograms [`#507 `_]. +- ValueError: ('widget type not understood', 'select') [`#493 `_]. +- Fixed regression in working with pandas' nullable integers [`#502 `_]. - Formatting of precision of numeric values has been improved in a few places. πŸ‘·β€β™‚οΈ Internal Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Histograms used to be calculated at view time (single thread) and are now computed in parallel. -- Matplotlib's rcParams are now modified through the contextmanager [`#494 `_]. +- Matplotlib's rcParams are now modified through the contextmanager [`#494 `_]. πŸ“– Documentation ^^^^^^^^^^^^^^^^ -- Links to Colab and Binder notebooks [`#480 `_ and `#497 `_]. +- Links to Colab and Binder notebooks [`#480 `_ and `#497 `_]. - The documentation for sensitive data, large datasets and metadata have been extended. 
🚨 Breaking changes diff --git a/docsrc/source/pages/reference/changelog/v3_0_0.rst b/docsrc/source/pages/reference/changelog/v3_0_0.rst index 94e958711..940a4cc49 100644 --- a/docsrc/source/pages/reference/changelog/v3_0_0.rst +++ b/docsrc/source/pages/reference/changelog/v3_0_0.rst @@ -11,8 +11,8 @@ This is the first release to adhere to the `SemVer `_ and ` πŸ› Bug fixes ^^^^^^^^^^^^ -- Various issues could not be (easily) solved in the previous configuration architecture, are fixed in this release (`[584] `_, `[644] `_, `[698] `_, `[720] `_ and `[724] `_) -- Fix crash with exotic characters (`[707] `_) +- Various issues could not be (easily) solved in the previous configuration architecture, are fixed in this release (`[584] `_, `[644] `_, `[698] `_, `[720] `_ and `[724] `_) +- Fix crash with exotic characters (`[707] `_) - Fixed the way (sub)titles were shown in the report grids. πŸ“– Documentation diff --git a/docsrc/source/pages/reference/changelog/v3_2_0.rst b/docsrc/source/pages/reference/changelog/v3_2_0.rst index 2afac0702..e703ad4e3 100644 --- a/docsrc/source/pages/reference/changelog/v3_2_0.rst +++ b/docsrc/source/pages/reference/changelog/v3_2_0.rst @@ -3,15 +3,15 @@ Changelog v3.2.0 πŸŽ‰ Features ^^^^^^^^^^^ -- Add stop words to word_summary_vc `[#863] `_ +- Add stop words to word_summary_vc `[#863] `_ - show categorical freq with stacked barh instead of pie - Make pie plot colors customizable πŸ› Bug fixes ^^^^^^^^^^^^ -- Fix pandas 1.4.x compatibility `[#911] `_ +- Fix pandas 1.4.x compatibility `[#911] `_ - Omit setting of mpl backend (special thanks to `Jake Odom `_ -- Character counts bugfix `[#842] `_ +- Character counts bugfix `[#842] `_ - Default type for render map (Unsupported) πŸ‘·β€β™‚οΈ Internal Improvements diff --git a/docsrc/source/pages/reference/changelog/v3_3_0.rst b/docsrc/source/pages/reference/changelog/v3_3_0.rst index a0db3f86d..3c1004c76 100644 --- a/docsrc/source/pages/reference/changelog/v3_3_0.rst +++ 
b/docsrc/source/pages/reference/changelog/v3_3_0.rst @@ -7,8 +7,8 @@ Changelog v3.3.0 πŸ› Bug fixes ^^^^^^^^^^^^ -- High correlation warning printed multiple time `[#1019] `_ `[#824] `_ -- Incorrect duplicated rows count `[#1012] `_ +- High correlation warning printed multiple time `[#1019] `_ `[#824] `_ +- Incorrect duplicated rows count `[#1012] `_ πŸ‘·β€β™‚οΈ Internal Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docsrc/source/pages/reference/changelog/v3_3_1.rst b/docsrc/source/pages/reference/changelog/v3_3_1.rst index 27b40be0f..28df3bc4c 100644 --- a/docsrc/source/pages/reference/changelog/v3_3_1.rst +++ b/docsrc/source/pages/reference/changelog/v3_3_1.rst @@ -5,6 +5,6 @@ Changelog v3.3.1 ^^^^^^^^^^^^ - remove unused imports - (`66864c1 `__) + (`66864c1 `__) - Remove unused imports. - (`985fbd1 `__) + (`985fbd1 `__) diff --git a/docsrc/source/pages/reference/changelog/v3_4_0.rst b/docsrc/source/pages/reference/changelog/v3_4_0.rst index 0d45c4f8e..f8725e186 100644 --- a/docsrc/source/pages/reference/changelog/v3_4_0.rst +++ b/docsrc/source/pages/reference/changelog/v3_4_0.rst @@ -5,38 +5,38 @@ Changelog v3.4.0 ^^^^^^^^^^^^ - correlation passing extra parameters - (`#1114 `__) - (`21f4fe6 `__) + (`#1114 `__) + (`21f4fe6 `__) - cramer’s correlation fails with missings vals - (`#1109 `__) - (`8e7f8b2 `__) + (`#1109 `__) + (`8e7f8b2 `__) - drop joblib dependency - (`#1090 `__) - (`586cef3 `__), + (`#1090 `__) + (`586cef3 `__), closes - `#1056 `__ + `#1056 `__ - fix linter errors - (`#1117 `__) - (`5f17cfd `__) + (`#1117 `__) + (`5f17cfd `__) - make tangled-up-in-unicode an optional dependency - (`#1070 `__) - (`e6b2a00 `__) + (`#1070 `__) + (`e6b2a00 `__) - remove unused imports - (`56beed4 `__) + (`56beed4 `__) - remove unused imports - (`66864c1 `__) + (`66864c1 `__) - Remove unused imports. 
- (`985fbd1 `__) + (`985fbd1 `__) πŸŽ‰ Features ^^^^^^^^^^^^ - add support for Pandas 1.5 - (`#1076 `__) - (`5c5a710 `__) + (`#1076 `__) + (`5c5a710 `__) - added filter to locate columns - (`#1115 `__) - (`c2f817d `__) + (`#1115 `__) + (`c2f817d `__) - introduce auto parameter for correlations - (`#1095 `__) - (`4d2e415 `__) + (`#1095 `__) + (`4d2e415 `__) diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md index f4899e4e4..814a94bac 100644 --- a/docsrc/source/pages/reference/changelog/v3_5_0.md +++ b/docsrc/source/pages/reference/changelog/v3_5_0.md @@ -1,20 +1,18 @@ ### Changelog v3.5.0 - #### πŸ› Bug fixes -* change context managed backend ([#1149](https://github.com/ydataai/pandas-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/pandas-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1)) -* dataset names on comparison report ([#1159](https://github.com/ydataai/pandas-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/pandas-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18)) -* duplicate key in test dict ([#1126](https://github.com/ydataai/pandas-profiling/issues/1126)) ([d19affe](https://github.com/ydataai/pandas-profiling/commit/d19affe15a4e3063af7187ca5fa81f1bf75ce648)) -* improve description and correct plot for β€˜auto’ correlation ([#1119](https://github.com/ydataai/pandas-profiling/issues/1119)) ([2617b92](https://github.com/ydataai/pandas-profiling/commit/2617b92d08ed87546c80e0cc01cd475d1e60ec56)) -* remove correlation calculation for constants ([#1152](https://github.com/ydataai/pandas-profiling/issues/1152)) ([1ed2bc0](https://github.com/ydataai/pandas-profiling/commit/1ed2bc0702f504592ed211097469405a5061a857)) -* time series render format ([#1157](https://github.com/ydataai/pandas-profiling/issues/1157)) ([39ca8ce](https://github.com/ydataai/pandas-profiling/commit/39ca8ce7d4ed2ad0ebb78db5d5f26d3ace08753a)) -* update config files to only calculate 
'auto' correlation ([#1158](https://github.com/ydataai/pandas-profiling/issues/1158)) ([34cf73d](https://github.com/ydataai/pandas-profiling/commit/34cf73dadaea08e44e741f99fa0a10c322c86109)) -* update repository links ([#1141](https://github.com/ydataai/pandas-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/pandas-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5)) - +- change context managed backend ([#1149](https://github.com/ydataai/ydata-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/ydata-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1)) +- dataset names on comparison report ([#1159](https://github.com/ydataai/ydata-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/ydata-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18)) +- duplicate key in test dict ([#1126](https://github.com/ydataai/ydata-profiling/issues/1126)) ([d19affe](https://github.com/ydataai/ydata-profiling/commit/d19affe15a4e3063af7187ca5fa81f1bf75ce648)) +- improve description and correct plot for β€˜auto’ correlation ([#1119](https://github.com/ydataai/ydata-profiling/issues/1119)) ([2617b92](https://github.com/ydataai/ydata-profiling/commit/2617b92d08ed87546c80e0cc01cd475d1e60ec56)) +- remove correlation calculation for constants ([#1152](https://github.com/ydataai/ydata-profiling/issues/1152)) ([1ed2bc0](https://github.com/ydataai/ydata-profiling/commit/1ed2bc0702f504592ed211097469405a5061a857)) +- time series render format ([#1157](https://github.com/ydataai/ydata-profiling/issues/1157)) ([39ca8ce](https://github.com/ydataai/ydata-profiling/commit/39ca8ce7d4ed2ad0ebb78db5d5f26d3ace08753a)) +- update config files to only calculate 'auto' correlation ([#1158](https://github.com/ydataai/ydata-profiling/issues/1158)) ([34cf73d](https://github.com/ydataai/ydata-profiling/commit/34cf73dadaea08e44e741f99fa0a10c322c86109)) +- update repository links ([#1141](https://github.com/ydataai/ydata-profiling/issues/1141)) 
([c742c5d](https://github.com/ydataai/ydata-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5)) #### πŸŽ‰ Features -* add typechecking to profile report ([#1139](https://github.com/ydataai/pandas-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/pandas-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61)) -* report comparison example ([#1160](https://github.com/ydataai/pandas-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/pandas-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73)) -* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150) +- add typechecking to profile report ([#1139](https://github.com/ydataai/ydata-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/ydata-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61)) +- report comparison example ([#1160](https://github.com/ydataai/ydata-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/ydata-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73)) +- report comparisons ([#1069](https://github.com/ydataai/ydata-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/ydata-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/ydata-profiling/issues/1137) [#1136](https://github.com/ydataai/ydata-profiling/issues/1136) [#1143](https://github.com/ydataai/ydata-profiling/issues/1143) [#1148](https://github.com/ydataai/ydata-profiling/issues/1148) [#1150](https://github.com/ydataai/ydata-profiling/issues/1150) diff --git 
a/docsrc/source/pages/reference/changelog/v3_5_1.md b/docsrc/source/pages/reference/changelog/v3_5_1.md index f6274d6d8..881ad761f 100644 --- a/docsrc/source/pages/reference/changelog/v3_5_1.md +++ b/docsrc/source/pages/reference/changelog/v3_5_1.md @@ -1,6 +1,5 @@ ### Changelog v3.5.1 - #### 🐛 Bug fixes -* **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/pandas-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/pandas-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/pandas-profiling/issues/1168) \ No newline at end of file +- **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) diff --git a/docsrc/source/pages/reference/changelog/v3_6_0.md b/docsrc/source/pages/reference/changelog/v3_6_0.md index d2ed762e1..925d9e3d5 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_0.md +++ b/docsrc/source/pages/reference/changelog/v3_6_0.md @@ -1,34 +1,32 @@ -# [3.6.0](https://github.com/ydataai/pandas-profiling/compare/v3.5.0...v3.6.0) (2022-12-21) - +# [3.6.0](https://github.com/ydataai/ydata-profiling/compare/v3.5.0...v3.6.0) (2022-12-21) #### 🐛 Bug fixes -* add css to cope with large tables ([7f42f87](https://github.com/ydataai/pandas-profiling/commit/7f42f87cedd06694fe83241416e1fa21327b8c97)) -* adjust categoricals layout ([f0bb45a](https://github.com/ydataai/pandas-profiling/commit/f0bb45a2a2d89b5c6e77fd20939e069979b2b948)) -* categorical data not being obscured in the common values plot ([40236bc](https://github.com/ydataai/pandas-profiling/commit/40236bc67619a8aadeae797920c6238616169641)) -* compare report ignoring config parameter 
([3d60556](https://github.com/ydataai/pandas-profiling/commit/3d6055675579d72a5ddf34c4c85e94befb403e72)) -* compare report warnings always showing the last alert type ([6b3c13d](https://github.com/ydataai/pandas-profiling/commit/6b3c13dd33489c8a895b2db1854b23a7edd3b948)) -* comparison fails when duplicates are disable ([#1208](https://github.com/ydataai/pandas-profiling/issues/1208)) ([6d19620](https://github.com/ydataai/pandas-profiling/commit/6d1962044d5bcf634266998551328bd3cdeb354c)) -* do no raise exception for percentage formatter ([3ea626d](https://github.com/ydataai/pandas-profiling/commit/3ea626de3d839a55fb0fac9dc7a5fa1da18ba037)) -* enforce recomputation of description sets ([a9fd1c8](https://github.com/ydataai/pandas-profiling/commit/a9fd1c845511679a18c87a9566d343ea945e9f16)) -* error comparing only one precomputed profile ([00646cd](https://github.com/ydataai/pandas-profiling/commit/00646cde15e0fb0dad29e4bd3cc5747b3eff61e2)) -* **html:** sensible cloud-platform notebook html rendering ([b22ece2](https://github.com/ydataai/pandas-profiling/commit/b22ece261c0e9a74254361b6b7e121ab94abe44d)) -* ignoring config of precomputed reports ([6478c40](https://github.com/ydataai/pandas-profiling/commit/6478c4047ee871ede7f7aa76379818ee3217e7d7)) -* only compute auto correlation when no config is specified ([d5d4f58](https://github.com/ydataai/pandas-profiling/commit/d5d4f58d3b0728bed021677ffb7be14cb7f04f27)) -* remove malfunctioning hook ([e2593f5](https://github.com/ydataai/pandas-profiling/commit/e2593f5bb093117c7afb8914eafbda6e2e110782)) -* remove unused test ([2170338](https://github.com/ydataai/pandas-profiling/commit/21703385a42bf38d4306511e0f99bed9e1092991)) -* return the proper type for widgets ([4c0b358](https://github.com/ydataai/pandas-profiling/commit/4c0b358002d75139c23babc30cbc0c7b23534d92)) -* set compute default to false ([c70e491](https://github.com/ydataai/pandas-profiling/commit/c70e49136fbdf1d3fe7e6ef5b23a8adbd0567ecf)) -* solve mypy error 
([9c4266e](https://github.com/ydataai/pandas-profiling/commit/9c4266eb1cb252d8008795080723598d2d151e26)) -* solve mypy issue ([e3e7788](https://github.com/ydataai/pandas-profiling/commit/e3e7788907eebcf572423b48800f848d965f5969)) -* uses colors from the specified config ([c0c556d](https://github.com/ydataai/pandas-profiling/commit/c0c556d29cc191d44fdb08fc813818709c1b0666)) -* **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/pandas-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/pandas-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/pandas-profiling/issues/1168) - +- add css to cope with large tables ([7f42f87](https://github.com/ydataai/ydata-profiling/commit/7f42f87cedd06694fe83241416e1fa21327b8c97)) +- adjust categoricals layout ([f0bb45a](https://github.com/ydataai/ydata-profiling/commit/f0bb45a2a2d89b5c6e77fd20939e069979b2b948)) +- categorical data not being obscured in the common values plot ([40236bc](https://github.com/ydataai/ydata-profiling/commit/40236bc67619a8aadeae797920c6238616169641)) +- compare report ignoring config parameter ([3d60556](https://github.com/ydataai/ydata-profiling/commit/3d6055675579d72a5ddf34c4c85e94befb403e72)) +- compare report warnings always showing the last alert type ([6b3c13d](https://github.com/ydataai/ydata-profiling/commit/6b3c13dd33489c8a895b2db1854b23a7edd3b948)) +- comparison fails when duplicates are disable ([#1208](https://github.com/ydataai/ydata-profiling/issues/1208)) ([6d19620](https://github.com/ydataai/ydata-profiling/commit/6d1962044d5bcf634266998551328bd3cdeb354c)) +- do no raise exception for percentage formatter ([3ea626d](https://github.com/ydataai/ydata-profiling/commit/3ea626de3d839a55fb0fac9dc7a5fa1da18ba037)) +- enforce recomputation of description sets ([a9fd1c8](https://github.com/ydataai/ydata-profiling/commit/a9fd1c845511679a18c87a9566d343ea945e9f16)) +- error comparing only one precomputed 
profile ([00646cd](https://github.com/ydataai/ydata-profiling/commit/00646cde15e0fb0dad29e4bd3cc5747b3eff61e2)) +- **html:** sensible cloud-platform notebook html rendering ([b22ece2](https://github.com/ydataai/ydata-profiling/commit/b22ece261c0e9a74254361b6b7e121ab94abe44d)) +- ignoring config of precomputed reports ([6478c40](https://github.com/ydataai/ydata-profiling/commit/6478c4047ee871ede7f7aa76379818ee3217e7d7)) +- only compute auto correlation when no config is specified ([d5d4f58](https://github.com/ydataai/ydata-profiling/commit/d5d4f58d3b0728bed021677ffb7be14cb7f04f27)) +- remove malfunctioning hook ([e2593f5](https://github.com/ydataai/ydata-profiling/commit/e2593f5bb093117c7afb8914eafbda6e2e110782)) +- remove unused test ([2170338](https://github.com/ydataai/ydata-profiling/commit/21703385a42bf38d4306511e0f99bed9e1092991)) +- return the proper type for widgets ([4c0b358](https://github.com/ydataai/ydata-profiling/commit/4c0b358002d75139c23babc30cbc0c7b23534d92)) +- set compute default to false ([c70e491](https://github.com/ydataai/ydata-profiling/commit/c70e49136fbdf1d3fe7e6ef5b23a8adbd0567ecf)) +- solve mypy error ([9c4266e](https://github.com/ydataai/ydata-profiling/commit/9c4266eb1cb252d8008795080723598d2d151e26)) +- solve mypy issue ([e3e7788](https://github.com/ydataai/ydata-profiling/commit/e3e7788907eebcf572423b48800f848d965f5969)) +- uses colors from the specified config ([c0c556d](https://github.com/ydataai/ydata-profiling/commit/c0c556d29cc191d44fdb08fc813818709c1b0666)) +- **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) #### 🎉 Features -* add heatmap values as a table under correlations ([fc5da9e](https://github.com/ydataai/pandas-profiling/commit/fc5da9eff07e7e18c5fd2d8caa698af7aee861e2)) -* 
allow to specify the configuration for the comparison report ([ad725b0](https://github.com/ydataai/pandas-profiling/commit/ad725b0f7d3b61c2a4fafddbdbfc1451197e2c94)) -* design improvements on the correlations section ([e5cd8cf](https://github.com/ydataai/pandas-profiling/commit/e5cd8cfb4b91f22b3435f9830f516e929c4e8d32)) -* implement imbalanced warning ([ce84c81](https://github.com/ydataai/pandas-profiling/commit/ce84c81c9d2194237676a407fbe5d2461ed64eda)) -* update variables layout ([#1207](https://github.com/ydataai/pandas-profiling/issues/1207)) ([cf0e0a7](https://github.com/ydataai/pandas-profiling/commit/cf0e0a72477ce13941caf09887afe6a1c3073858)) \ No newline at end of file +- add heatmap values as a table under correlations ([fc5da9e](https://github.com/ydataai/ydata-profiling/commit/fc5da9eff07e7e18c5fd2d8caa698af7aee861e2)) +- allow to specify the configuration for the comparison report ([ad725b0](https://github.com/ydataai/ydata-profiling/commit/ad725b0f7d3b61c2a4fafddbdbfc1451197e2c94)) +- design improvements on the correlations section ([e5cd8cf](https://github.com/ydataai/ydata-profiling/commit/e5cd8cfb4b91f22b3435f9830f516e929c4e8d32)) +- implement imbalanced warning ([ce84c81](https://github.com/ydataai/ydata-profiling/commit/ce84c81c9d2194237676a407fbe5d2461ed64eda)) +- update variables layout ([#1207](https://github.com/ydataai/ydata-profiling/issues/1207)) ([cf0e0a7](https://github.com/ydataai/ydata-profiling/commit/cf0e0a72477ce13941caf09887afe6a1c3073858)) diff --git a/docsrc/source/pages/reference/changelog/v3_6_1.md b/docsrc/source/pages/reference/changelog/v3_6_1.md index ae2524d7a..e8a4b30b4 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_1.md +++ b/docsrc/source/pages/reference/changelog/v3_6_1.md @@ -1,7 +1,6 @@ ### Changelog v3.6.1 - #### πŸ› Bug fixes -* categorical var frequency plot ([6cb391f](https://github.com/ydataai/pandas-profiling/commit/6cb391fd8d26c98792e14592b4d853f9a557eab0)) -* remove ipywidgets import 
([1b8b117](https://github.com/ydataai/pandas-profiling/commit/1b8b11719cd2a1dfcde9ecd7406aa0545bf46a8e)) \ No newline at end of file +- categorical var frequency plot ([6cb391f](https://github.com/ydataai/ydata-profiling/commit/6cb391fd8d26c98792e14592b4d853f9a557eab0)) +- remove ipywidgets import ([1b8b117](https://github.com/ydataai/ydata-profiling/commit/1b8b11719cd2a1dfcde9ecd7406aa0545bf46a8e)) diff --git a/docsrc/source/pages/reference/changelog/v3_6_2.md b/docsrc/source/pages/reference/changelog/v3_6_2.md index 2ae882d80..31527c8b6 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_2.md +++ b/docsrc/source/pages/reference/changelog/v3_6_2.md @@ -1,9 +1,8 @@ ### Changelog v3.6.2 - #### 🐛 Bug fixes -* comparison alerts ([#1229](https://github.com/ydataai/pandas-profiling/issues/1229)) ([6f6baf2](https://github.com/ydataai/pandas-profiling/commit/6f6baf2db01d1802eef8ce3ebc0612a37cffa3cf)) -* comparison histogram ([#1228](https://github.com/ydataai/pandas-profiling/issues/1228)) ([09ccae6](https://github.com/ydataai/pandas-profiling/commit/09ccae66aad9a16528ac6eda755475cb76ca8228)) -* comparison report style issues ([34431a1](https://github.com/ydataai/pandas-profiling/commit/34431a13d39fe0b9a5f58a6c739120a9df0e90c0)) -* update the link for the people-example.csv ([37cd822](https://github.com/ydataai/pandas-profiling/commit/37cd822fc8fea7b3a4c9ea456fbd01be76f8391c)) +- comparison alerts ([#1229](https://github.com/ydataai/ydata-profiling/issues/1229)) ([6f6baf2](https://github.com/ydataai/ydata-profiling/commit/6f6baf2db01d1802eef8ce3ebc0612a37cffa3cf)) +- comparison histogram ([#1228](https://github.com/ydataai/ydata-profiling/issues/1228)) ([09ccae6](https://github.com/ydataai/ydata-profiling/commit/09ccae66aad9a16528ac6eda755475cb76ca8228)) +- comparison report style issues ([34431a1](https://github.com/ydataai/ydata-profiling/commit/34431a13d39fe0b9a5f58a6c739120a9df0e90c0)) +- update the link for the people-example.csv 
([37cd822](https://github.com/ydataai/ydata-profiling/commit/37cd822fc8fea7b3a4c9ea456fbd01be76f8391c)) diff --git a/docsrc/source/pages/reference/changelog/v3_6_3.md b/docsrc/source/pages/reference/changelog/v3_6_3.md index cbc38cf9c..593244a0d 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_3.md +++ b/docsrc/source/pages/reference/changelog/v3_6_3.md @@ -1,7 +1,6 @@ ### Changelog v3.6.3 - #### 🐛 Bug fixes -* issue[#1104](https://github.com/ydataai/pandas-profiling/issues/1104) empty dataframe ([#1238](https://github.com/ydataai/pandas-profiling/issues/1238)) ([db60d48](https://github.com/ydataai/pandas-profiling/commit/db60d48383acd70f6c5c09dc6b9c98e0a7dce57b)) -* time series vars excluded from some plots ([#1243](https://github.com/ydataai/pandas-profiling/issues/1243)) ([906591e](https://github.com/ydataai/pandas-profiling/commit/906591ef64da9766deefa7e1e8841080ca2a905e)) \ No newline at end of file +- issue[#1104](https://github.com/ydataai/ydata-profiling/issues/1104) empty dataframe ([#1238](https://github.com/ydataai/ydata-profiling/issues/1238)) ([db60d48](https://github.com/ydataai/ydata-profiling/commit/db60d48383acd70f6c5c09dc6b9c98e0a7dce57b)) +- time series vars excluded from some plots ([#1243](https://github.com/ydataai/ydata-profiling/issues/1243)) ([906591e](https://github.com/ydataai/ydata-profiling/commit/906591ef64da9766deefa7e1e8841080ca2a905e)) diff --git a/docsrc/source/pages/reference/history.rst b/docsrc/source/pages/reference/history.rst index 54933efb1..b1769a228 100644 --- a/docsrc/source/pages/reference/history.rst +++ b/docsrc/source/pages/reference/history.rst @@ -5,7 +5,7 @@ The ``ydata-profiling`` project became what it is today due to the work of the creators to make it successful. This page aims to highlights a bit of the development history. For the full picture, have a look at the `contributor -history `__. +history `__. 
Inception --------- diff --git a/docsrc/source/pages/support_contrib/common_issues.rst b/docsrc/source/pages/support_contrib/common_issues.rst index c076c844c..2b64a8653 100644 --- a/docsrc/source/pages/support_contrib/common_issues.rst +++ b/docsrc/source/pages/support_contrib/common_issues.rst @@ -13,11 +13,11 @@ More information on installing Python packages directly from a notebook: `'Insta Related GitHub issues: -- `[950] `_ -- `[939] `_ -- `[528] `_ -- `[485] `_ -- `[396] `_ +- `[950] `_ +- `[939] `_ +- `[528] `_ +- `[485] `_ +- `[396] `_ Conda installation defaults to v1.4.1 ------------------------------------- @@ -26,13 +26,13 @@ Some users experience that ``conda install -c conda-forge pandas-profiling`` def If creating a new environment with a fresh installation does not resolve this issue, or the current environment must be kept, installing a specific version is one alternative to try: ``conda install -c conda-forge pandas-profiling=3.2.0``. If it fails with an ``UnsatisfiableError`` that suggests dependent packages are either missing or incompatible, then further intervention is required to resolve the *environment* issue. However, *conda* error messages in this regard may be too cryptic or insufficient to pinpoint the culprit, therefore you may have to resort to an alternate means of troubleshooting e.g using the `Mamba Package Manager `_. -For an illustration of this approach see `this issue `_. +For an illustration of this approach see `this issue `_. 
Related GitHub issues: - `[22] `_ -- `[448] `_ -- `[563] `_ +- `[448] `_ +- `[563] `_ Jupyter "IntSlider(value=0)" @@ -59,4 +59,4 @@ This is due to an incompatibility in an old package version (particularly 1.4.1, Related GitHub issues: -- `[981] `_ \ No newline at end of file +- `[981] `_ \ No newline at end of file diff --git a/docsrc/source/pages/support_contrib/contribution_guidelines.rst b/docsrc/source/pages/support_contrib/contribution_guidelines.rst index 6ea028c9c..dc172b7e5 100644 --- a/docsrc/source/pages/support_contrib/contribution_guidelines.rst +++ b/docsrc/source/pages/support_contrib/contribution_guidelines.rst @@ -79,4 +79,4 @@ The community is low-threshold place to ask questions regarding development and More information ---------------- -Read more on getting involved in the `Contribution Guide available on GitHub `_. +Read more on getting involved in the `Contribution Guide available on GitHub `_. diff --git a/docsrc/source/pages/support_contrib/help_troubleshoot.rst b/docsrc/source/pages/support_contrib/help_troubleshoot.rst index 818e2a311..343967907 100644 --- a/docsrc/source/pages/support_contrib/help_troubleshoot.rst +++ b/docsrc/source/pages/support_contrib/help_troubleshoot.rst @@ -12,7 +12,7 @@ If the problem can be replicated in the new environment, then it likely is a sof Reporting a bug --------------- -To ensure the bug was not already reported by searching on Github under `Issues `_. If you're unable to find an open issue addressing the problem, `open a new one `_. If possible, use the relevant bug report templates to create the issue. +To ensure the bug was not already reported by searching on Github under `Issues `_. If you're unable to find an open issue addressing the problem, `open a new one `_. If possible, use the relevant bug report templates to create the issue. You should provide the **minimal information to reproduce this bug**. `This guide `_ can help in crafting a minimal bug report. 
Please include: diff --git a/docsrc/source/pages/use_cases/big_data.rst b/docsrc/source/pages/use_cases/big_data.rst index c1a79fb49..bcc6177b7 100644 --- a/docsrc/source/pages/use_cases/big_data.rst +++ b/docsrc/source/pages/use_cases/big_data.rst @@ -20,7 +20,7 @@ This is the recommended starting point for larger datasets. **Minimal mode** - This mode was introduced in version v2.4.0 -This configuration file can be found here: `config_minimal.yaml `_. More details on settings and configuration are available in :doc:`../advanced_usage/available_settings`. +This configuration file can be found here: `config_minimal.yaml `_. More details on settings and configuration are available in :doc:`../advanced_usage/available_settings`. Sample the dataset ------------------ @@ -94,7 +94,7 @@ Pyspark - Interactions - Improved histogram computation -Keep an eye on the `GitHub `_ page to follow the updates on the implementation of `Pyspark Dataframes support `_. +Keep an eye on the `GitHub `_ page to follow the updates on the implementation of `Pyspark Dataframes support `_. Concurrency ----------- @@ -102,4 +102,4 @@ Concurrency ``ydata-profiling`` is a project under active development. One of the highly desired features is the addition of a scalable backend such as `Modin `_ or `Dask `_. -Keep an eye on the `GitHub `_ page to follow the updates on the implementation of a concurrent and highly scalable backend. Specifically, development of a Spark backend is `currently underway `_. +Keep an eye on the `GitHub `_ page to follow the updates on the implementation of a concurrent and highly scalable backend. Specifically, development of a Spark backend is `currently underway `_. 
From 3218660008ad067ecac340ed52c9861976fffeac Mon Sep 17 00:00:00 2001 From: k3agan Date: Fri, 17 Mar 2023 18:28:04 -0700 Subject: [PATCH 3/4] docs: updated 179 broken links --- CONTRIBUTING.md | 56 ++++----- README.md | 109 +++++++++--------- docsrc/source/pages/reference/changelog.rst | 6 +- .../pages/reference/changelog/v3_5_0.md | 24 ++-- .../pages/reference/changelog/v3_5_1.md | 3 +- .../pages/reference/changelog/v3_6_0.md | 52 +++++---- .../pages/reference/changelog/v3_6_1.md | 5 +- .../pages/reference/changelog/v3_6_2.md | 9 +- .../pages/reference/changelog/v3_6_3.md | 5 +- 9 files changed, 138 insertions(+), 131 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c66025387..2fcc35c64 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,45 +1,45 @@ ## How to contribute to Pandas-Profiling -Pandas-profiling aims to ease exploratory data analysis for structured datasets, including time-series. +Pandas-profiling aims to ease exploratory data analysis for structured datasets, including time-series. Our focus is to provide users with useful and robust statistics for such datasets encountered in industry, academia and elsewhere. Pandas-profiling is open-source and stimulates contributions from passionate community users. -#### Themes to contribute +#### Themes to contribute In line with our aim, we identify the following themes: -- **Exploratory data analysis**: +- **Exploratory data analysis**: The core of the package is a dataset summarization by its main characteristics, which is complemented with warnings on data issues and visualisations. - _Suggestions for contribution_: + _Suggestions for contribution_: Extend the support of more data types (think of paths, location or GPS coordinates and ordinal data types), - text data (e.g. encoding, vocabulary size, spelling errors, language detection), - time series analysis, + text data (e.g. encoding, vocabulary size, spelling errors, language detection), + time series analysis, or even images (e.g. 
dimensions, EXIF). - + _Related_: [#7][i7], [#129][i129], [#190][i190], [#204][i204] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). -- **Stability, Performance and Restricted environment compatibility:** +- **Stability, Performance and Restricted environment compatibility:** Data exploration takes place in all kinds of conditions, on the latest machine learning platforms with enormous dataset to managed environments in large corporations. `pandas-profiling` helps analysts, researchers and engineers alike in these cases. We do this by fixing bugs, improving performance on big datasets and adding environment compatibility. - - _Suggestions for contribution (Performance)_: - Perform concurrency analysis or profile execution times and leverage the gained insights for improved performance (e.g. multiprocessing, cython, numba) or test the performance of `pandas-profiling` with [big data sets](https://www.stats.govt.nz/large-datasets/csv-files-for-download/) and corresponding commonly used data formats (such as parquet). - - _Suggestions for contribution (Stability)_: + + _Suggestions for contribution (Performance)_: + Perform concurrency analysis or profile execution times and leverage the gained insights for improved performance (e.g. multiprocessing, cython, numba) or test the performance of `pandas-profiling` with [big data sets](https://www.stats.govt.nz/large-datasets/csv-files-for-download/) and corresponding commonly used data formats (such as parquet). + + _Suggestions for contribution (Stability)_: Either review the code and add tests or watch the [issues page](https://github.com/ydataai/ydata-profiling/issues) and [Stackoverflow tag](https://stackoverflow.com/questions/tagged/pandas-profiling) to find current issues. - + _Related_: [#98][i98], [#122][i122] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). 
-- **Interaction, presentation and user experience**: +- **Interaction, presentation and user experience**: As `pandas-profiling` eases exploratory data analysis, working with the package should reflect that. Interaction and user experience plays a central role in working with the package. Working on interactive and static features is possible through the modular nature of the package: the user can configure which features to use. _Suggestions for contribution (interactivity)_: - Interactivity allows for more user friendly applications, including but not limited to on demand analysis (don't compute what you don't want to see) and interactive histograms and correlations. - This is ideal for smaller datasets, where we can compute this on-the-fly. + Interactivity allows for more user friendly applications, including but not limited to on demand analysis (don't compute what you don't want to see) and interactive histograms and correlations. + This is ideal for smaller datasets, where we can compute this on-the-fly. `ipywidgets` would be a great place to start (e.g. [widget based view](https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html)). _Suggestions for contribution (presentation)_: @@ -48,17 +48,17 @@ In line with our aim, we identify the following themes: _Related_: [#161][i161], [#175][i175], [#191][i191] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). -- **Community**: +- **Community**: The success of this package demonstrates the power of sharing and working together. You are welcome as part of this community. - + _Suggestions for contribution_: Share with us if this package is of value to you, let us know [in our community](https://discord.com/invite/mw7xjJ7b7s). We are interested in how you use `pandas-profiling` in your work. - + _Related_: [#87][i87] or [create one](https://github.com/ydataai/ydata-profiling/issues/new/choose). 
-- **Machine learning:** +- **Machine learning:** `pandas-profiling` is not a machine learning package, even though many of our users use EDA as a step prior to developing their models. Our focus lies in the exploratory data analysis. Any functionality that enables machine learning applications by more effective data profiling, is welcome. @@ -67,17 +67,18 @@ In line with our aim, we identify the following themes: #### **Did you find a bug?** -- **Ensure the bug was not already reported** by searching on Github under [Issues](https://github.com/ydataai/ydata-profiling/issues). +* **Ensure the bug was not already reported** by searching on Github under [Issues](https://github.com/ydataai/ydata-profiling/issues). -- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ydataai/ydata-profiling/issues/new/choose). - If possible, use the relevant bug report templates to create the issue. +* If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ydataai/ydata-profiling/issues/new/choose). +If possible, use the relevant bug report templates to create the issue. #### **Did you write a patch that fixes a bug?** -- Open a new Github pull request with the patch. +* Open a new Github pull request with the patch. + +* Ensure the PR description clearly describes the problem and solution. +Include the relevant issue number if applicable. -- Ensure the PR description clearly describes the problem and solution. - Include the relevant issue number if applicable. 
#### Acknowledgements @@ -98,3 +99,4 @@ See the [Contributor Graph](https://github.com/ydataai/ydata-profiling/graphs/co [i161]: https://github.com/ydataai/ydata-profiling/issues/161 [i175]: https://github.com/ydataai/ydata-profiling/issues/175 [i191]: https://github.com/ydataai/ydata-profiling/issues/191 + diff --git a/README.md b/README.md index 03a2c3b9e..61203f7f8 100644 --- a/README.md +++ b/README.md @@ -27,21 +27,21 @@ `ydata-profiling` primary goal is to provide a one-line Exploratory Data Analysis (EDA) experience in a consistent and fast solution. Like pandas `df.describe()` function, that is so handy, ydata-profiling delivers an extended analysis of a DataFrame while allowing the data analysis to be exported in different formats such as **html** and **json**. -The package outputs a simple and digested analysis of a dataset, including **time-series** and **text**. +The package outputs a simple and digested analysis of a dataset, including **time-series** and **text**. -### 🎊 New year, new face, more functionalities! +### 🎊 New year, new face, more functionalities! -Thank you for using and following `pandas-profiling` developments. Yet, we have a new exciting feature - we are now thrilled to announce +Thank you for using and following ``pandas-profiling`` developments. Yet, we have a new exciting feature - we are now thrilled to announce that Spark is now part of the Data Profiling family from version 4.0.0 onwards - -With its introduction, there was also the need for a new naming, one that will allow to decouple the concept of profiling from the Pandas Dataframes - `ydata-profiling`! - + +With its introduction, there was also the need for a new naming, one that will allow to decouple the concept of profiling from the Pandas Dataframes - `ydata-profiling`! + But fear not, `pip install pandas-profiling` will still be a valid for a while, and we will keep investing in growing the best open-source for data profiling, so you can use it for even more use cases. 
## Key features -- **Type inference**: automatic detection of columns' data types (_Categorical_, _Numerical_, _Date_, etc.) -- **Warnings**: A summary of the problems/challenges in the data that you might need to work on (_missing data_, _inaccuracies_, _skewness_, etc.) +- **Type inference**: automatic detection of columns' data types (*Categorical*, *Numerical*, *Date*, etc.) +- **Warnings**: A summary of the problems/challenges in the data that you might need to work on (*missing data*, *inaccuracies*, *skewness*, etc.) - **Univariate analysis**: including descriptive statistics (mean, median, mode, etc) and informative visualizations such as distribution histograms - **Multivariate analysis**: including correlations, a detailed analysis of missing data, duplicate rows, and visual support for variables pairwise interaction - **Time-Series**: including different statistical information relative to time dependent data such as auto-correlation and seasonality, along ACF and PACF plots. @@ -58,7 +58,7 @@ The report contains three additional sections: ### 🎁 Latest features -- Want to scale? Check the latest release with ⭐⚑[Spark support](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pypspark.html)! +- Want to scale? Check the latest release with ⭐⚑[Spark support](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pyspark.html)! - Looking for how you can do an EDA for Time-Series 🕛 ? Check [this blogpost](https://towardsdatascience.com/how-to-do-an-eda-for-time-series-cbb92b3b1913). - You want to compare 2 datasets and get a report? Check [this blogpost](https://medium.com/towards-artificial-intelligence/how-to-compare-2-dataset-with-pandas-profiling-2ae3a9d7695e) @@ -68,17 +68,16 @@ Spark support has been released, but we are always looking for an extra pair of [Check current work in progress!](https://github.com/ydataai/ydata-profiling/projects/3). 
## πŸ“ Use cases - YData-profiling can be used to deliver a variety of different use-case. The documentation includes guides, tips and tricks for tackling them: -| Use case | Description | -| --------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | -| [Comparing datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/comparing_datasets.html) | Comparing multiple version of the same dataset | -| [Profiling a Time-Series dataset](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/time_series_datasets.html) | Generating a report for a time-series dataset with a single line of code | -| [Profiling large datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/big_data.html) | Tips on how to prepare data and configure `ydata-profiling` for working with large datasets | -| [Handling sensitive data](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/sensitive_data.html) | Generating reports which are mindful about sensitive data in the input dataset | -| [Dataset metadata and data dictionaries](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/metadata.html) | Complementing the report with dataset details and column-specific data dictionaries | -| [Customizing the report's appearance](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/custom_report_appearance.html) | Changing the appearance of the report's page and of the contained visualizations | +| Use case | Description | +|----------|---------------------------------------------------------------------------------------------| +| [Comparing datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/comparing_datasets.html ) | Comparing multiple version of the same dataset | +| [Profiling a Time-Series 
dataset](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/time_series_datasets.html) | Generating a report for a time-series dataset with a single line of code | +|[Profiling large datasets](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/big_data.html ) | Tips on how to prepare data and configure `ydata-profiling` for working with large datasets | +| [Handling sensitive data](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/sensitive_data.html ) | Generating reports which are mindful about sensitive data in the input dataset | +| [Dataset metadata and data dictionaries](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/metadata.html) | Complementing the report with dataset details and column-specific data dictionaries | +| [Customizing the report's appearance](https://ydata-profiling.ydata.ai/docs/master/pages/use_cases/custom_report_appearance.html ) | Changing the appearance of the report's page and of the contained visualizations | ## ▢️ Quickstart @@ -150,27 +149,25 @@ Additional details on the CLI are available [on the documentation](https://ydata The following example reports showcase the potentialities of the package across a wide range of dataset and data types: -- [Census Income](https://ydata-profiling.ydata.ai/examples/master/census/census_report.html) (US Adult Census data relating income with other demographic properties) -- [NASA Meteorites](https://ydata-profiling.ydata.ai/examples/master/meteorites/meteorites_report.html) (comprehensive set of meteorite landing - object properties and locations) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/meteorites/meteorites_cloud.ipynb) 
[![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Fmeteorites%2Fmeteorites%5Fcloud.ipynb) -- [Titanic](https://ydata-profiling.ydata.ai/examples/master/titanic/titanic_report.html) (the "Wonderwall" of datasets) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/titanic/titanic_cloud.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Ftitanic%2Ftitanic%5Fcloud.ipynb) -- [NZA](https://ydata-profiling.ydata.ai/examples/master/nza/nza_report.html) (open data from the Dutch Healthcare Authority) -- [Stata Auto](https://ydata-profiling.ydata.ai/examples/master/stata_auto/stata_auto_report.html) (1978 Automobile data) -- [Colors](https://ydata-profiling.ydata.ai/examples/master/colors/colors_report.html) (a simple colors dataset) -- [Vektis](https://ydata-profiling.ydata.ai/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data) -- [UCI Bank Dataset](https://ydata-profiling.ydata.ai/examples/master/bank_marketing_data/uci_bank_marketing_report.html) (marketing dataset from a bank) -- [Russian Vocabulary](https://ydata-profiling.ydata.ai/examples/master/features/russian_vocabulary.html) (100 most common Russian words, showcasing unicode text analysis) -- [Website Inaccessibility](https://ydata-profiling.ydata.ai/examples/master/features/website_inaccessibility_report.html) (website accessibility analysis, showcasing support for URL data) 
-- [Orange prices](https://ydata-profiling.ydata.ai/examples/master/features/united_report.html) and -- [Coal prices](https://ydata-profiling.ydata.ai/examples/master/features/flatly_report.html) (simple pricing evolution datasets, showcasing the theming options) -- [USA Air Quality](https://github.com/ydataai/ydata-profiling/tree/master/examples/usaairquality) (Time-series air quality dataset EDA example) -- [HCC](https://github.com/ydataai/ydata-profiling/tree/master/examples/hcc) (Open dataset from healthcare, showcasing compare between two sets of data, before and after preprocessing) +* [Census Income](https://ydata-profiling.ydata.ai/examples/master/census/census_report.html) (US Adult Census data relating income with other demographic properties) +* [NASA Meteorites](https://ydata-profiling.ydata.ai/examples/master/meteorites/meteorites_report.html) (comprehensive set of meteorite landing - object properties and locations) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/meteorites/meteorites_cloud.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Fmeteorites%2Fmeteorites%5Fcloud.ipynb) +* [Titanic](https://ydata-profiling.ydata.ai/examples/master/titanic/titanic_report.html) (the "Wonderwall" of datasets) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/ydataai/pandas-profiling/blob/master/examples/titanic/titanic_cloud.ipynb) 
[![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/ydataai/pandas-profiling/master?filepath=examples%2Ftitanic%2Ftitanic%5Fcloud.ipynb) +* [NZA](https://ydata-profiling.ydata.ai/examples/master/nza/nza_report.html) (open data from the Dutch Healthcare Authority) +* [Stata Auto](https://ydata-profiling.ydata.ai/examples/master/stata_auto/stata_auto_report.html) (1978 Automobile data) +* [Colors](https://ydata-profiling.ydata.ai/examples/master/colors/colors_report.html) (a simple colors dataset) +* [Vektis](https://ydata-profiling.ydata.ai/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data) +* [UCI Bank Dataset](https://ydata-profiling.ydata.ai/examples/master/bank_marketing_data/uci_bank_marketing_report.html) (marketing dataset from a bank) +* [Russian Vocabulary](https://ydata-profiling.ydata.ai/examples/master/features/russian_vocabulary.html) (100 most common Russian words, showcasing unicode text analysis) +* [Website Inaccessibility](https://ydata-profiling.ydata.ai/examples/master/features/website_inaccessibility_report.html) (website accessibility analysis, showcasing support for URL data) +* [Orange prices](https://ydata-profiling.ydata.ai/examples/master/features/united_report.html) and +* [Coal prices](https://ydata-profiling.ydata.ai/examples/master/features/flatly_report.html) (simple pricing evolution datasets, showcasing the theming options) +* [USA Air Quality](https://github.com/ydataai/ydata-profiling/tree/master/examples/usaairquality) (Time-series air quality dataset EDA example) +* [HCC](https://github.com/ydataai/ydata-profiling/tree/master/examples/hcc) (Open dataset from healthcare, showcasing compare between two sets of data, before and after preprocessing) ## πŸ› οΈ Installation - Additional details, including information about widget support, are available [on the 
documentation](https://ydata-profiling.ydata.ai/docs/master/pages/getting_started/installation.html). ### Using pip - [![PyPi Downloads](https://pepy.tech/badge/ydata-profiling)](https://pepy.tech/project/ydata-profiling) [![PyPi Monthly Downloads](https://pepy.tech/badge/pandas-profiling/month)](https://pepy.tech/project/ydata-profiling/month) [![PyPi Version](https://badge.fury.io/py/ydata-profiling.svg)](https://pypi.org/project/ydata-profiling/) @@ -185,9 +182,9 @@ pip install -U ydata-profiling The package declares "extras", sets of additional dependencies. -- `[notebook]`: support for rendering the report in Jupyter notebook widgets. -- `[unicode]`: support for more detailed Unicode analysis, at the expense of additional disk space. -- `[pyspark]`: support for pyspark for big dataset analysis +* `[notebook]`: support for rendering the report in Jupyter notebook widgets. +* `[unicode]`: support for more detailed Unicode analysis, at the expense of additional disk space. +* `[pyspark]`: support for pyspark for big dataset analysis Install these with e.g. @@ -195,10 +192,11 @@ Install these with e.g. pip install -U ydata-profiling[notebook,unicode,pyspark] ``` -### Using conda +### Using conda [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) -[![Conda Version](https://img.shields.io/conda/vn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) +[![Conda Version](https://img.shields.io/conda/vn/conda-forge/pandas-profiling.svg)](https://anaconda.org/conda-forge/pandas-profiling) + You can install using the `conda` package manager by running: @@ -216,32 +214,31 @@ Install it by navigating to the proper directory and running: pip install -e . ``` -The profiling report is written in HTML and CSS, which means a modern browser is required. +The profiling report is written in HTML and CSS, which means a modern browser is required. 
You need [Python 3](https://python3statement.org/) to run the package. Other dependencies can be found in the requirements files: -| Filename | Requirements | -| ----------------------------------------------------------------------------------------------------- | ----------------------------- | -| [requirements.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements.txt) | Package requirements | -| [requirements-dev.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-dev.txt) | Requirements for development | -| [requirements-test.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-test.txt) | Requirements for testing | -| [setup.py](https://github.com/ydataai/ydata-profiling/blob/master/setup.py) | Requirements for widgets etc. | +| Filename | Requirements| +|----------|-------------| +| [requirements.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements.txt) | Package requirements| +| [requirements-dev.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-dev.txt) | Requirements for development| +| [requirements-test.txt](https://github.com/ydataai/ydata-profiling/blob/master/requirements-test.txt) | Requirements for testing| +| [setup.py](https://github.com/ydataai/ydata-profiling/blob/master/setup.py) | Requirements for widgets etc. 
| ## πŸ”— Integrations -To maximize its usefulness in real world contexts, `pandas-profiling` has a set of implicit and explicit integrations with a variety of other actors in the Data Science ecosystem: +To maximize its usefulness in real world contexts, `pandas-profiling` has a set of implicit and explicit integrations with a variety of other actors in the Data Science ecosystem: -| Integration type | Description | -| --------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [Other DataFrame libraries](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/other_dataframe_libraries.html) | How to compute the profiling of data stored in libraries other than pandas | -| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | -| [Interactive applications](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/data_apps.html) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | -| [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | -| [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `pandas-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), [Google 
Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | -| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `pandas-profiling` directly from integrated development environments such as [PyCharm](https://www.jetbrains.com/pycharm/) | +| Integration type | Description | +|---|---| +| [Other DataFrame libraries](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/other_dataframe_libraries.html) | How to compute the profiling of data stored in libraries other than pandas | +| [Great Expectations](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/great_expectations.html) | Generating [Great Expectations](https://greatexpectations.io) expectations suites directly from a profiling report | +| [Interactive applications](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/data_apps.html) | Embedding profiling reports in [Streamlit](http://streamlit.io), [Dash](http://dash.plotly.com) or [Panel](https://panel.holoviz.org) applications | +| [Pipelines](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/pipelines.html) | Integration with DAG workflow execution tools like [Airflow](https://airflow.apache.org) or [Kedro](https://kedro.org) | +| [Cloud services](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/cloud_services.html) | Using `pandas-profiling` in hosted computation services like [Lambda](https://lambdalabs.com), [Google Cloud](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/retail/propensity-model/bqml/bqml_kfp_retail_propensity_to_purchase.ipynb) or [Kaggle](https://www.kaggle.com/code) | +| [IDEs](https://ydata-profiling.ydata.ai/docs/master/pages/integrations/ides.html) | Using `pandas-profiling` directly from integrated development environments such as 
[PyCharm](https://www.jetbrains.com/pycharm/) | ## πŸ™‹ Support - Need help? Want to share a perspective? Report a bug? Ideas for collaborations? Reach out via the following channels: - [Stack Overflow](https://stackoverflow.com/questions/tagged/pandas-profiling+or+ydata-profiling): ideal for asking questions on how to use the package diff --git a/docsrc/source/pages/reference/changelog.rst b/docsrc/source/pages/reference/changelog.rst index 4ded8feec..32fc4f850 100644 --- a/docsrc/source/pages/reference/changelog.rst +++ b/docsrc/source/pages/reference/changelog.rst @@ -2,9 +2,9 @@ Changelog ========= -.. include:: changelog/v4_1_0.md - :parser: myst_parser.sphinx_ - +.. include:: changelog/v4_1_0.md + :parser: myst_parser.sphinx_ + .. include:: changelog/v4_0_0.md :parser: myst_parser.sphinx_ diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md index 814a94bac..2a9b93bfa 100644 --- a/docsrc/source/pages/reference/changelog/v3_5_0.md +++ b/docsrc/source/pages/reference/changelog/v3_5_0.md @@ -1,18 +1,20 @@ ### Changelog v3.5.0 + #### πŸ› Bug fixes -- change context managed backend ([#1149](https://github.com/ydataai/ydata-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/ydata-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1)) -- dataset names on comparison report ([#1159](https://github.com/ydataai/ydata-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/ydata-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18)) -- duplicate key in test dict ([#1126](https://github.com/ydataai/ydata-profiling/issues/1126)) ([d19affe](https://github.com/ydataai/ydata-profiling/commit/d19affe15a4e3063af7187ca5fa81f1bf75ce648)) -- improve description and correct plot for β€˜auto’ correlation ([#1119](https://github.com/ydataai/ydata-profiling/issues/1119)) ([2617b92](https://github.com/ydataai/ydata-profiling/commit/2617b92d08ed87546c80e0cc01cd475d1e60ec56)) -- remove correlation 
calculation for constants ([#1152](https://github.com/ydataai/ydata-profiling/issues/1152)) ([1ed2bc0](https://github.com/ydataai/ydata-profiling/commit/1ed2bc0702f504592ed211097469405a5061a857)) -- time series render format ([#1157](https://github.com/ydataai/ydata-profiling/issues/1157)) ([39ca8ce](https://github.com/ydataai/ydata-profiling/commit/39ca8ce7d4ed2ad0ebb78db5d5f26d3ace08753a)) -- update config files to only calculate 'auto' correlation ([#1158](https://github.com/ydataai/ydata-profiling/issues/1158)) ([34cf73d](https://github.com/ydataai/ydata-profiling/commit/34cf73dadaea08e44e741f99fa0a10c322c86109)) -- update repository links ([#1141](https://github.com/ydataai/ydata-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/ydata-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5)) +* change context managed backend ([#1149](https://github.com/ydataai/ydata-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/ydata-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1)) +* dataset names on comparison report ([#1159](https://github.com/ydataai/ydata-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/ydata-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18)) +* duplicate key in test dict ([#1126](https://github.com/ydataai/ydata-profiling/issues/1126)) ([d19affe](https://github.com/ydataai/ydata-profiling/commit/d19affe15a4e3063af7187ca5fa81f1bf75ce648)) +* improve description and correct plot for β€˜auto’ correlation ([#1119](https://github.com/ydataai/ydata-profiling/issues/1119)) ([2617b92](https://github.com/ydataai/ydata-profiling/commit/2617b92d08ed87546c80e0cc01cd475d1e60ec56)) +* remove correlation calculation for constants ([#1152](https://github.com/ydataai/ydata-profiling/issues/1152)) ([1ed2bc0](https://github.com/ydataai/ydata-profiling/commit/1ed2bc0702f504592ed211097469405a5061a857)) +* time series render format ([#1157](https://github.com/ydataai/ydata-profiling/issues/1157)) 
([39ca8ce](https://github.com/ydataai/ydata-profiling/commit/39ca8ce7d4ed2ad0ebb78db5d5f26d3ace08753a)) +* update config files to only calculate 'auto' correlation ([#1158](https://github.com/ydataai/ydata-profiling/issues/1158)) ([34cf73d](https://github.com/ydataai/ydata-profiling/commit/34cf73dadaea08e44e741f99fa0a10c322c86109)) +* update repository links ([#1141](https://github.com/ydataai/ydata-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/ydata-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5)) + #### πŸŽ‰ Features -- add typechecking to profile report ([#1139](https://github.com/ydataai/ydata-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/ydata-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61)) -- report comparison example ([#1160](https://github.com/ydataai/ydata-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/ydata-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73)) -- report comparisons ([#1069](https://github.com/ydataai/ydata-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/ydata-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/ydata-profiling/issues/1137) [#1136](https://github.com/ydataai/ydata-profiling/issues/1136) [#1143](https://github.com/ydataai/ydata-profiling/issues/1143) [#1148](https://github.com/ydataai/ydata-profiling/issues/1148) [#1150](https://github.com/ydataai/ydata-profiling/issues/1150) +* add typechecking to profile report ([#1139](https://github.com/ydataai/ydata-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/ydata-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61)) +* report comparison example ([#1160](https://github.com/ydataai/ydata-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/ydata-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73)) +* report comparisons ([#1069](https://github.com/ydataai/ydata-profiling/issues/1069)) 
([70ee5c7](https://github.com/ydataai/ydata-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/ydata-profiling/issues/1137) [#1136](https://github.com/ydataai/ydata-profiling/issues/1136) [#1143](https://github.com/ydataai/ydata-profiling/issues/1143) [#1148](https://github.com/ydataai/ydata-profiling/issues/1148) [#1150](https://github.com/ydataai/ydata-profiling/issues/1150) diff --git a/docsrc/source/pages/reference/changelog/v3_5_1.md b/docsrc/source/pages/reference/changelog/v3_5_1.md index 881ad761f..5acc405d1 100644 --- a/docsrc/source/pages/reference/changelog/v3_5_1.md +++ b/docsrc/source/pages/reference/changelog/v3_5_1.md @@ -1,5 +1,6 @@ ### Changelog v3.5.1 + #### πŸ› Bug fixes -- **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) +* **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) \ No newline at end of file diff --git a/docsrc/source/pages/reference/changelog/v3_6_0.md b/docsrc/source/pages/reference/changelog/v3_6_0.md index 925d9e3d5..6eea1fac3 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_0.md +++ b/docsrc/source/pages/reference/changelog/v3_6_0.md @@ -1,32 +1,34 @@ # [3.6.0](https://github.com/ydataai/ydata-profiling/compare/v3.5.0...v3.6.0) (2022-12-21) + #### πŸ› Bug fixes -- add css to cope with large tables ([7f42f87](https://github.com/ydataai/ydata-profiling/commit/7f42f87cedd06694fe83241416e1fa21327b8c97)) -- adjust categoricals layout 
([f0bb45a](https://github.com/ydataai/ydata-profiling/commit/f0bb45a2a2d89b5c6e77fd20939e069979b2b948)) -- categorical data not being obscured in the common values plot ([40236bc](https://github.com/ydataai/ydata-profiling/commit/40236bc67619a8aadeae797920c6238616169641)) -- compare report ignoring config parameter ([3d60556](https://github.com/ydataai/ydata-profiling/commit/3d6055675579d72a5ddf34c4c85e94befb403e72)) -- compare report warnings always showing the last alert type ([6b3c13d](https://github.com/ydataai/ydata-profiling/commit/6b3c13dd33489c8a895b2db1854b23a7edd3b948)) -- comparison fails when duplicates are disable ([#1208](https://github.com/ydataai/ydata-profiling/issues/1208)) ([6d19620](https://github.com/ydataai/ydata-profiling/commit/6d1962044d5bcf634266998551328bd3cdeb354c)) -- do no raise exception for percentage formatter ([3ea626d](https://github.com/ydataai/ydata-profiling/commit/3ea626de3d839a55fb0fac9dc7a5fa1da18ba037)) -- enforce recomputation of description sets ([a9fd1c8](https://github.com/ydataai/ydata-profiling/commit/a9fd1c845511679a18c87a9566d343ea945e9f16)) -- error comparing only one precomputed profile ([00646cd](https://github.com/ydataai/ydata-profiling/commit/00646cde15e0fb0dad29e4bd3cc5747b3eff61e2)) -- **html:** sensible cloud-platform notebook html rendering ([b22ece2](https://github.com/ydataai/ydata-profiling/commit/b22ece261c0e9a74254361b6b7e121ab94abe44d)) -- ignoring config of precomputed reports ([6478c40](https://github.com/ydataai/ydata-profiling/commit/6478c4047ee871ede7f7aa76379818ee3217e7d7)) -- only compute auto correlation when no config is specified ([d5d4f58](https://github.com/ydataai/ydata-profiling/commit/d5d4f58d3b0728bed021677ffb7be14cb7f04f27)) -- remove malfunctioning hook ([e2593f5](https://github.com/ydataai/ydata-profiling/commit/e2593f5bb093117c7afb8914eafbda6e2e110782)) -- remove unused test ([2170338](https://github.com/ydataai/ydata-profiling/commit/21703385a42bf38d4306511e0f99bed9e1092991)) -- 
return the proper type for widgets ([4c0b358](https://github.com/ydataai/ydata-profiling/commit/4c0b358002d75139c23babc30cbc0c7b23534d92)) -- set compute default to false ([c70e491](https://github.com/ydataai/ydata-profiling/commit/c70e49136fbdf1d3fe7e6ef5b23a8adbd0567ecf)) -- solve mypy error ([9c4266e](https://github.com/ydataai/ydata-profiling/commit/9c4266eb1cb252d8008795080723598d2d151e26)) -- solve mypy issue ([e3e7788](https://github.com/ydataai/ydata-profiling/commit/e3e7788907eebcf572423b48800f848d965f5969)) -- uses colors from the specified config ([c0c556d](https://github.com/ydataai/ydata-profiling/commit/c0c556d29cc191d44fdb08fc813818709c1b0666)) -- **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) ([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) +* add css to cope with large tables ([7f42f87](https://github.com/ydataai/ydata-profiling/commit/7f42f87cedd06694fe83241416e1fa21327b8c97)) +* adjust categoricals layout ([f0bb45a](https://github.com/ydataai/ydata-profiling/commit/f0bb45a2a2d89b5c6e77fd20939e069979b2b948)) +* categorical data not being obscured in the common values plot ([40236bc](https://github.com/ydataai/ydata-profiling/commit/40236bc67619a8aadeae797920c6238616169641)) +* compare report ignoring config parameter ([3d60556](https://github.com/ydataai/ydata-profiling/commit/3d6055675579d72a5ddf34c4c85e94befb403e72)) +* compare report warnings always showing the last alert type ([6b3c13d](https://github.com/ydataai/ydata-profiling/commit/6b3c13dd33489c8a895b2db1854b23a7edd3b948)) +* comparison fails when duplicates are disable ([#1208](https://github.com/ydataai/ydata-profiling/issues/1208)) ([6d19620](https://github.com/ydataai/ydata-profiling/commit/6d1962044d5bcf634266998551328bd3cdeb354c)) +* do no raise exception for percentage formatter 
([3ea626d](https://github.com/ydataai/ydata-profiling/commit/3ea626de3d839a55fb0fac9dc7a5fa1da18ba037)) +* enforce recomputation of description sets ([a9fd1c8](https://github.com/ydataai/ydata-profiling/commit/a9fd1c845511679a18c87a9566d343ea945e9f16)) +* error comparing only one precomputed profile ([00646cd](https://github.com/ydataai/ydata-profiling/commit/00646cde15e0fb0dad29e4bd3cc5747b3eff61e2)) +* **html:** sensible cloud-platform notebook html rendering ([b22ece2](https://github.com/ydataai/ydata-profiling/commit/b22ece261c0e9a74254361b6b7e121ab94abe44d)) +* ignoring config of precomputed reports ([6478c40](https://github.com/ydataai/ydata-profiling/commit/6478c4047ee871ede7f7aa76379818ee3217e7d7)) +* only compute auto correlation when no config is specified ([d5d4f58](https://github.com/ydataai/ydata-profiling/commit/d5d4f58d3b0728bed021677ffb7be14cb7f04f27)) +* remove malfunctioning hook ([e2593f5](https://github.com/ydataai/ydata-profiling/commit/e2593f5bb093117c7afb8914eafbda6e2e110782)) +* remove unused test ([2170338](https://github.com/ydataai/ydata-profiling/commit/21703385a42bf38d4306511e0f99bed9e1092991)) +* return the proper type for widgets ([4c0b358](https://github.com/ydataai/ydata-profiling/commit/4c0b358002d75139c23babc30cbc0c7b23534d92)) +* set compute default to false ([c70e491](https://github.com/ydataai/ydata-profiling/commit/c70e49136fbdf1d3fe7e6ef5b23a8adbd0567ecf)) +* solve mypy error ([9c4266e](https://github.com/ydataai/ydata-profiling/commit/9c4266eb1cb252d8008795080723598d2d151e26)) +* solve mypy issue ([e3e7788](https://github.com/ydataai/ydata-profiling/commit/e3e7788907eebcf572423b48800f848d965f5969)) +* uses colors from the specified config ([c0c556d](https://github.com/ydataai/ydata-profiling/commit/c0c556d29cc191d44fdb08fc813818709c1b0666)) +* **utils:** use 'urllib.request' instead of 'requests' ([#1177](https://github.com/ydataai/ydata-profiling/issues/1177)) 
([e4d020b](https://github.com/ydataai/ydata-profiling/commit/e4d020b873b67845a329517e42620ed96545d60e)), closes [#1168](https://github.com/ydataai/ydata-profiling/issues/1168) + #### 🎉 Features -- add heatmap values as a table under correlations ([fc5da9e](https://github.com/ydataai/ydata-profiling/commit/fc5da9eff07e7e18c5fd2d8caa698af7aee861e2)) -- allow to specify the configuration for the comparison report ([ad725b0](https://github.com/ydataai/ydata-profiling/commit/ad725b0f7d3b61c2a4fafddbdbfc1451197e2c94)) -- design improvements on the correlations section ([e5cd8cf](https://github.com/ydataai/ydata-profiling/commit/e5cd8cfb4b91f22b3435f9830f516e929c4e8d32)) -- implement imbalanced warning ([ce84c81](https://github.com/ydataai/ydata-profiling/commit/ce84c81c9d2194237676a407fbe5d2461ed64eda)) -- update variables layout ([#1207](https://github.com/ydataai/ydata-profiling/issues/1207)) ([cf0e0a7](https://github.com/ydataai/ydata-profiling/commit/cf0e0a72477ce13941caf09887afe6a1c3073858)) +* add heatmap values as a table under correlations ([fc5da9e](https://github.com/ydataai/ydata-profiling/commit/fc5da9eff07e7e18c5fd2d8caa698af7aee861e2)) +* allow to specify the configuration for the comparison report ([ad725b0](https://github.com/ydataai/ydata-profiling/commit/ad725b0f7d3b61c2a4fafddbdbfc1451197e2c94)) +* design improvements on the correlations section ([e5cd8cf](https://github.com/ydataai/ydata-profiling/commit/e5cd8cfb4b91f22b3435f9830f516e929c4e8d32)) +* implement imbalanced warning ([ce84c81](https://github.com/ydataai/ydata-profiling/commit/ce84c81c9d2194237676a407fbe5d2461ed64eda)) +* update variables layout ([#1207](https://github.com/ydataai/ydata-profiling/issues/1207)) ([cf0e0a7](https://github.com/ydataai/ydata-profiling/commit/cf0e0a72477ce13941caf09887afe6a1c3073858)) \ No newline at end of file diff --git a/docsrc/source/pages/reference/changelog/v3_6_1.md b/docsrc/source/pages/reference/changelog/v3_6_1.md index e8a4b30b4..1949eb4b3 100644
--- a/docsrc/source/pages/reference/changelog/v3_6_1.md +++ b/docsrc/source/pages/reference/changelog/v3_6_1.md @@ -1,6 +1,7 @@ ### Changelog v3.6.1 + #### 🐛 Bug fixes -- categorical var frequency plot ([6cb391f](https://github.com/ydataai/ydata-profiling/commit/6cb391fd8d26c98792e14592b4d853f9a557eab0)) -- remove ipywidgets import ([1b8b117](https://github.com/ydataai/ydata-profiling/commit/1b8b11719cd2a1dfcde9ecd7406aa0545bf46a8e)) +* categorical var frequency plot ([6cb391f](https://github.com/ydataai/ydata-profiling/commit/6cb391fd8d26c98792e14592b4d853f9a557eab0)) +* remove ipywidgets import ([1b8b117](https://github.com/ydataai/ydata-profiling/commit/1b8b11719cd2a1dfcde9ecd7406aa0545bf46a8e)) \ No newline at end of file diff --git a/docsrc/source/pages/reference/changelog/v3_6_2.md b/docsrc/source/pages/reference/changelog/v3_6_2.md index 31527c8b6..b357b8275 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_2.md +++ b/docsrc/source/pages/reference/changelog/v3_6_2.md @@ -1,8 +1,9 @@ ### Changelog v3.6.2 + #### 🐛 Bug fixes -- comparison alerts ([#1229](https://github.com/ydataai/ydata-profiling/issues/1229)) ([6f6baf2](https://github.com/ydataai/ydata-profiling/commit/6f6baf2db01d1802eef8ce3ebc0612a37cffa3cf)) -- comparison histogram ([#1228](https://github.com/ydataai/ydata-profiling/issues/1228)) ([09ccae6](https://github.com/ydataai/ydata-profiling/commit/09ccae66aad9a16528ac6eda755475cb76ca8228)) -- comparison report style issues ([34431a1](https://github.com/ydataai/ydata-profiling/commit/34431a13d39fe0b9a5f58a6c739120a9df0e90c0)) -- update the link for the people-example.csv ([37cd822](https://github.com/ydataai/ydata-profiling/commit/37cd822fc8fea7b3a4c9ea456fbd01be76f8391c)) +* comparison alerts ([#1229](https://github.com/ydataai/ydata-profiling/issues/1229)) ([6f6baf2](https://github.com/ydataai/ydata-profiling/commit/6f6baf2db01d1802eef8ce3ebc0612a37cffa3cf)) +* comparison histogram
([#1228](https://github.com/ydataai/ydata-profiling/issues/1228)) ([09ccae6](https://github.com/ydataai/ydata-profiling/commit/09ccae66aad9a16528ac6eda755475cb76ca8228)) +* comparison report style issues ([34431a1](https://github.com/ydataai/ydata-profiling/commit/34431a13d39fe0b9a5f58a6c739120a9df0e90c0)) +* update the link for the people-example.csv ([37cd822](https://github.com/ydataai/ydata-profiling/commit/37cd822fc8fea7b3a4c9ea456fbd01be76f8391c)) diff --git a/docsrc/source/pages/reference/changelog/v3_6_3.md b/docsrc/source/pages/reference/changelog/v3_6_3.md index 593244a0d..982a42028 100644 --- a/docsrc/source/pages/reference/changelog/v3_6_3.md +++ b/docsrc/source/pages/reference/changelog/v3_6_3.md @@ -1,6 +1,7 @@ ### Changelog v3.6.3 + #### 🐛 Bug fixes -- issue[#1104](https://github.com/ydataai/ydata-profiling/issues/1104) empty dataframe ([#1238](https://github.com/ydataai/ydata-profiling/issues/1238)) ([db60d48](https://github.com/ydataai/ydata-profiling/commit/db60d48383acd70f6c5c09dc6b9c98e0a7dce57b)) -- time series vars excluded from some plots ([#1243](https://github.com/ydataai/ydata-profiling/issues/1243)) ([906591e](https://github.com/ydataai/ydata-profiling/commit/906591ef64da9766deefa7e1e8841080ca2a905e)) +* issue[#1104](https://github.com/ydataai/ydata-profiling/issues/1104) empty dataframe ([#1238](https://github.com/ydataai/ydata-profiling/issues/1238)) ([db60d48](https://github.com/ydataai/ydata-profiling/commit/db60d48383acd70f6c5c09dc6b9c98e0a7dce57b)) +* time series vars excluded from some plots ([#1243](https://github.com/ydataai/ydata-profiling/issues/1243)) ([906591e](https://github.com/ydataai/ydata-profiling/commit/906591ef64da9766deefa7e1e8841080ca2a905e)) \ No newline at end of file From 69547a763480291f6c9d96fb4cada0a55b9616d0 Mon Sep 17 00:00:00 2001 From: Ricardo Pereira Date: Mon, 20 Mar 2023 11:00:56 +0000 Subject: [PATCH 4/4] fix: comparing issues when pyspark is installed and when the report summaries are used (#1289)
--- src/ydata_profiling/compare_reports.py | 97 +++++++++++++++++++------- src/ydata_profiling/profile_report.py | 6 +- tests/unit/test_comparison.py | 16 +++++ 3 files changed, 90 insertions(+), 29 deletions(-) diff --git a/src/ydata_profiling/compare_reports.py b/src/ydata_profiling/compare_reports.py index e108d5ea5..bfe15f3dd 100644 --- a/src/ydata_profiling/compare_reports.py +++ b/src/ydata_profiling/compare_reports.py @@ -1,3 +1,4 @@ +import json import warnings from typing import Any, List, Optional, Tuple, Union @@ -154,7 +155,7 @@ def _compare_dataset_description_preprocess( def validate_reports( - reports: List[ProfileReport], + reports: Union[List[ProfileReport], List[dict]], configs: List[dict] ) -> None: """Validate if the reports are comparable. @@ -171,17 +172,22 @@ def validate_reports( "Reports may be produced, but may yield unexpected formatting." ) - report_types = [r.config.vars.timeseries.active for r in reports] + report_types = [c.vars.timeseries.active for c in configs] # type: ignore if all(report_types) != any(report_types): raise ValueError( "Comparison between timeseries and tabular reports is not supported." ) - is_df_available = [r.df is not None for r in reports] - if not all(is_df_available): - raise ValueError("Reports where not initialized with a DataFrame.") + if isinstance(reports[0], ProfileReport): + is_df_available = [r.df is not None for r in reports] # type: ignore + if not all(is_df_available): + raise ValueError("Reports where not initialized with a DataFrame.") + + if isinstance(reports[0], ProfileReport): + features = [set(r.df.columns) for r in reports] # type: ignore + else: + features = [set(r["variables"].keys()) for r in reports] # type: ignore - features = [set(r.df.columns) for r in reports] # type: ignore if not all(features[0] == x for x in features): warnings.warn( "The datasets being profiled have a different set of columns. 
" @@ -250,7 +256,7 @@ def _create_placehoder_alerts(report_alerts: tuple) -> tuple: def compare( - reports: List[ProfileReport], + reports: Union[List[ProfileReport], List[dict]], config: Optional[Settings] = None, compute: bool = False, ) -> ProfileReport: @@ -265,38 +271,77 @@ def compare( recommended in cases where the reports were created using different settings """ - validate_reports(reports) - base_features = reports[0].df.columns # type: ignore - for report in reports[1:]: - cols_2_compare = [col for col in base_features if col in report.df.columns] # type: ignore - report.df = report.df.loc[:, cols_2_compare] # type: ignore - reports = [r for r in reports if not r.df.empty] # type: ignore - if len(reports) == 1: - return reports[0] + if len(reports) == 0: + raise ValueError("No reports available for comparison.") + + report_dtypes = [type(r) for r in reports] + if len(set(report_dtypes)) > 1: + raise TypeError( + "The input must have the same data type for all reports. Comparing ProfileReport objects to summaries obtained from the get_description() method is not supported." 
+ ) + + if isinstance(reports[0], ProfileReport): + all_configs = [r.config for r in reports] # type: ignore + else: + configs_str = [ + json.loads(r["package"]["ydata_profiling_config"]) for r in reports # type: ignore + ] + all_configs = [] + for c_str in configs_str: + c_setting = Settings() + c_setting = c_setting.update(c_str) + all_configs.append(c_setting) + + validate_reports(reports=reports, configs=all_configs) + + if isinstance(reports[0], ProfileReport): + base_features = reports[0].df.columns # type: ignore + for report in reports[1:]: + cols_2_compare = [col for col in base_features if col in report.df.columns] # type: ignore + report.df = report.df.loc[:, cols_2_compare] # type: ignore + reports = [r for r in reports if not r.df.empty] # type: ignore + if len(reports) == 1: + return reports[0] # type: ignore + else: + base_features = list(reports[0]["variables"].keys()) + non_empty_reports = 0 + for report in reports[1:]: + cols_2_compare = [ + col for col in base_features if col in list(report["variables"].keys()) # type: ignore + ] + if len(cols_2_compare) > 0: + non_empty_reports += 1 + if non_empty_reports == 0: + profile = ProfileReport(None, config=all_configs[0]) + profile._description_set = reports[0] + return profile _config = None if config is None: - _config = reports[0].config.copy() + _config = all_configs[0].copy() else: _config = config.copy() - for report in reports: - tsmode = report.config.vars.timeseries.active - title = report.config.title - report.config = config.copy() - report.config.title = title - report.config.vars.timeseries.active = tsmode - if compute: - report._description_set = None + if isinstance(reports[0], ProfileReport): + for report in reports: + tsmode = report.config.vars.timeseries.active # type: ignore + title = report.config.title # type: ignore + report.config = config.copy() # type: ignore + report.config.title = title # type: ignore + report.config.vars.timeseries.active = tsmode # type: ignore + if 
compute: + report._description_set = None # type: ignore if all(isinstance(report, ProfileReport) for report in reports): # Type ignore is needed as mypy does not pick up on the type narrowing # Consider using TypeGuard (3.10): https://docs.python.org/3/library/typing.html#typing.TypeGuard - _update_titles(reports) + _update_titles(reports) # type: ignore labels, descriptions = _compare_profile_report_preprocess(reports, _config) # type: ignore elif all(isinstance(report, dict) for report in reports): labels, descriptions = _compare_dataset_description_preprocess(reports) # type: ignore else: - raise TypeError("") + raise TypeError( + "The input must have the same data type for all reports. Comparing ProfileReport objects to summaries obtained from the get_description() method is not supported." + ) _config.html.style._labels = labels diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 8e707155b..295eca45c 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -151,7 +151,7 @@ def __init__( @staticmethod def __validate_inputs( - df: Union[pd.DataFrame, sDataFrame], + df: Optional[Union[pd.DataFrame, sDataFrame]], minimal: bool, tsmode: bool, config_file: Optional[Union[Path, str]], @@ -188,8 +188,8 @@ def __validate_inputs( @staticmethod def __initialize_dataframe( - df: Union[pd.DataFrame, sDataFrame], report_config: Settings - ) -> Union[pd.DataFrame, sDataFrame]: + df: Optional[Union[pd.DataFrame, sDataFrame]], report_config: Settings + ) -> Optional[Union[pd.DataFrame, sDataFrame]]: if ( df is not None and isinstance(df, pd.DataFrame) diff --git a/tests/unit/test_comparison.py b/tests/unit/test_comparison.py index fa67b8151..7adb012a9 100644 --- a/tests/unit/test_comparison.py +++ b/tests/unit/test_comparison.py @@ -29,6 +29,14 @@ def test_compare_two(reports): assert len(result_description["table"]["n"]) == 2 +def test_compare_two_description(reports): + args = 
[r.get_description() for r in reports[:2]] + assert len(args) == 2 + result = compare(args) + result_description = result.get_description() + assert len(result_description["table"]["n"]) == 2 + + def test_compare_three(reports): args = reports[:3] assert len(args) == 3 @@ -37,6 +45,14 @@ def test_compare_three(reports): assert len(result_description["table"]["n"]) == 3 +def test_compare_three_description(reports): + args = [r.get_description() for r in reports[:3]] + assert len(args) == 3 + result = compare(args) + result_description = result.get_description() + assert len(result_description["table"]["n"]) == 3 + + def test_title(): assert _compare_title(["a"]) == "a" assert _compare_title(["a", "b"]) == "Comparing a and b"