Merge pull request #762 from pandas-profiling/develop

v2.12.0 release
ydataai · May 5, 2021 · 662fdad · 662fdad
2 parents 5756097 + 1d4c9b5
commit 662fdad
Show file tree

Hide file tree

Showing 54 changed files with 668 additions and 471 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,42 @@
+name: Performance Benchmarks
+
+on:
+  push:
+    branches:
+      - master
+      - develop
+
+jobs:
+  benchmark:
+    name: ${{ matrix.os }} x ${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest ] #, macos-latest, windows-latest ]
+        python: ['3.8']
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Run benchmark
+        run: |
+          pip install --upgrade pip setuptools wheel
+          pip install -r requirements.txt
+          pip install -r requirements-test.txt
+      - run: make install
+      - run: pytest tests/benchmarks/bench.py --benchmark-min-rounds 10 --benchmark-warmup "on" --benchmark-json benchmark.json
+      - name: Store benchmark result
+        uses: rhysd/github-action-benchmark@v1
+        with:
+          name: Pandas Profiling Benchmarks
+          tool: 'pytest'
+          output-file-path: benchmark.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+
+          comment-on-alert: true
+          alert-comment-cc-users: '@sbrugman'
diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml
@@ -0,0 +1,11 @@
+name: Lint Commit Messages
+on: [pull_request]
+
+jobs:
+  commitlint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: wagoid/commitlint-github-action@v3
diff --git a/.github/workflows/ci.yml → .github/workflows/release.yml b/.github/workflows/ci.yml → .github/workflows/release.yml
@@ -1,4 +1,4 @@
-name: CI
+name: Release CI
 
 on:
   push:

diff --git a/.github/workflows/ci_test.yml → .github/workflows/tests.yml b/.github/workflows/ci_test.yml → .github/workflows/tests.yml
@@ -1,9 +1,9 @@
-name: Tests and Coverage
+name: CI
 
 on: push
 
 jobs:
-  build:
+  test:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
@@ -33,7 +33,53 @@ jobs:
             pandas: "pandas>1.1"
             numpy: "numpy"
 
-    name: python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
+    name: Tests | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+      - uses: actions/cache@v2
+        if: startsWith(runner.os, 'Linux')
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.pandas }}-pip-
+
+      - uses: actions/cache@v2
+        if: startsWith(runner.os, 'macOS')
+        with:
+          path: ~/Library/Caches/pip
+          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.pandas }}-pip-
+
+      - uses: actions/cache@v2
+        if: startsWith(runner.os, 'Windows')
+        with:
+          path: ~\AppData\Local\pip\Cache
+          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.pandas }}-pip-
+      - run: |
+          pip install --upgrade pip setuptools wheel
+          pip install -r requirements.txt "${{ matrix.pandas }}" "${{ matrix.numpy }}"
+          pip install -r requirements-test.txt
+      - run: make install
+      - run: make test
+  coverage:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ ubuntu-latest ]
+        python-version: [ 3.8 ]
+        pandas: [ "pandas>1.1"]
+        numpy: ["numpy"]
+
+    name: Coverage | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
     steps:
       - uses: actions/checkout@v2
       - name: Setup python

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
     - id: black
       language_version: python3.8
 -   repo: https://github.com/nbQA-dev/nbQA
-    rev: 0.5.9
+    rev: 0.7.0
     hooks:
     - id: nbqa-black
       additional_dependencies: [ black==20.8b1 ]
@@ -17,12 +17,12 @@ repos:
       additional_dependencies: [ pyupgrade==2.7.3 ]
       args: [ --nbqa-mutate, --py36-plus ]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.10.0
+    rev: v2.12.0
     hooks:
     -   id: pyupgrade
         args: ['--py36-plus','--exit-zero-even-if-changed']
 -   repo: https://github.com/pycqa/isort
-    rev: 5.7.0
+    rev: 5.8.0
     hooks:
       - id: isort
         files: '.*'
@@ -31,8 +31,8 @@ repos:
     rev: "0.46"
     hooks:
     -   id: check-manifest
--   repo: https://gitlab.com/pycqa/flake8
-    rev: "3.8.4"
+-   repo: https://github.com/PyCQA/flake8
+    rev: "3.9.1"
     hooks:
     -   id: flake8
         args: [ "--select=E9,F63,F7,F82"] #,T001

diff --git a/Makefile b/Makefile
@@ -16,7 +16,9 @@ test:
 	pytest tests/issues/
 	pytest --nbval tests/notebooks/
 	flake8 . --select=E9,F63,F7,F82 --show-source --statistics
-
+	pandas_profiling -h
+	make typing
+
 test_cov:
 	pytest --cov=. tests/unit/
 	pytest --cov=. --cov-append tests/issues/

diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@
 <p align="center">
   <a href="https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/">Documentation</a>
   |
-  <a href="https://join.slack.com/t/pandas-profiling/shared_invite/zt-l2iqwb92-9JpTEdFBijR2G798j2MpQw">Slack</a>
+  <a href="https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA">Slack</a>
   | 
   <a href="https://stackoverflow.com/questions/tagged/pandas-profiling">Stack Overflow</a>
 </p>
@@ -79,6 +79,7 @@ The following examples can give you an impression of what the package can do:
 * [Vektis](https://pandas-profiling.github.io/pandas-profiling/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data)
 * [Colors](https://pandas-profiling.github.io/pandas-profiling/examples/master/colors/colors_report.html) (a simple colors dataset)
 * [UCI Bank Dataset](https://pandas-profiling.github.io/pandas-profiling/examples/master/cbank_marketing_data/uci_bank_marketing_report.html) (banking marketing dataset)
+* [RDW](https://pandas-profiling.github.io/pandas-profiling/examples/master/rdw/rdw.html) (RDW, the Dutch DMV's vehicle registration data: 10 million rows, 71 features)
 
 
 Specific features:
@@ -211,7 +212,7 @@ profile.to_file("your_report.json")
 
 Version 2.4 introduces minimal mode. 
 
-This is a default configuration that disables expensive computations (such as correlations and dynamic binning).
+This is a default configuration that disables expensive computations (such as correlations and duplicate row detection).
 
 Use the following syntax:
 
@@ -220,6 +221,8 @@ profile = ProfileReport(large_dataset, minimal=True)
 profile.to_file("output.html")
 ```
 
+Benchmarks are available [here](https://pandas-profiling.github.io/pandas-profiling/dev/bench/).
+
 ### Command line usage
 
 For standard formatted CSV files that can be read immediately by pandas, you can use the `pandas_profiling` executable. 
@@ -239,7 +242,7 @@ A set of options is available in order to adapt the report generated.
 * `progress_bar` (`bool`): If True, `pandas-profiling` will display a progress bar.
 * `infer_dtypes` (`bool`): When `True` (default) the `dtype` of variables are inferred using `visions` using the typeset logic (for instance a column that has integers stored as string will be analyzed as if being numeric).
 
-More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml), [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml) and [dark themed configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_dark.yaml).
+More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml) and [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml).
 
 You find the configuration docs on the advanced usage page [here](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html)
 
@@ -306,14 +309,15 @@ Types are a powerful abstraction for effective data analysis, that goes beyond t
 `pandas-profiling` currently, recognizes the following types: _Boolean, Numerical, Date, Categorical, URL, Path, File_ and _Image_.
 
 We have developed a type system for Python, tailored for data analysis: [visions](https://github.com/dylan-profiler/visions).
-Selecting the right typeset drastically reduces the complexity the code of your analysis.
-Future versions of `pandas-profiling` will have extended type support through `visions`!
+Choosing an appropriate typeset can both improve the overall expressiveness and reduce the complexity of your analysis/code.
+To learn more about `pandas-profiling`'s type system, check out the default implementation [here](https://github.com/pandas-profiling/pandas-profiling/blob/develop/src/pandas_profiling/model/typeset.py).
+In the meantime, user customized summarizations and type definitions are now fully supported - if you have a specific use-case please reach out with ideas or a PR!
 
 ## Contributing
 
 Read on getting involved in the [Contribution Guide](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/contribution_guidelines.html).
 
-A low threshold place to ask questions or start contributing is by reaching out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-hfy3iwp2-qEJSItye5QBZf8YGFMaMnQ).
+A low-threshold way to ask questions or start contributing is to reach out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA).
 
 ## Editor integration
 

diff --git a/docsrc/source/pages/advanced_usage.rst b/docsrc/source/pages/advanced_usage.rst
@@ -165,3 +165,75 @@ It's possible to disable certain groups of features through configuration shorth
     r.set_variable("correlations", None)
     r.set_variable("missing_diagrams", None)
     r.set_variable("interactions", None)
+
+
+
+
+Customise plots
+---------------
+
+One way to pass arguments to the underlying matplotlib is to use the ``plot`` argument. It is possible to change the default format of images to png (default svg) using the key-value pair ``image_format: "png"`` and also the resolution of the image using ``dpi: 800``.
+
+An example would be:
+
+.. code-block:: python
+
+	profile = ProfileReport(planets, title='Pandas Profiling Report', explorative=True,
+			       plot={
+				   'dpi':200,
+				   'image_format': 'png'
+			       })
+
+
+Furthermore, it is possible to change the default values of histograms. The available options are the following:
+
+    histogram:
+            x_axis_labels: True
+
+    # Number of bins (set to 0 to automatically detect the bin size)
+            bins: 50
+
+    # Maximum number of bins (when bins=0)
+            max_bins: 250
+
+
+
+
+
+Customise correlation matrix
+-----------------------------
+
+It's possible to directly access the correlation matrix as well. That is done with the ``plot`` argument and then with the ``correlation`` key. It is possible to customise the palette: one can use a palette from the following list used in seaborn or create `their own custom matplotlib palette <https://matplotlib.org/stable/gallery/color/custom_cmap.html>`_. Supported values are:
+
+```
+'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r'
+```
+
+An example can be:
+
+.. code-block:: python
+
+  from pandas_profiling import ProfileReport
+
+  profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True,
+                       plot={
+                           'correlation':{
+                               'cmap': 'RdBu_r',
+                               'bad': '#000000'}}
+                       )
+
+
+Similarly, one can change the palette for *Missing values* using the ``missing`` argument, e.g.:
+
+.. code-block:: python
+
+  from pandas_profiling import ProfileReport
+
+  profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True,
+                       plot={
+                           'missing':{
+                               'cmap': 'RdBu_r'}}
+                       )
+
+
+
diff --git a/docsrc/source/pages/changelog/v2_12_0.rst b/docsrc/source/pages/changelog/v2_12_0.rst
@@ -3,14 +3,27 @@ Changelog v2.12.0
 
 🎉 Features
 ^^^^^^^^^^^
-- Add the number and the percentage of negative values for numerical variables `[695] <https://github.com/pandas-profiling/pandas-profiling/issues/695>`- (contributed by @gverbock). 
+- Add the number and the percentage of negative values for numerical variables `[695] <https://github.com/pandas-profiling/pandas-profiling/issues/695>`_ (contributed by @gverbock)
 - Enable setting of typeset/summarizer (contributed by @ieaves)
+- Allow empty data frames `[678] <https://github.com/pandas-profiling/pandas-profiling/issues/678>`_ (contributed by @spbail, @fwd2020-c)
+
+🐛 Bug fixes
+^^^^^^^^^^^^
+- Patch args for great_expectations datetime profiler `[727] <https://github.com/pandas-profiling/pandas-profiling/issues/727>`_ (contributed by @jstammers)
+- Negative exponent formatting `[723] <https://github.com/pandas-profiling/pandas-profiling/issues/723>`_ (reported by @rdpapworth)
 
 📖 Documentation
 ^^^^^^^^^^^^^^^^
 - Fix link syntax (contributed by @ChrisCarini)
 
+👷‍♂️ Internal Improvements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Several performance improvements (minimal mode, duplicates, frequency table sorting)
+- Introduce ``pytest-benchmark`` in CI to monitor commit performance impact
+- Introduce ``commitlint`` in CI to start automating the changelog generation
+
 ⬆️ Dependencies
 ^^^^^^^^^^^^^^^^^^
-- The `ipywidgets` dependency was moved to the `[notebook]` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx).
-- Replaced the (testing only) `fastparquet` dependency with `pyarrow` (default pandas parquet engine, contributed by @kurosch).
+- The ``ipywidgets`` dependency was moved to the ``[notebook]`` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx)
+- Replaced the (testing only) ``fastparquet`` dependency with ``pyarrow`` (default pandas parquet engine, contributed by @kurosch)
+- Upgrade ``phik``. This drops the hard dependency on numba (contributed by @akx)
diff --git a/docsrc/source/pages/changelog/v2_13_0.rst b/docsrc/source/pages/changelog/v2_13_0.rst
@@ -1,5 +1,5 @@
-Changelog vx.y.z
-----------------
+Changelog v2.13.0
+-----------------
 
 🎉 Features
 ^^^^^^^^^^^

diff --git a/docsrc/source/pages/contribution_guidelines.rst b/docsrc/source/pages/contribution_guidelines.rst
@@ -9,6 +9,10 @@ Contributing a new feature
 
 * Ensure the PR description clearly describes the problem and solution.
   Include the relevant issue number if applicable.
+
+Slack community
+---------------
+A low-threshold way to ask questions or start contributing is to reach out on the pandas-profiling Slack. `Join the Slack community <https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA>`_.
 
 Developer tools
 ---------------
@@ -61,4 +65,4 @@ Read Github's `open source legal guide <https://opensource.guide/legal/#does-my-
 More information
 ----------------
 
-Read more on getting involved in the `Contribution Guide <https://github.com/pandas-profiling/pandas-profiling/blob/master/CONTRIBUTING.md>`_ on Github.
+Read more on getting involved in the `Contribution Guide <https://github.com/pandas-profiling/pandas-profiling/blob/master/CONTRIBUTING.md>`_ on Github.
diff --git a/docsrc/source/pages/resources.rst b/docsrc/source/pages/resources.rst
@@ -14,7 +14,7 @@ Notebooks
 
 Articles
 --------
-
+- `Bringing Customization to Pandas Profiling <https://medium.com/@ianeaves/customizing-pandas-profiling-summaries-b16714d0dac9>`_ (Ian Eaves, March 5, 2021)
 - `Beginner Friendly Data Science Projects Accepting Contributions <https://towardsdatascience.com/beginner-friendly-data-science-projects-accepting-contributions-3b8e26f7e88e>`_ (Adam Ross Nelson, January 18, 2021)
 - `Pandas profiling and exploratory data analysis with line one of code! <https://towardsdatascience.com/pandas-profiling-and-exploratory-data-analysis-with-line-one-of-code-423111991e58>`_ (Magdalena Konkiewicz, Jun 10, 2020)
 - `The Covid 19 health issue <https://concillier.squarespace.com/datasets/covid-19>`_ (Concillier Kitungulu, April 20, 2020)

diff --git a/docsrc/source/pages/support.rst b/docsrc/source/pages/support.rst
@@ -35,6 +35,10 @@ Users with a request for help on how to use `pandas-profiling` should consider a
   :alt: Questions: Stackoverflow "pandas-profiling"
   :target: https://stackoverflow.com/questions/tagged/pandas-profiling
 
+Slack community
+---------------
+
+`Join the Slack community <https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA>`_ to get in touch with other users and developers who might be able to answer your questions.
 
 Reporting a bug
 ---------------