diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 000000000..d92839944
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,42 @@
+name: Performance Benchmarks
+
+on:
+ push:
+ branches:
+ - master
+ - develop
+
+jobs:
+ benchmark:
+ name: ${{ matrix.os }} x ${{ matrix.python }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ ubuntu-latest ] #, macos-latest, windows-latest ]
+ python: ['3.8']
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+ - uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python }}
+ - name: Run benchmark
+ run: |
+ pip install --upgrade pip setuptools wheel
+ pip install -r requirements.txt
+ pip install -r requirements-test.txt
+ - run: make install
+ - run: pytest tests/benchmarks/bench.py --benchmark-min-rounds 10 --benchmark-warmup "on" --benchmark-json benchmark.json
+ - name: Store benchmark result
+ uses: rhysd/github-action-benchmark@v1
+ with:
+ name: Pandas Profiling Benchmarks
+ tool: 'pytest'
+ output-file-path: benchmark.json
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ auto-push: true
+
+ comment-on-alert: true
+ alert-comment-cc-users: '@sbrugman'
diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml
new file mode 100644
index 000000000..818987e0f
--- /dev/null
+++ b/.github/workflows/commit.yml
@@ -0,0 +1,11 @@
+name: Lint Commit Messages
+on: [pull_request]
+
+jobs:
+ commitlint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+ - uses: wagoid/commitlint-github-action@v3
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/release.yml
similarity index 99%
rename from .github/workflows/ci.yml
rename to .github/workflows/release.yml
index 48e1aa31b..e286aa439 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/release.yml
@@ -1,4 +1,4 @@
-name: CI
+name: Release CI
on:
push:
diff --git a/.github/workflows/ci_test.yml b/.github/workflows/tests.yml
similarity index 56%
rename from .github/workflows/ci_test.yml
rename to .github/workflows/tests.yml
index cd797af42..95fbbbc21 100644
--- a/.github/workflows/ci_test.yml
+++ b/.github/workflows/tests.yml
@@ -1,9 +1,9 @@
-name: Tests and Coverage
+name: CI
on: push
jobs:
- build:
+ test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -33,7 +33,53 @@ jobs:
pandas: "pandas>1.1"
numpy: "numpy"
- name: python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
+ name: Tests | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
+ steps:
+ - uses: actions/checkout@v2
+ - name: Setup python
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ architecture: x64
+ - uses: actions/cache@v2
+ if: startsWith(runner.os, 'Linux')
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-${{ matrix.pandas }}-pip-
+
+ - uses: actions/cache@v2
+ if: startsWith(runner.os, 'macOS')
+ with:
+ path: ~/Library/Caches/pip
+ key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-${{ matrix.pandas }}-pip-
+
+ - uses: actions/cache@v2
+ if: startsWith(runner.os, 'Windows')
+ with:
+ path: ~\AppData\Local\pip\Cache
+ key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-${{ matrix.pandas }}-pip-
+ - run: |
+ pip install --upgrade pip setuptools wheel
+ pip install -r requirements.txt "${{ matrix.pandas }}" "${{ matrix.numpy }}"
+ pip install -r requirements-test.txt
+ - run: make install
+ - run: make test
+ coverage:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ ubuntu-latest ]
+ python-version: [ 3.8 ]
+ pandas: [ "pandas>1.1"]
+ numpy: ["numpy"]
+
+ name: Coverage | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
steps:
- uses: actions/checkout@v2
- name: Setup python
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 20305ac6d..a81c863b9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
- id: black
language_version: python3.8
- repo: https://github.com/nbQA-dev/nbQA
- rev: 0.5.9
+ rev: 0.7.0
hooks:
- id: nbqa-black
additional_dependencies: [ black==20.8b1 ]
@@ -17,12 +17,12 @@ repos:
additional_dependencies: [ pyupgrade==2.7.3 ]
args: [ --nbqa-mutate, --py36-plus ]
- repo: https://github.com/asottile/pyupgrade
- rev: v2.10.0
+ rev: v2.12.0
hooks:
- id: pyupgrade
args: ['--py36-plus','--exit-zero-even-if-changed']
- repo: https://github.com/pycqa/isort
- rev: 5.7.0
+ rev: 5.8.0
hooks:
- id: isort
files: '.*'
@@ -31,8 +31,8 @@ repos:
rev: "0.46"
hooks:
- id: check-manifest
-- repo: https://gitlab.com/pycqa/flake8
- rev: "3.8.4"
+- repo: https://github.com/PyCQA/flake8
+ rev: "3.9.1"
hooks:
- id: flake8
args: [ "--select=E9,F63,F7,F82"] #,T001
diff --git a/Makefile b/Makefile
index bbcd539e9..3a8e2d836 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,9 @@ test:
pytest tests/issues/
pytest --nbval tests/notebooks/
flake8 . --select=E9,F63,F7,F82 --show-source --statistics
-
+ pandas_profiling -h
+ make typing
+
test_cov:
pytest --cov=. tests/unit/
pytest --cov=. --cov-append tests/issues/
diff --git a/README.md b/README.md
index b1838fe88..de9e02497 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
Documentation
|
- Slack
+ Slack
|
Stack Overflow
@@ -79,6 +79,7 @@ The following examples can give you an impression of what the package can do:
* [Vektis](https://pandas-profiling.github.io/pandas-profiling/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data)
* [Colors](https://pandas-profiling.github.io/pandas-profiling/examples/master/colors/colors_report.html) (a simple colors dataset)
* [UCI Bank Dataset](https://pandas-profiling.github.io/pandas-profiling/examples/master/cbank_marketing_data/uci_bank_marketing_report.html) (banking marketing dataset)
+* [RDW](https://pandas-profiling.github.io/pandas-profiling/examples/master/rdw/rdw.html) (RDW, the Dutch DMV's vehicle registration data; 10 million rows, 71 features)
Specific features:
@@ -211,7 +212,7 @@ profile.to_file("your_report.json")
Version 2.4 introduces minimal mode.
-This is a default configuration that disables expensive computations (such as correlations and dynamic binning).
+This is a default configuration that disables expensive computations (such as correlations and duplicate row detection).
Use the following syntax:
@@ -220,6 +221,8 @@ profile = ProfileReport(large_dataset, minimal=True)
profile.to_file("output.html")
```
+Benchmarks are available [here](https://pandas-profiling.github.io/pandas-profiling/dev/bench/).
+
### Command line usage
For standard formatted CSV files that can be read immediately by pandas, you can use the `pandas_profiling` executable.
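
For example, a minimal invocation could look like this (run `pandas_profiling -h` for the authoritative list of options and arguments):

```sh
pandas_profiling -h                    # show all options and arguments
pandas_profiling data.csv report.html  # profile data.csv and write the report to report.html
```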
@@ -239,7 +242,7 @@ A set of options is available in order to adapt the report generated.
* `progress_bar` (`bool`): If True, `pandas-profiling` will display a progress bar.
* `infer_dtypes` (`bool`): When `True` (default), the `dtype` of variables is inferred by `visions` using the typeset logic (for instance, a column that has integers stored as strings will be analyzed as if it were numeric).
-More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml), [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml) and [dark themed configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_dark.yaml).
+More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml) and [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml).
You can find the configuration docs on the advanced usage page [here](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html)
@@ -306,14 +309,15 @@ Types are a powerful abstraction for effective data analysis, that goes beyond t
`pandas-profiling` currently recognizes the following types: _Boolean, Numerical, Date, Categorical, URL, Path, File_ and _Image_.
We have developed a type system for Python, tailored for data analysis: [visions](https://github.com/dylan-profiler/visions).
-Selecting the right typeset drastically reduces the complexity the code of your analysis.
-Future versions of `pandas-profiling` will have extended type support through `visions`!
+Choosing an appropriate typeset can both improve the overall expressiveness and reduce the complexity of your analysis/code.
+To learn more about `pandas-profiling`'s type system, check out the default implementation [here](https://github.com/pandas-profiling/pandas-profiling/blob/develop/src/pandas_profiling/model/typeset.py).
+In the meantime, user-customized summarizations and type definitions are now fully supported; if you have a specific use case, please reach out with ideas or a PR!
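+
+As a rough sketch of what this enables (assuming `typeset` is exposed as a keyword argument of `ProfileReport`, per the v2.12 changelog entry "Enable setting of typeset/summarizer"; exact usage may differ):
+
+```python
+from pandas_profiling import ProfileReport
+from pandas_profiling.model.typeset import ProfilingTypeSet
+
+# Start from the default typeset (optionally extended with custom visions types)
+# and pass it to the report.
+typeset = ProfilingTypeSet()
+profile = ProfileReport(df, typeset=typeset)
+```
+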
## Contributing
Read on getting involved in the [Contribution Guide](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/contribution_guidelines.html).
-A low threshold place to ask questions or start contributing is by reaching out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-hfy3iwp2-qEJSItye5QBZf8YGFMaMnQ).
+A low-threshold way to ask questions or start contributing is to reach out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA).
## Editor integration
diff --git a/docsrc/source/pages/advanced_usage.rst b/docsrc/source/pages/advanced_usage.rst
index 8f7099ef6..ba39f6bbb 100644
--- a/docsrc/source/pages/advanced_usage.rst
+++ b/docsrc/source/pages/advanced_usage.rst
@@ -165,3 +165,75 @@ It's possible to disable certain groups of features through configuration shorth
r.set_variable("correlations", None)
r.set_variable("missing_diagrams", None)
r.set_variable("interactions", None)
+
+
+
+
+Customise plots
+---------------
+
+One way to pass arguments to the underlying matplotlib plotting is to use the ``plot`` argument. It is possible to change the default image format to png (the default is svg) using the key-value pair ``image_format: "png"``, and to change the resolution of the image using, for example, ``dpi: 800``.
+
+An example would be:
+
+.. code-block:: python
+
+ profile = ProfileReport(planets, title='Pandas Profiling Report', explorative=True,
+ plot={
+ 'dpi':200,
+ 'image_format': 'png'
+ })
+
+
+Furthermore, it is possible to change the default histogram settings. The available options (which can be passed through the ``plot`` argument, as shown below) are:
+
+.. code-block:: yaml
+
+    histogram:
+      x_axis_labels: True
+
+      # Number of bins (set to 0 to automatically detect the bin size)
+      bins: 50
+
+      # Maximum number of bins (when bins=0)
+      max_bins: 250
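+
+For example (a sketch; the keys nest under the ``plot`` argument in the same way as in the configuration file):
+
+.. code-block:: python
+
+    profile = ProfileReport(df, title='Pandas Profiling Report',
+                            plot={
+                                'histogram': {
+                                    'bins': 50,
+                                    'max_bins': 250
+                                }
+                            })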
+
+
+
+
+
+Customise correlation matrix
+-----------------------------
+
+It's possible to customise the plot of the correlation matrix as well. That is done with the ``plot`` argument and the ``correlation`` key. To customise the palette, one can use any of the matplotlib/seaborn colormap names listed below, or create `their own custom matplotlib palette <https://matplotlib.org/stable/gallery/color/custom_cmap.html>`_. Supported values are:
+
+'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r'
+
+For example:
+
+.. code-block:: python
+
+ from pandas_profiling import ProfileReport
+
+ profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True,
+ plot={
+ 'correlation':{
+ 'cmap': 'RdBu_r',
+ 'bad': '#000000'}}
+ )
+
+
+Similarly, one can change the palette used for the *Missing values* diagrams via the ``missing`` key of the ``plot`` argument, e.g.:
+
+.. code-block:: python
+
+ from pandas_profiling import ProfileReport
+
+ profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True,
+ plot={
+ 'missing':{
+ 'cmap': 'RdBu_r'}}
+ )
+
+
+
diff --git a/docsrc/source/pages/changelog/v2_12_0.rst b/docsrc/source/pages/changelog/v2_12_0.rst
index 2b6b6a5e8..02d35bb4a 100644
--- a/docsrc/source/pages/changelog/v2_12_0.rst
+++ b/docsrc/source/pages/changelog/v2_12_0.rst
@@ -3,14 +3,27 @@ Changelog v2.12.0
🎉 Features
^^^^^^^^^^^
-- Add the number and the percentage of negative values for numerical variables `[695] `- (contributed by @gverbock).
+- Add the number and the percentage of negative values for numerical variables `[695] `_ (contributed by @gverbock)
- Enable setting of typeset/summarizer (contributed by @ieaves)
+- Allow empty data frames `[678] `_ (contributed by @spbail, @fwd2020-c)
+
+🐛 Bug fixes
+^^^^^^^^^^^^
+- Patch args for great_expectations datetime profiler `[727] `_ (contributed by @jstammers)
+- Negative exponent formatting `[723] `_ (reported by @rdpapworth)
📖 Documentation
^^^^^^^^^^^^^^^^
- Fix link syntax (contributed by @ChrisCarini)
+👷♂️ Internal Improvements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Several performance improvements (minimal mode, duplicates, frequency table sorting)
+- Introduce ``pytest-benchmark`` in CI to monitor commit performance impact
+- Introduce ``commitlint`` in CI to start automating the changelog generation
+
⬆️ Dependencies
^^^^^^^^^^^^^^^^^^
-- The `ipywidgets` dependency was moved to the `[notebook]` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx).
-- Replaced the (testing only) `fastparquet` dependency with `pyarrow` (default pandas parquet engine, contributed by @kurosch).
\ No newline at end of file
+- The ``ipywidgets`` dependency was moved to the ``[notebook]`` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx)
+- Replaced the (testing only) ``fastparquet`` dependency with ``pyarrow`` (default pandas parquet engine, contributed by @kurosch)
+- Upgrade ``phik``. This drops the hard dependency on numba (contributed by @akx)
diff --git a/docsrc/source/pages/changelog/v2_13_0.rst b/docsrc/source/pages/changelog/v2_13_0.rst
index f2d44616a..d8b8eb1c3 100644
--- a/docsrc/source/pages/changelog/v2_13_0.rst
+++ b/docsrc/source/pages/changelog/v2_13_0.rst
@@ -1,5 +1,5 @@
-Changelog vx.y.z
-----------------
+Changelog v2.13.0
+-----------------
🎉 Features
^^^^^^^^^^^
diff --git a/docsrc/source/pages/contribution_guidelines.rst b/docsrc/source/pages/contribution_guidelines.rst
index e0d1b4bc3..2d0b80b42 100644
--- a/docsrc/source/pages/contribution_guidelines.rst
+++ b/docsrc/source/pages/contribution_guidelines.rst
@@ -9,6 +9,10 @@ Contributing a new feature
* Ensure the PR description clearly describes the problem and solution.
Include the relevant issue number if applicable.
+
+Slack community
+---------------
+A low-threshold way to ask questions or start contributing is to reach out on the pandas-profiling Slack. `Join the Slack community `_.
Developer tools
---------------
@@ -61,4 +65,4 @@ Read Github's `open source legal guide `_ on Github.
\ No newline at end of file
+Read more on getting involved in the `Contribution Guide `_ on Github.
diff --git a/docsrc/source/pages/resources.rst b/docsrc/source/pages/resources.rst
index 16096af8d..81f85fff1 100644
--- a/docsrc/source/pages/resources.rst
+++ b/docsrc/source/pages/resources.rst
@@ -14,7 +14,7 @@ Notebooks
Articles
--------
-
+- `Bringing Customization to Pandas Profiling `_ (Ian Eaves, March 5, 2021)
- `Beginner Friendly Data Science Projects Accepting Contributions `_ (Adam Ross Nelson, January 18, 2021)
- `Pandas profiling and exploratory data analysis with line one of code! `_ (Magdalena Konkiewicz, Jun 10, 2020)
- `The Covid 19 health issue `_ (Concillier Kitungulu, April 20, 2020)
diff --git a/docsrc/source/pages/support.rst b/docsrc/source/pages/support.rst
index 46ed2e1e4..3f35ac3bd 100644
--- a/docsrc/source/pages/support.rst
+++ b/docsrc/source/pages/support.rst
@@ -35,6 +35,10 @@ Users with a request for help on how to use `pandas-profiling` should consider a
:alt: Questions: Stackoverflow "pandas-profiling"
:target: https://stackoverflow.com/questions/tagged/pandas-profiling
+Slack community
+---------------
+
+`Join the Slack community `_ and get in touch with other users and developers who might be able to answer your questions.
Reporting a bug
---------------
diff --git a/examples/bank_marketing_data/banking_data.py b/examples/bank_marketing_data/banking_data.py
index 9d5eb285c..139c5e964 100644
--- a/examples/bank_marketing_data/banking_data.py
+++ b/examples/bank_marketing_data/banking_data.py
@@ -5,12 +5,12 @@
import pandas as pd
from pandas_profiling import ProfileReport
-from pandas_profiling.utils.cache import cache_file
+from pandas_profiling.utils.cache import cache_zipped_file
if __name__ == "__main__":
- file_name = cache_file(
+ file_name = cache_zipped_file(
"bank-full.csv",
- "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
+ "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
)
# Download the UCI Bank Marketing Dataset
diff --git a/examples/rdw/rdw.py b/examples/rdw/rdw.py
new file mode 100644
index 000000000..3c500882c
--- /dev/null
+++ b/examples/rdw/rdw.py
@@ -0,0 +1,14 @@
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_file
+
+if __name__ == "__main__":
+ file_name = cache_file(
+ "rdw.parquet",
+ "https://raw.githubusercontent.com/pandas-profiling/pandas-profiling-data/master/data/rdw.parquet",
+ )
+ data = pd.read_parquet(file_name)
+
+ profile = ProfileReport(data, title="RDW Dataset", minimal=True)
+ profile.to_file("rdw.html")
diff --git a/requirements-test.txt b/requirements-test.txt
index 89c4b5df1..e92c82343 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,6 +3,7 @@ coverage<5
codecov
pytest-mypy
pytest-cov
+pytest-benchmark~=3.4.1
nbval
pyarrow
flake8
diff --git a/requirements.txt b/requirements.txt
index 56149d813..e290cb734 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ htmlmin>=0.1.12
# Missing values
missingno>=0.4.2
# Correlations
-phik>=0.10.0
+phik>=0.11.1
# Text analysis
tangled-up-in-unicode>=0.0.6
# Examples
diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml
index 42c1b1368..fa7009003 100644
--- a/src/pandas_profiling/config_default.yaml
+++ b/src/pandas_profiling/config_default.yaml
@@ -48,6 +48,7 @@ vars:
chi_squared_threshold: 0.999
coerce_str_to_date: False
redact: False
+ histogram_largest: 50
bool:
n_obs: 3
# string to boolean mappings pairs (true, false)
@@ -150,6 +151,7 @@ memory_deep: False
# Configuration related to the duplicates
duplicates:
head: 10
+ key: "# duplicates"
# Configuration related to the samples area
samples:
diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml
index e1aacad3d..16076c90f 100644
--- a/src/pandas_profiling/config_minimal.yaml
+++ b/src/pandas_profiling/config_minimal.yaml
@@ -14,7 +14,7 @@ variables:
descriptions: {}
# infer dtypes
-infer_dtypes: True
+infer_dtypes: False
# Show the description at each variable (in addition to the overview tab)
show_variable_description: True
@@ -48,6 +48,7 @@ vars:
chi_squared_threshold: 0.0
coerce_str_to_date: False
redact: False
+ histogram_largest: 10
bool:
n_obs: 3
# string to boolean mappings pairs (true, false)
@@ -151,6 +152,7 @@ memory_deep: False
# Configuration related to the duplicates
duplicates:
head: 0
+ key: "# duplicates"
# Configuration related to the samples area
samples:
diff --git a/src/pandas_profiling/model/correlations.py b/src/pandas_profiling/model/correlations.py
index 99756ed71..ee64edc0d 100644
--- a/src/pandas_profiling/model/correlations.py
+++ b/src/pandas_profiling/model/correlations.py
@@ -155,6 +155,9 @@ def calculate_correlation(
The correlation matrices for the given correlation measures. Return None if correlation is empty.
"""
+ if len(df) == 0:
+ return None
+
correlation_measures = {
"pearson": Pearson,
"spearman": Spearman,
diff --git a/src/pandas_profiling/model/describe.py b/src/pandas_profiling/model/describe.py
index 80b53d253..22d8b1f39 100644
--- a/src/pandas_profiling/model/describe.py
+++ b/src/pandas_profiling/model/describe.py
@@ -47,9 +47,6 @@ def describe(
if not isinstance(df, pd.DataFrame):
warnings.warn("df is not of type pandas.DataFrame")
- if df.empty:
- raise ValueError("df can not be empty")
-
disable_progress_bar = not config["progress_bar"].get(bool)
date_start = datetime.utcnow()
@@ -134,7 +131,8 @@ def describe(
# Duplicates
pbar.set_postfix_str("Locating duplicates")
- duplicates = get_duplicates(df, supported_columns)
+ metrics, duplicates = get_duplicates(df, supported_columns)
+ table_stats.update(metrics)
pbar.update()
# Messages
diff --git a/src/pandas_profiling/model/duplicates.py b/src/pandas_profiling/model/duplicates.py
index b81fc232b..90b1ad0fe 100644
--- a/src/pandas_profiling/model/duplicates.py
+++ b/src/pandas_profiling/model/duplicates.py
@@ -1,11 +1,13 @@
-from typing import Optional
+from typing import Any, Dict, Optional, Tuple
import pandas as pd
from pandas_profiling.config import config
-def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame]:
+def get_duplicates(
+ df: pd.DataFrame, supported_columns
+) -> Tuple[Dict[str, Any], Optional[pd.DataFrame]]:
"""Obtain the most occurring duplicate rows in the DataFrame.
Args:
@@ -17,12 +19,34 @@ def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame
"""
n_head = config["duplicates"]["head"].get(int)
- if n_head > 0 and supported_columns:
- return (
- df[df.duplicated(subset=supported_columns, keep=False)]
- .groupby(supported_columns)
- .size()
- .reset_index(name="count")
- .nlargest(n_head, "count")
- )
- return None
+ metrics: Dict[str, Any] = {}
+ if n_head > 0:
+ if supported_columns and len(df) > 0:
+ duplicates_key = config["duplicates"]["key"].get(str)
+ if duplicates_key in df.columns:
+ raise ValueError(
+ f"Duplicates key ({duplicates_key}) may not be part of the DataFrame. Either change the "
+ f" column name in the DataFrame or change the 'duplicates.key' parameter."
+ )
+
+ duplicated_rows = df.duplicated(subset=supported_columns, keep=False)
+ duplicated_rows = (
+ df[duplicated_rows]
+ .groupby(supported_columns)
+ .size()
+ .reset_index(name=duplicates_key)
+ )
+
+ metrics["n_duplicates"] = len(duplicated_rows[duplicates_key])
+ metrics["p_duplicates"] = metrics["n_duplicates"] / len(df)
+
+ return (
+ metrics,
+ duplicated_rows.nlargest(n_head, duplicates_key),
+ )
+ else:
+ metrics["n_duplicates"] = 0
+ metrics["p_duplicates"] = 0.0
+ return metrics, None
+ else:
+ return metrics, None
diff --git a/src/pandas_profiling/model/expectation_algorithms.py b/src/pandas_profiling/model/expectation_algorithms.py
index 83e748a36..efac257b5 100644
--- a/src/pandas_profiling/model/expectation_algorithms.py
+++ b/src/pandas_profiling/model/expectation_algorithms.py
@@ -69,7 +69,10 @@ def path_expectations(name, summary, batch, *args):
def datetime_expectations(name, summary, batch, *args):
if any(k in summary for k in ["min", "max"]):
batch.expect_column_values_to_be_between(
- name, min_value=summary.get("min"), max_value=summary.get("max")
+ name,
+ min_value=summary.get("min"),
+ max_value=summary.get("max"),
+ parse_strings_as_datetimes=True,
)
return name, summary, batch
diff --git a/src/pandas_profiling/model/messages.py b/src/pandas_profiling/model/messages.py
index 3330c049e..3e557cc63 100644
--- a/src/pandas_profiling/model/messages.py
+++ b/src/pandas_profiling/model/messages.py
@@ -56,6 +56,9 @@ class MessageType(Enum):
UNIFORM = auto()
"""The variable is uniformly distributed"""
+ EMPTY = auto()
+ """The DataFrame is empty"""
+
class Message:
"""A message object (type, values, column)."""
@@ -109,7 +112,7 @@ def check_table_messages(table: dict) -> List[Message]:
A list of messages.
"""
messages = []
- if warning_value(table["n_duplicates"]):
+ if "n_duplicates" in table and warning_value(table["n_duplicates"]):
messages.append(
Message(
message_type=MessageType.DUPLICATES,
@@ -117,6 +120,14 @@ def check_table_messages(table: dict) -> List[Message]:
fields={"n_duplicates"},
)
)
+ if table["n"] == 0:
+ messages.append(
+ Message(
+ message_type=MessageType.EMPTY,
+ values=table,
+ fields={"n"},
+ )
+ )
return messages
diff --git a/src/pandas_profiling/model/sample.py b/src/pandas_profiling/model/sample.py
index 1df2acc78..50fac9397 100644
--- a/src/pandas_profiling/model/sample.py
+++ b/src/pandas_profiling/model/sample.py
@@ -1,3 +1,5 @@
+from typing import List
+
import attr
import pandas as pd
@@ -12,7 +14,7 @@ class Sample:
caption = attr.ib(default=None)
-def get_sample(df: pd.DataFrame) -> list:
+def get_sample(df: pd.DataFrame) -> List[Sample]:
"""Obtains a sample from head and tail of the DataFrame
Args:
@@ -21,7 +23,10 @@ def get_sample(df: pd.DataFrame) -> list:
Returns:
a list of Sample objects
"""
- samples = []
+ samples: List[Sample] = []
+ if len(df) == 0:
+ return samples
+
n_head = config["samples"]["head"].get(int)
if n_head > 0:
samples.append(Sample("head", df.head(n=n_head), "First rows"))
diff --git a/src/pandas_profiling/model/summary.py b/src/pandas_profiling/model/summary.py
index a579275c1..ea14eae7e 100644
--- a/src/pandas_profiling/model/summary.py
+++ b/src/pandas_profiling/model/summary.py
@@ -4,7 +4,7 @@
import multiprocessing.pool
import warnings
from collections import Counter
-from typing import Callable, Mapping, Optional, Tuple
+from typing import Callable, Mapping, Tuple
import numpy as np
import pandas as pd
@@ -16,7 +16,6 @@
check_variable_messages,
)
from pandas_profiling.model.summarizer import BaseSummarizer
-from pandas_profiling.model.typeset import Unsupported
from pandas_profiling.visualisation.missing import (
missing_bar,
missing_dendrogram,
@@ -124,7 +123,7 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict:
n = len(df)
memory_size = df.memory_usage(deep=config["memory_deep"].get(bool)).sum()
- record_size = float(memory_size) / n
+ record_size = float(memory_size) / n if n > 0 else 0
table_stats = {
"n": n,
@@ -143,21 +142,9 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict:
if series_summary["n_missing"] == n:
table_stats["n_vars_all_missing"] += 1
- table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / (
- table_stats["n"] * table_stats["n_var"]
- )
-
- supported_columns = [
- k for k, v in variable_stats.items() if v["type"] != Unsupported
- ]
- table_stats["n_duplicates"] = (
- sum(df.duplicated(subset=supported_columns))
- if len(supported_columns) > 0
- else 0
- )
- table_stats["p_duplicates"] = (
- (table_stats["n_duplicates"] / len(df))
- if (len(supported_columns) > 0 and len(df) > 0)
+ table_stats["p_cells_missing"] = (
+ table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"])
+ if table_stats["n"] > 0
else 0
)
@@ -169,29 +156,6 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict:
return table_stats
-def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame]:
- """Obtain the most occurring duplicate rows in the DataFrame.
-
- Args:
- df: the Pandas DataFrame.
- supported_columns: the columns to consider
-
- Returns:
- A subset of the DataFrame, ordered by occurrence.
- """
- n_head = config["duplicates"]["head"].get(int)
-
- if n_head > 0 and supported_columns:
- return (
- df[df.duplicated(subset=supported_columns, keep=False)]
- .groupby(supported_columns)
- .size()
- .reset_index(name="count")
- .nlargest(n_head, "count")
- )
- return None
-
-
def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
"""Gets the rendered diagrams for missing values.
@@ -203,6 +167,9 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
A dictionary containing the base64 encoded plots for each diagram that is active in the config (matrix, bar, heatmap, dendrogram).
"""
+ if len(df) == 0:
+ return {}
+
def warn_missing(missing_name, error):
warnings.warn(
f"""There was an attempt to generate the {missing_name} missing values diagrams, but this failed.
diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py
index b5150228d..9a95f6be3 100644
--- a/src/pandas_profiling/model/summary_algorithms.py
+++ b/src/pandas_profiling/model/summary_algorithms.py
@@ -96,7 +96,7 @@ def describe_supported(
stats = {
"n_distinct": distinct_count,
"p_distinct": distinct_count / count if count > 0 else 0,
- "is_unique": unique_count == count,
+ "is_unique": unique_count == count and count > 0,
"n_unique": unique_count,
"p_unique": unique_count / count if count > 0 else 0,
}
@@ -120,7 +120,7 @@ def describe_generic(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]
summary.update(
{
"n": length,
- "p_missing": summary["n_missing"] / length,
+ "p_missing": summary["n_missing"] / length if length > 0 else 0,
"count": length - summary["n_missing"],
"memory_size": series.memory_usage(deep=config["memory_deep"].get(bool)),
}
@@ -233,6 +233,16 @@ def describe_numeric_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, di
stats["monotonic_decrease_strict"] = (
stats["monotonic_decrease"] and series.is_unique
)
+ if summary["monotonic_increase_strict"]:
+ stats["monotonic"] = 2
+ elif summary["monotonic_decrease_strict"]:
+ stats["monotonic"] = -2
+ elif summary["monotonic_increase"]:
+ stats["monotonic"] = 1
+ elif summary["monotonic_decrease"]:
+ stats["monotonic"] = -1
+ else:
+ stats["monotonic"] = 0
stats.update(
histogram_compute(
@@ -295,10 +305,16 @@ def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series
# Only run if at least 1 non-missing value
value_counts = summary["value_counts_without_nan"]
+ histogram_largest = config["vars"]["cat"]["histogram_largest"].get(int)
+ histogram_data = value_counts
+ if histogram_largest > 0:
+ histogram_data = histogram_data.nlargest(histogram_largest)
summary.update(
histogram_compute(
- value_counts, summary["n_distinct"], name="histogram_frequencies"
+ histogram_data,
+ summary["n_distinct"],
+ name="histogram_frequencies",
)
)
diff --git a/src/pandas_profiling/report/formatters.py b/src/pandas_profiling/report/formatters.py
index 6a29e8bdd..558f4fa4e 100644
--- a/src/pandas_profiling/report/formatters.py
+++ b/src/pandas_profiling/report/formatters.py
@@ -78,7 +78,7 @@ def fmt_timespan(num_seconds, detailed=False, max_units=3):
import math
import numbers
import re
- from datetime import datetime, timedelta
+ from datetime import timedelta
time_units = (
dict(
@@ -206,8 +206,10 @@ def fmt_numeric(value: float, precision=10) -> str:
fmtted = f"{{:.{precision}g}}".format(value)
for v in ["e+", "e-"]:
if v in fmtted:
+ sign = "-" if v in "e-" else ""
 fmtted = fmtted.replace(v, " × 10<sup>") + "</sup>"
+ fmtted = fmtted.replace("<sup>0", "<sup>")
+ fmtted = fmtted.replace("<sup>", f"<sup>{sign}")
return fmtted
@@ -255,6 +257,21 @@ def fmt(value) -> str:
return str(escape(value))
+def fmt_monotonic(value: int) -> str:
+ if value == 2:
+ return "Strictly increasing"
+ elif value == 1:
+ return "Increasing"
+ elif value == 0:
+ return "Not monotonic"
+ elif value == -1:
+ return "Decreasing"
+ elif value == -2:
+ return "Strictly decreasing"
+ else:
+ raise ValueError("Value should be integer ranging from -2 to 2.")
+
+
def help(title, url=None) -> str:
"""Creat help badge
@@ -281,6 +298,7 @@ def get_fmt_mapping() -> Dict[str, Callable]:
"fmt_bytesize": fmt_bytesize,
"fmt_timespan": fmt_timespan,
"fmt_numeric": fmt_numeric,
+ "fmt_monotonic": fmt_monotonic,
"fmt_number": fmt_number,
"fmt_array": fmt_array,
"fmt": fmt,
diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html
index 8820e1d50..59bb93c56 100644
--- a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html
+++ b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html
@@ -1 +1 @@
-Dataset has {{ message.values['n_duplicates'] }} ({{ message.values['p_duplicates'] | fmt_percent }}) duplicate rows
\ No newline at end of file
+Dataset has {{ message.values['n_duplicates'] }} ({{ message.values['p_duplicates'] | fmt_percent }}) duplicate rows
\ No newline at end of file
diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html
new file mode 100644
index 000000000..a676c9577
--- /dev/null
+++ b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html
@@ -0,0 +1 @@
+Dataset is empty
diff --git a/src/pandas_profiling/report/presentation/flavours/widget/warnings.py b/src/pandas_profiling/report/presentation/flavours/widget/warnings.py
index 97f830014..43b959f79 100644
--- a/src/pandas_profiling/report/presentation/flavours/widget/warnings.py
+++ b/src/pandas_profiling/report/presentation/flavours/widget/warnings.py
@@ -25,6 +25,7 @@ def render(self):
"skewed": "info",
"high_correlation": "",
"duplicates": "",
+ "empty": "",
}
items = []
diff --git a/src/pandas_profiling/report/presentation/frequency_table_utils.py b/src/pandas_profiling/report/presentation/frequency_table_utils.py
index bb53e1dae..0862a19b8 100644
--- a/src/pandas_profiling/report/presentation/frequency_table_utils.py
+++ b/src/pandas_profiling/report/presentation/frequency_table_utils.py
@@ -1,7 +1,9 @@
-from typing import Dict, Sequence
+from typing import Any, Dict, List
+import numpy as np
-def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]:
+
+def freq_table(freqtable, n: int, max_number_to_print: int) -> List[Dict]:
"""Render the rows for a frequency table (value, count).
Args:
@@ -19,13 +21,13 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]:
max_number_to_print = n
if max_number_to_print < len(freqtable):
- freq_other = sum(freqtable.iloc[max_number_to_print:])
+ freq_other = np.sum(freqtable.iloc[max_number_to_print:])
min_freq = freqtable.values[max_number_to_print]
else:
freq_other = 0
min_freq = 0
- freq_missing = n - sum(freqtable)
+ freq_missing = n - np.sum(freqtable)
# No values
if len(freqtable) == 0:
return []
@@ -79,39 +81,37 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]:
return rows
-def extreme_obs_table(freqtable, number_to_print, n, ascending=True) -> list:
+def extreme_obs_table(freqtable, number_to_print: int, n: int) -> List[Dict[str, Any]]:
"""Similar to the frequency table, for extreme observations.
Args:
- freqtable: The frequency table.
+ freqtable: The (sorted) frequency table.
number_to_print: The number of observations to print.
n: The total number of observations.
- ascending: The ordering of the observations (Default value = True)
Returns:
The HTML rendering of the extreme observation table.
"""
+
# If it's mixed between base types (str, int) convert to str. Pure "mixed" types are filtered during type
# discovery
# TODO: should be in cast?
- if "mixed" in freqtable.index.inferred_type:
- freqtable.index = freqtable.index.astype(str)
-
- sorted_freqtable = freqtable.sort_index(ascending=ascending)
- obs_to_print = sorted_freqtable.iloc[:number_to_print]
- max_freq = max(obs_to_print.values)
-
- rows = []
- for label, freq in obs_to_print.items():
- rows.append(
- {
- "label": label,
- "width": freq / max_freq if max_freq != 0 else 0,
- "count": freq,
- "percentage": float(freq) / n,
- "extra_class": "",
- "n": n,
- }
- )
+ # if "mixed" in freqtable.index.inferred_type:
+ # freqtable.index = freqtable.index.astype(str)
+
+ obs_to_print = freqtable.iloc[:number_to_print]
+ max_freq = obs_to_print.max()
+
+ rows = [
+ {
+ "label": label,
+ "width": freq / max_freq if max_freq != 0 else 0,
+ "count": freq,
+ "percentage": float(freq) / n,
+ "extra_class": "",
+ "n": n,
+ }
+ for label, freq in obs_to_print.items()
+ ]
return rows
diff --git a/src/pandas_profiling/report/structure/overview.py b/src/pandas_profiling/report/structure/overview.py
index e8751086a..2ab3bcb60 100644
--- a/src/pandas_profiling/report/structure/overview.py
+++ b/src/pandas_profiling/report/structure/overview.py
@@ -7,38 +7,46 @@
def get_dataset_overview(summary):
- dataset_info = Table(
+ table_metrics = [
+ {
+ "name": "Number of variables",
+ "value": summary["table"]["n_var"],
+ "fmt": "fmt_number",
+ },
+ {
+ "name": "Number of observations",
+ "value": summary["table"]["n"],
+ "fmt": "fmt_number",
+ },
+ {
+ "name": "Missing cells",
+ "value": summary["table"]["n_cells_missing"],
+ "fmt": "fmt_number",
+ },
+ {
+ "name": "Missing cells (%)",
+ "value": summary["table"]["p_cells_missing"],
+ "fmt": "fmt_percent",
+ },
+ ]
+ if "n_duplicates" in summary["table"]:
+ table_metrics.extend(
+ [
+ {
+ "name": "Duplicate rows",
+ "value": summary["table"]["n_duplicates"],
+ "fmt": "fmt_number",
+ },
+ {
+ "name": "Duplicate rows (%)",
+ "value": summary["table"]["p_duplicates"],
+ "fmt": "fmt_percent",
+ },
+ ]
+ )
+
+ table_metrics.extend(
[
- {
- "name": "Number of variables",
- "value": summary["table"]["n_var"],
- "fmt": "fmt_number",
- },
- {
- "name": "Number of observations",
- "value": summary["table"]["n"],
- "fmt": "fmt_number",
- },
- {
- "name": "Missing cells",
- "value": summary["table"]["n_cells_missing"],
- "fmt": "fmt_number",
- },
- {
- "name": "Missing cells (%)",
- "value": summary["table"]["p_cells_missing"],
- "fmt": "fmt_percent",
- },
- {
- "name": "Duplicate rows",
- "value": summary["table"]["n_duplicates"],
- "fmt": "fmt_number",
- },
- {
- "name": "Duplicate rows (%)",
- "value": summary["table"]["p_duplicates"],
- "fmt": "fmt_percent",
- },
{
"name": "Total size in memory",
"value": summary["table"]["memory_size"],
@@ -49,7 +57,11 @@ def get_dataset_overview(summary):
"value": summary["table"]["record_size"],
"fmt": "fmt_bytesize",
},
- ],
+ ]
+ )
+
+ dataset_info = Table(
+ table_metrics,
name="Dataset statistics",
)
diff --git a/src/pandas_profiling/report/structure/variables/render_common.py b/src/pandas_profiling/report/structure/variables/render_common.py
index 426f258b1..e55d29536 100644
--- a/src/pandas_profiling/report/structure/variables/render_common.py
+++ b/src/pandas_profiling/report/structure/variables/render_common.py
@@ -9,6 +9,8 @@ def render_common(summary):
n_extreme_obs = config["n_extreme_obs"].get(int)
n_freq_table_max = config["n_freq_table_max"].get(int)
+ sorted_freqtable = summary["value_counts_without_nan"].sort_index(ascending=True)
+
template_variables = {
# TODO: with nan
"freq_table_rows": freq_table(
@@ -17,16 +19,14 @@ def render_common(summary):
max_number_to_print=n_freq_table_max,
),
"firstn_expanded": extreme_obs_table(
- freqtable=summary["value_counts_without_nan"],
+ freqtable=sorted_freqtable,
number_to_print=n_extreme_obs,
n=summary["n"],
- ascending=True,
),
"lastn_expanded": extreme_obs_table(
- freqtable=summary["value_counts_without_nan"],
+ freqtable=sorted_freqtable[::-1],
number_to_print=n_extreme_obs,
n=summary["n"],
- ascending=False,
),
}
diff --git a/src/pandas_profiling/report/structure/variables/render_real.py b/src/pandas_profiling/report/structure/variables/render_real.py
index e7ce82412..6624548d5 100644
--- a/src/pandas_profiling/report/structure/variables/render_real.py
+++ b/src/pandas_profiling/report/structure/variables/render_real.py
@@ -152,17 +152,6 @@ def render_real(summary):
name="Quantile statistics",
)
- if summary["monotonic_increase_strict"]:
- monotocity = "Strictly increasing"
- elif summary["monotonic_decrease_strict"]:
- monotocity = "Strictly decreasing"
- elif summary["monotonic_increase"]:
- monotocity = "Increasing"
- elif summary["monotonic_decrease"]:
- monotocity = "Decreasing"
- else:
- monotocity = "Not monotonic"
-
descriptive_statistics = Table(
[
{
@@ -190,7 +179,11 @@ def render_real(summary):
},
{"name": "Sum", "value": summary["sum"], "fmt": "fmt_numeric"},
{"name": "Variance", "value": summary["variance"], "fmt": "fmt_numeric"},
- {"name": "Monotocity", "value": monotocity, "fmt": "fmt"},
+ {
+ "name": "Monotonicity",
+ "value": summary["monotonic"],
+ "fmt": "fmt_monotonic",
+ },
],
name="Descriptive statistics",
)
diff --git a/src/pandas_profiling/utils/cache.py b/src/pandas_profiling/utils/cache.py
index 1699b2c22..356d6fea8 100644
--- a/src/pandas_profiling/utils/cache.py
+++ b/src/pandas_profiling/utils/cache.py
@@ -1,4 +1,5 @@
"""Dataset cache utility functions"""
+import zipfile
from pathlib import Path
import requests
@@ -20,9 +21,44 @@ def cache_file(file_name: str, url: str) -> Path:
data_path = get_data_path()
data_path.mkdir(exist_ok=True)
+ file_path = data_path / file_name
+
+ # If not exists, download and create file
+ if not file_path.exists():
+ response = requests.get(url)
+ file_path.write_bytes(response.content)
+
+ return file_path
+
+
+def cache_zipped_file(file_name: str, url: str) -> Path:
+ """Check if file_name already is in the data path, otherwise download it from url.
+
+ Args:
+ file_name: the file name
+ url: the URL of the dataset
+
+ Returns:
+ The relative path to the dataset
+ """
+
+ data_path = get_data_path()
+ data_path.mkdir(exist_ok=True)
+
+ file_path = data_path / file_name
+
# If not exists, download and create file
- if not (data_path / file_name).exists():
- data = requests.get(url)
- (data_path / file_name).write_bytes(data.content)
+ if not file_path.exists():
+ response = requests.get(url)
+ if response.status_code != 200:
+ raise FileNotFoundError("Could not download resource")
+
+ tmp_path = data_path / "tmp.zip"
+ tmp_path.write_bytes(response.content)
+
+ with zipfile.ZipFile(tmp_path, "r") as zip_file:
+ zip_file.extract(file_path.name, data_path)
+
+ tmp_path.unlink()
- return data_path / file_name
+ return file_path
diff --git a/tests/benchmarks/bench.py b/tests/benchmarks/bench.py
new file mode 100644
index 000000000..a1db2a0b0
--- /dev/null
+++ b/tests/benchmarks/bench.py
@@ -0,0 +1,59 @@
+from functools import partial
+
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_file
+
+
+def func(df, **kwargs):
+ profile = ProfileReport(df, progress_bar=False, **kwargs)
+ report = profile.to_html()
+ return report
+
+
+def test_titanic_explorative(benchmark):
+ file_name = cache_file(
+ "titanic.parquet",
+ "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+ )
+
+ data = pd.read_parquet(file_name)
+
+ kwargs = dict(explorative=True)
+ benchmark(partial(func, **kwargs), data)
+
+
+def test_titanic_default(benchmark):
+ file_name = cache_file(
+ "titanic.parquet",
+ "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+ )
+
+ data = pd.read_parquet(file_name)
+
+ benchmark(partial(func), data)
+
+
+def test_titanic_minimal(benchmark):
+ file_name = cache_file(
+ "titanic.parquet",
+ "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+ )
+
+ data = pd.read_parquet(file_name)
+
+ kwargs = dict(minimal=True)
+ benchmark(partial(func, **kwargs), data)
+
+
+def test_rdw_minimal(benchmark):
+ file_name = cache_file(
+ "rdw_sample_100k.parquet",
+ "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw_sample_100k.parquet",
+ )
+
+ data = pd.read_parquet(file_name)
+
+ kwargs = dict(minimal=True)
+ benchmark(partial(func, **kwargs), data)
diff --git a/tests/issues/test_issue377.py b/tests/issues/test_issue377.py
index 2ffa39a92..3362e812e 100644
--- a/tests/issues/test_issue377.py
+++ b/tests/issues/test_issue377.py
@@ -6,25 +6,35 @@
import pandas as pd
import pytest
+import requests
-import pandas_profiling
-from pandas_profiling.utils.cache import cache_file
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_zipped_file
-@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher")
-def test_issue377():
- file_name = cache_file(
- "bank-full.csv",
- "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
- )
+@pytest.fixture()
+def df():
+ try:
+ file_name = cache_zipped_file(
+ "bank-full.csv",
+ "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
+ )
+ except (requests.exceptions.ConnectionError, FileNotFoundError):
+ return
# Download the UCI Bank Marketing Dataset
df = pd.read_csv(file_name, sep=";")
+ return df
+
+
+@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher")
+def test_issue377(df):
+ if df is None:
+ pytest.skip("dataset unavailable")
+ return
original_order = tuple(df.columns.values)
- profile = pandas_profiling.ProfileReport(
- df, sort="None", pool_size=1, progress_bar=False
- )
+ profile = ProfileReport(df, sort="None", pool_size=1, progress_bar=False)
new_order = tuple(profile.get_description()["variables"].keys())
assert original_order == new_order
diff --git a/tests/issues/test_issue51.py b/tests/issues/test_issue51.py
index 50617ca81..71815f23e 100644
--- a/tests/issues/test_issue51.py
+++ b/tests/issues/test_issue51.py
@@ -7,9 +7,6 @@
import pandas_profiling
-# FIXME: correlations can be computed stand alone to speed up tests
-from pandas_profiling.config import config
-
def test_issue51(get_data_file):
# Categorical has empty ('') value
diff --git a/tests/performance/time_inf.py b/tests/performance/time_inf.py
deleted file mode 100644
index ba2aecaa4..000000000
--- a/tests/performance/time_inf.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import timeit
-
-testcode = """
-import numpy as np
-import pandas as pd
-
-np.random.seed(12)
-vals = np.random.random(10000)
-series = pd.Series(vals)
-series[series < 0.3] = np.nan
-series[series < 0.2] = np.Inf
-
-
-
-def f1(series):
- return len(series.loc[(~np.isfinite(series)) & series.notnull()])
-
-
-def f2(series):
- return ((series == np.inf) | (series == -np.inf)).sum()
-"""
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
diff --git a/tests/performance/time_kurtosis.py b/tests/performance/time_kurtosis.py
deleted file mode 100644
index dfa106272..000000000
--- a/tests/performance/time_kurtosis.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import timeit
-
-testcode = """
-import numpy as np
-import pandas as pd
-import scipy.stats
-
-np.random.seed(12)
-vals = np.random.random(1000)
-series = pd.Series(vals)
-series[series < 0.2] = pd.NA
-
-def f1(series):
- arr = series.values
- return scipy.stats.kurtosis(arr, bias=False, nan_policy='omit')
-
-
-def f2(series):
- arr = series.values
- arr_without_nan = arr[~np.isnan(arr)]
- return scipy.stats.kurtosis(arr_without_nan, bias=False)
-
-
-def f3(series):
- return series.kurtosis()
-
-
-def f4(series):
- return series[series.notna()].kurtosis()
-"""
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/time_mad.py b/tests/performance/time_mad.py
deleted file mode 100644
index 8c6107614..000000000
--- a/tests/performance/time_mad.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import timeit
-
-testcode = '''
-import numpy as np
-import pandas as pd
-
-np.random.seed(12)
-vals = np.random.random(1000)
-series = pd.Series(vals)
-series[series < 0.2] = pd.NA
-
-
-def mad(arr):
- """ Median Absolute Deviation: a "Robust" version of standard deviation.
- Indices variabililty of the sample.
- https://en.wikipedia.org/wiki/Median_absolute_deviation
- """
- arr = np.ma.array(arr).compressed() # should be faster to not use masked arrays.
- med = np.median(arr)
- return np.median(np.abs(arr - med))
-
-
-def mad2(arr):
- """ Median Absolute Deviation: a "Robust" version of standard deviation.
- Indices variabililty of the sample.
- https://en.wikipedia.org/wiki/Median_absolute_deviation
- """
- med = np.median(arr)
- return np.median(np.abs(arr - med))
-
-
-def f1(series):
- arr = series.values
- arr_without_nan = arr[~np.isnan(arr)]
- return mad(arr_without_nan)
-
-
-def f2(series):
- arr = series.values
- arr_without_nan = arr[~np.isnan(arr)]
- return mad(arr_without_nan)
-
-
-def f3(series):
- return series.mad()
-
-
-def f4(series):
- return series[series.notna()].mad()
-'''
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/time_mean.py b/tests/performance/time_mean.py
deleted file mode 100644
index f6149a4c0..000000000
--- a/tests/performance/time_mean.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import timeit
-
-testcode = """
-import numpy as np
-import pandas as pd
-
-np.random.seed(12)
-vals = np.random.random(1000)
-series = pd.Series(vals)
-series[series < 0.2] = pd.NA
-
-
-def f1(series):
- arr = series.values
- arr_without_nan = arr[~np.isnan(arr)]
- return np.mean(arr_without_nan)
-
-
-def f2(series):
- arr = series.values
- return np.nanmean(arr)
-
-
-def f3(series):
- return series.mean()
-
-
-def f4(series):
- return series[series.notna()].mean()
-"""
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/timings.py b/tests/performance/timings.py
deleted file mode 100644
index acde9360d..000000000
--- a/tests/performance/timings.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import timeit
-from itertools import product
-from string import ascii_lowercase
-
-import numpy as np
-import pandas as pd
-import seaborn as sns
-from matplotlib import pyplot as plt
-
-from pandas_profiling import ProfileReport
-
-
-def generate_column_names(n):
- column_names = []
- iters = 1
- while len(column_names) < n:
- column_names += list(
- "".join(combo) for combo in product(ascii_lowercase, repeat=iters)
- )
- iters += 1
- return column_names
-
-
-def make_sample_data(cols, rows):
- column_names = generate_column_names(cols)
-
- df = pd.DataFrame(
- np.random.randint(0, 1000000, size=(rows, cols)), columns=column_names[0:cols]
- )
- df = df.astype(str)
-
- assert df.shape == (rows, cols)
- return df.copy()
-
-
-def make_report_minimal(df):
- report = ProfileReport(
- df,
- minimal=True,
- pool_size=0,
- sort="None",
- title="Dataset with Numeric Categories",
- )
- html = report.to_html()
- assert type(html) == str and 'Dataset info' in html
-
-
-def make_report(df):
- report = ProfileReport(
- df,
- minimal=False,
- pool_size=0,
- sort="None",
- title="Dataset with Numeric Categories",
- )
- html = report.to_html()
- assert type(html) == str and 'Dataset info' in html
-
-
-def wrap_func(function):
- def inner(df):
- def double_inner():
- return function(df)
-
- return double_inner
-
- return inner
-
-
-def time_report(func, cols, rows, runs=5):
- df = make_sample_data(cols, rows)
- print(df.shape)
- test = wrap_func(func)(df.copy())
- return timeit.timeit(test, number=runs) / runs
-
-
-def plot_col_run_time():
- cols = [2, 4, 10, 50]
- row = 1000
- default_times = [time_report(make_report, col, row) for col in cols]
- minimal_times = [time_report(make_report_minimal, col, row) for col in cols]
-
- ax1 = sns.scatterplot(cols, default_times)
- ax2 = sns.scatterplot(cols, minimal_times)
- _ = ax1.set(
- xlabel=f"Number of columns (row={row})",
- ylabel="time (s)",
- title="Run Time Complexity",
- )
- plt.show()
-
-
-def plot_row_run_time():
- # 10, 100
- # https://github.com/pandas-profiling/pandas-profiling/issues/270
- rows = [1000, 10000, 100000]
- col = 10
- default_times = [time_report(make_report, col, row) for row in rows]
- minimal_times = [time_report(make_report_minimal, col, row) for row in rows]
-
- ax1 = sns.scatterplot(rows, default_times)
- ax2 = sns.scatterplot(rows, minimal_times)
- _ = ax1.set(
- xlabel=f"Number of rows (col={col})",
- ylabel="time (s)",
- title="Run Time Complexity",
- )
- plt.show()
-
-
-if __name__ == "__main__":
- plot_col_run_time()
- plot_row_run_time()
diff --git a/tests/unit/test_custom_sample.py b/tests/unit/test_custom_sample.py
index 81a0bd551..4aab90280 100644
--- a/tests/unit/test_custom_sample.py
+++ b/tests/unit/test_custom_sample.py
@@ -1,5 +1,3 @@
-from pathlib import Path
-
import pandas as pd
from pandas_profiling import ProfileReport
diff --git a/tests/unit/test_decorator.py b/tests/unit/test_decorator.py
index 25d0a58bc..57f3dc5e0 100644
--- a/tests/unit/test_decorator.py
+++ b/tests/unit/test_decorator.py
@@ -1,5 +1,4 @@
import pandas as pd
-import pytest
import pandas_profiling
@@ -16,11 +15,3 @@ def test_decorator(get_data_file):
missing_diagrams={"heatmap": False, "dendrogram": False},
)
assert "Coursera Test Report" in report.to_html(), "Title is not found"
-
-
-def test_empty_decorator():
- df = pd.DataFrame().profile_report(progress_bar=False)
- with pytest.raises(ValueError) as e:
- df.get_description()
-
- assert e.value.args[0] == "df can not be empty"
diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py
index 1d6589df3..cbec917f8 100644
--- a/tests/unit/test_describe.py
+++ b/tests/unit/test_describe.py
@@ -571,12 +571,6 @@ def test_describe_df(column, describe_data, expected_results, summarizer, typese
), f"Histogram missing for column {column}"
-def test_describe_empty(summarizer, typeset):
- empty_frame = pd.DataFrame()
- with pytest.raises(ValueError):
- describe("", empty_frame, summarizer, typeset)
-
-
def test_describe_list(summarizer, typeset):
with pytest.raises(AttributeError):
with pytest.warns(UserWarning):
diff --git a/tests/unit/test_duplicates.py b/tests/unit/test_duplicates.py
new file mode 100644
index 000000000..b3043edce
--- /dev/null
+++ b/tests/unit/test_duplicates.py
@@ -0,0 +1,30 @@
+"""Test for the duplicates functionality"""
+import numpy as np
+import pandas as pd
+import pytest
+
+from pandas_profiling.model.duplicates import get_duplicates
+
+
+@pytest.fixture(scope="module")
+def test_data():
+ np.random.seed(5)
+ df = pd.DataFrame(
+ np.random.randint(1, 100, (100, 5)),
+ columns=["a", "b", "c", "duplicates", "count"],
+ )
+ df = pd.concat([df, df], axis=0)
+ return df
+
+
+def test_issue725(test_data):
+ metrics, duplicates = get_duplicates(test_data, list(test_data.columns))
+ assert metrics["n_duplicates"] == 100
+ assert metrics["p_duplicates"] == 0.5
+ assert set(duplicates.columns) == set(test_data.columns).union({"# duplicates"})
+
+
+def test_issue725_existing(test_data):
+ test_data = test_data.rename(columns={"count": "# duplicates"})
+ with pytest.raises(ValueError):
+ _, _ = get_duplicates(test_data, list(test_data.columns))
diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py
index cbb72a6ee..8b1487543 100644
--- a/tests/unit/test_example.py
+++ b/tests/unit/test_example.py
@@ -50,3 +50,16 @@ def test_example(get_data_file, test_output_dir):
and len(profile.get_description().items()) == 10
), "Unexpected result"
assert "12" in profile.to_html()
+
+
+def test_example_empty():
+ df = pd.DataFrame({"A": [], "B": []})
+ profile = ProfileReport(df)
+ description = profile.get_description()
+
+ assert len(description["correlations"]) == 0
+ assert len(description["missing"]) == 0
+ assert len(description["sample"]) == 0
+
+ html = profile.to_html()
+ assert "Dataset is empty" in html
diff --git a/tests/unit/test_formatters.py b/tests/unit/test_formatters.py
index 09711cd47..4f6f46faf 100644
--- a/tests/unit/test_formatters.py
+++ b/tests/unit/test_formatters.py
@@ -6,6 +6,7 @@
fmt_bytesize,
fmt_class,
fmt_color,
+ fmt_monotonic,
fmt_numeric,
)
@@ -79,7 +80,36 @@ def test_fmt_array(array, threshold, expected):
(81.000000, 10, "81"),
(81, 10, "81"),
(81.999861123123123123, 10, "81.99986112"),
+ (1e20, 10, "1 × 10<sup>20</sup>"),
+ (1e-20, 10, "1 × 10<sup>-20</sup>"),
+ (1e8, 3, "1 × 10<sup>8</sup>"),
],
)
def test_fmt_numeric(value, precision, expected):
assert fmt_numeric(value, precision) == expected
+
+
+@pytest.mark.parametrize(
+ "value, expected",
+ [
+ (-2, "Strictly decreasing"),
+ (-1, "Decreasing"),
+ (0, "Not monotonic"),
+ (1, "Increasing"),
+ (2, "Strictly increasing"),
+ ],
+)
+def test_fmt_monotonic(value, expected):
+ assert fmt_monotonic(value) == expected
+
+
+@pytest.mark.parametrize(
+ "value",
+ [
+ -3,
+ 3,
+ ],
+)
+def test_fmt_monotonic_err(value):
+ with pytest.raises(ValueError):
+ fmt_monotonic(value)
diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py
index 40f3850ca..443f2f1aa 100644
--- a/tests/unit/test_ge_integration_expectations.py
+++ b/tests/unit/test_ge_integration_expectations.py
@@ -123,6 +123,7 @@ def test_datetime_expectations(batch):
"column",
min_value=0,
max_value=100,
+ parse_strings_as_datetimes=True,
)
diff --git a/tests/unit/test_interactions.py b/tests/unit/test_interactions.py
index ac658bb99..e02dac497 100644
--- a/tests/unit/test_interactions.py
+++ b/tests/unit/test_interactions.py
@@ -1,5 +1,3 @@
-from pathlib import Path
-
import numpy as np
import pandas as pd
diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py
new file mode 100644
index 000000000..f44de68b0
--- /dev/null
+++ b/tests/unit/test_summary.py
@@ -0,0 +1,10 @@
+import pandas as pd
+
+from pandas_profiling.model.summary import get_table_stats
+
+
+def test_get_table_stats_empty_df():
+ df = pd.DataFrame({"A": [], "B": []})
+ table_stats = get_table_stats(df, {})
+ assert table_stats["n"] == 0
+ assert table_stats["p_cells_missing"] == 0
diff --git a/tests/unit/test_summary_algos.py b/tests/unit/test_summary_algos.py
index 98460bec6..ec5846670 100644
--- a/tests/unit/test_summary_algos.py
+++ b/tests/unit/test_summary_algos.py
@@ -1,7 +1,12 @@
import numpy as np
import pandas as pd
+import pytest
-from pandas_profiling.model.summary_algorithms import describe_counts
+from pandas_profiling.model.summary_algorithms import (
+ describe_counts,
+ describe_generic,
+ describe_supported,
+)
def test_count_summary_sorted():
@@ -24,3 +29,25 @@ def test_count_summary_category():
)
sn, r = describe_counts(s, {})
assert len(r["value_counts_without_nan"].index) == 2
+
+
+@pytest.fixture(scope="class")
+def empty_data() -> pd.DataFrame:
+ return pd.DataFrame({"A": []})
+
+
+def test_summary_supported_empty_df(empty_data):
+ series, summary = describe_counts(empty_data["A"], {})
+ assert summary["n_missing"] == 0
+ assert "p_missing" not in summary
+
+ series, summary = describe_generic(series, summary)
+ assert summary["n_missing"] == 0
+ assert summary["p_missing"] == 0
+ assert summary["count"] == 0
+
+ _, summary = describe_supported(series, summary)
+ assert summary["n_distinct"] == 0
+ assert summary["p_distinct"] == 0
+ assert summary["n_unique"] == 0
+ assert not summary["is_unique"]