diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000..d92839944 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,42 @@ +name: Performance Benchmarks + +on: + push: + branches: + - master + - develop + +jobs: + benchmark: + name: ${{ matrix.os }} x ${{ matrix.python }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] #, macos-latest, windows-latest ] + python: ['3.8'] + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python }} + - name: Run benchmark + run: | + pip install --upgrade pip setuptools wheel + pip install -r requirements.txt + pip install -r requirements-test.txt + - run: make install + - run: pytest tests/benchmarks/bench.py --benchmark-min-rounds 10 --benchmark-warmup "on" --benchmark-json benchmark.json + - name: Store benchmark result + uses: rhysd/github-action-benchmark@v1 + with: + name: Pandas Profiling Benchmarks + tool: 'pytest' + output-file-path: benchmark.json + github-token: ${{ secrets.GITHUB_TOKEN }} + auto-push: true + + comment-on-alert: true + alert-comment-cc-users: '@sbrugman' diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml new file mode 100644 index 000000000..818987e0f --- /dev/null +++ b/.github/workflows/commit.yml @@ -0,0 +1,11 @@ +name: Lint Commit Messages +on: [pull_request] + +jobs: + commitlint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: wagoid/commitlint-github-action@v3 \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/release.yml similarity index 99% rename from .github/workflows/ci.yml rename to .github/workflows/release.yml index 48e1aa31b..e286aa439 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: CI +name: Release CI on: push: diff --git a/.github/workflows/ci_test.yml b/.github/workflows/tests.yml similarity index 56% rename from .github/workflows/ci_test.yml rename to .github/workflows/tests.yml index cd797af42..95fbbbc21 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/tests.yml @@ -1,9 +1,9 @@ -name: Tests and Coverage +name: CI on: push jobs: - build: + test: runs-on: ${{ matrix.os }} strategy: matrix: @@ -33,7 +33,53 @@ jobs: pandas: "pandas>1.1" numpy: "numpy" - name: python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }} + name: Tests | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }} + steps: + - uses: actions/checkout@v2 + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - uses: actions/cache@v2 + if: startsWith(runner.os, 'Linux') + with: + path: ~/.cache/pip + key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.pandas }}-pip- + + - uses: actions/cache@v2 + if: startsWith(runner.os, 'macOS') + with: + path: ~/Library/Caches/pip + key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.pandas }}-pip- + + - uses: actions/cache@v2 + if: startsWith(runner.os, 'Windows') + with: + path: ~\AppData\Local\pip\Cache + key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ 
runner.os }}-${{ matrix.pandas }}-pip- + - run: | + pip install --upgrade pip setuptools wheel + pip install -r requirements.txt "${{ matrix.pandas }}" "${{ matrix.numpy }}" + pip install -r requirements-test.txt + - run: make install + - run: make test + coverage: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ 3.8 ] + pandas: [ "pandas>1.1"] + numpy: ["numpy"] + + name: Coverage | python ${{ matrix.python-version }}, ${{ matrix.os }}, ${{ matrix.pandas }}, ${{ matrix.numpy }} steps: - uses: actions/checkout@v2 - name: Setup python diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 20305ac6d..a81c863b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: black language_version: python3.8 - repo: https://github.com/nbQA-dev/nbQA - rev: 0.5.9 + rev: 0.7.0 hooks: - id: nbqa-black additional_dependencies: [ black==20.8b1 ] @@ -17,12 +17,12 @@ repos: additional_dependencies: [ pyupgrade==2.7.3 ] args: [ --nbqa-mutate, --py36-plus ] - repo: https://github.com/asottile/pyupgrade - rev: v2.10.0 + rev: v2.12.0 hooks: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] - repo: https://github.com/pycqa/isort - rev: 5.7.0 + rev: 5.8.0 hooks: - id: isort files: '.*' @@ -31,8 +31,8 @@ repos: rev: "0.46" hooks: - id: check-manifest -- repo: https://gitlab.com/pycqa/flake8 - rev: "3.8.4" +- repo: https://github.com/PyCQA/flake8 + rev: "3.9.1" hooks: - id: flake8 args: [ "--select=E9,F63,F7,F82"] #,T001 diff --git a/Makefile b/Makefile index bbcd539e9..3a8e2d836 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,9 @@ test: pytest tests/issues/ pytest --nbval tests/notebooks/ flake8 . --select=E9,F63,F7,F82 --show-source --statistics - + pandas_profiling -h + make typing + test_cov: pytest --cov=. tests/unit/ pytest --cov=. --cov-append tests/issues/ diff --git a/README.md b/README.md index b1838fe88..de9e02497 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@
  Documentation |
- Slack
+ Slack | Stack Overflow
@@ -79,6 +79,7 @@ The following examples can give you an impression of what the package can do: * [Vektis](https://pandas-profiling.github.io/pandas-profiling/examples/master/vektis/vektis_report.html) (Vektis Dutch Healthcare data) * [Colors](https://pandas-profiling.github.io/pandas-profiling/examples/master/colors/colors_report.html) (a simple colors dataset) * [UCI Bank Dataset](https://pandas-profiling.github.io/pandas-profiling/examples/master/cbank_marketing_data/uci_bank_marketing_report.html) (banking marketing dataset) +* [RDW](https://pandas-profiling.github.io/pandas-profiling/examples/master/rdw/rdw.html) (RDW, the Dutch DMV's vehicle registration 10 million rows, 71 features) Specific features: @@ -211,7 +212,7 @@ profile.to_file("your_report.json") Version 2.4 introduces minimal mode. -This is a default configuration that disables expensive computations (such as correlations and dynamic binning). +This is a default configuration that disables expensive computations (such as correlations and duplicate row detection). Use the following syntax: @@ -220,6 +221,8 @@ profile = ProfileReport(large_dataset, minimal=True) profile.to_file("output.html") ``` +Benchmarks are available [here](https://pandas-profiling.github.io/pandas-profiling/dev/bench/). + ### Command line usage For standard formatted CSV files that can be read immediately by pandas, you can use the `pandas_profiling` executable. @@ -239,7 +242,7 @@ A set of options is available in order to adapt the report generated. * `progress_bar` (`bool`): If True, `pandas-profiling` will display a progress bar. * `infer_dtypes` (`bool`): When `True` (default) the `dtype` of variables are inferred using `visions` using the typeset logic (for instance a column that has integers stored as string will be analyzed as if being numeric). -More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml), [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml) and [dark themed configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_dark.yaml). +More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml) and [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml). You find the configuration docs on the advanced usage page [here](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html) @@ -306,14 +309,15 @@ Types are a powerful abstraction for effective data analysis, that goes beyond t `pandas-profiling` currently, recognizes the following types: _Boolean, Numerical, Date, Categorical, URL, Path, File_ and _Image_. We have developed a type system for Python, tailored for data analysis: [visions](https://github.com/dylan-profiler/visions). -Selecting the right typeset drastically reduces the complexity the code of your analysis. -Future versions of `pandas-profiling` will have extended type support through `visions`! +Choosing an appropriate typeset can both improve the overall expressiveness and reduce the complexity of your analysis/code. 
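For illustration, here is a small sketch (not part of the diff) showing how the inferred types can be inspected. It relies only on the `progress_bar` and `infer_dtypes` options documented above and on the `get_description()` accessor exercised by the tests in this PR; the `"type"` entry on each per-variable summary is an assumption based on `model/summary.py` in this diff.

```python
# Hedged sketch: inspect which type pandas-profiling assigns to each column.
# With the default configuration (infer_dtypes: True), string-encoded integers
# are expected to be analysed as numeric. The "type" key on each variable
# summary is an assumption based on model/summary.py in this diff.
import pandas as pd

from pandas_profiling import ProfileReport

df = pd.DataFrame(
    {
        "ints_as_str": ["1", "2", "3", "4", "5", "6", "7", "8"],
        "category": list("ababcdcd"),
        "flag": [True, False] * 4,
    }
)

profile = ProfileReport(df, title="Typeset sketch", progress_bar=False)
variables = profile.get_description()["variables"]

for column, summary in variables.items():
    print(f"{column}: {summary['type']}")
```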
+To learn more about `pandas-profiling`'s type system, check out the default implementation [here](https://github.com/pandas-profiling/pandas-profiling/blob/develop/src/pandas_profiling/model/typeset.py). +In the meantime, user customized summarizations and type definitions are now fully supported - if you have a specific use-case please reach out with ideas or a PR! ## Contributing Read on getting involved in the [Contribution Guide](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/contribution_guidelines.html). -A low threshold place to ask questions or start contributing is by reaching out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-hfy3iwp2-qEJSItye5QBZf8YGFMaMnQ). +A low threshold place to ask questions or start contributing is by reaching out on the pandas-profiling Slack. [Join the Slack community](https://join.slack.com/t/pandas-profiling/shared_invite/zt-oe5ol4yc-YtbOxNBGUCb~v73TamRLuA). ## Editor integration diff --git a/docsrc/source/pages/advanced_usage.rst b/docsrc/source/pages/advanced_usage.rst index 8f7099ef6..ba39f6bbb 100644 --- a/docsrc/source/pages/advanced_usage.rst +++ b/docsrc/source/pages/advanced_usage.rst @@ -165,3 +165,75 @@ It's possible to disable certain groups of features through configuration shorth r.set_variable("correlations", None) r.set_variable("missing_diagrams", None) r.set_variable("interactions", None) + + + + +Customise plots +--------------- + +A way how to pass arguments to the underlying matplotlib is to use the ``plot`` argument. It is possible to change the default format of images to png (default svg) using the key-pair ``image_format: "png"`` and also the resolution of the image using ``dpi: 800``. + +An example would be: + +.. code-block:: python + + profile = ProfileReport(planets, title='Pandas Profiling Report', explorative=True, + plot={ + 'dpi':200, + 'image_format': 'png' + }) + + +Furthermore, it is possible to change the default values of histograms, the options for that are the following: + + histogram: + x_axis_labels: True + + # Number of bins (set to 0 to automatically detect the bin size) + bins: 50 + + # Maximum number of bins (when bins=0) + max_bins: 250 + + + + + +Customise correlation matrix +----------------------------- + +It's possible to directly access the correlation matrix as well. That is done with the ``plot`` argument and then with the `correlation` key. It is possible to customise the palett, one can use the following list used in seaborn or create [their own custom matplotlib palette](https://matplotlib.org/stable/gallery/color/custom_cmap.html). 
Supported values are + +``` +'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r' +``` + +An example can be: + +.. code-block:: python + + from pandas_profiling import ProfileReport + + profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True, + plot={ + 'correlation':{ + 'cmap': 'RdBu_r', + 'bad': '#000000'}} + ) + + +Similarly, one can change the palette for *Missing values* using the ``missing`` argument, eg: + +.. code-block:: python + + from pandas_profiling import ProfileReport + + profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True, + plot={ + 'missing':{ + 'cmap': 'RdBu_r'}} + ) + + + diff --git a/docsrc/source/pages/changelog/v2_12_0.rst b/docsrc/source/pages/changelog/v2_12_0.rst index 2b6b6a5e8..02d35bb4a 100644 --- a/docsrc/source/pages/changelog/v2_12_0.rst +++ b/docsrc/source/pages/changelog/v2_12_0.rst @@ -3,14 +3,27 @@ Changelog v2.12.0 🎉 Features ^^^^^^^^^^^ -- Add the number and the percentage of negative values for numerical variables `[695] `- (contributed by @gverbock). 
+- Add the number and the percentage of negative values for numerical variables `[695] `_ (contributed by @gverbock) - Enable setting of typeset/summarizer (contributed by @ieaves) +- Allow empty data frames `[678] `_ (contributed by @spbail, @fwd2020-c) + +🐛 Bug fixes +^^^^^^^^^^^^ +- Patch args for great_expectations datetime profiler `[727] `_ (contributed by @jstammers) +- Negative exponent formatting `[723] `_ (reported by @rdpapworth) 📖 Documentation ^^^^^^^^^^^^^^^^ - Fix link syntax (contributed by @ChrisCarini) +👷‍♂️ Internal Improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Several performance improvements (minimal mode, duplicates, frequency table sorting) +- Introduce ``pytest-benchmark`` in CI to monitor commit performance impact +- Introduce ``commitlint`` in CI to start automating the changelog generation + ⬆️ Dependencies ^^^^^^^^^^^^^^^^^^ -- The `ipywidgets` dependency was moved to the `[notebook]` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx). -- Replaced the (testing only) `fastparquet` dependency with `pyarrow` (default pandas parquet engine, contributed by @kurosch). \ No newline at end of file +- The ``ipywidgets`` dependency was moved to the ``[notebook]`` extra, so most of Jupyter will not be installed alongside this package by default (contributed by @akx) +- Replaced the (testing only) ``fastparquet`` dependency with ``pyarrow`` (default pandas parquet engine, contributed by @kurosch) +- Upgrade ``phik``. This drops the hard dependency on numba (contributed by @akx) diff --git a/docsrc/source/pages/changelog/v2_13_0.rst b/docsrc/source/pages/changelog/v2_13_0.rst index f2d44616a..d8b8eb1c3 100644 --- a/docsrc/source/pages/changelog/v2_13_0.rst +++ b/docsrc/source/pages/changelog/v2_13_0.rst @@ -1,5 +1,5 @@ -Changelog vx.y.z ----------------- +Changelog v2.13.0 +----------------- 🎉 Features ^^^^^^^^^^^ diff --git a/docsrc/source/pages/contribution_guidelines.rst b/docsrc/source/pages/contribution_guidelines.rst index e0d1b4bc3..2d0b80b42 100644 --- a/docsrc/source/pages/contribution_guidelines.rst +++ b/docsrc/source/pages/contribution_guidelines.rst @@ -9,6 +9,10 @@ Contributing a new feature * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. + +Slack community +--------------- +A low threshold place to ask questions or start contributing is by reaching out on the pandas-profiling Slack. `Join the Slack community `_. Developer tools --------------- @@ -61,4 +65,4 @@ Read Github's `open source legal guide `_ on Github. \ No newline at end of file +Read more on getting involved in the `Contribution Guide `_ on Github. diff --git a/docsrc/source/pages/resources.rst b/docsrc/source/pages/resources.rst index 16096af8d..81f85fff1 100644 --- a/docsrc/source/pages/resources.rst +++ b/docsrc/source/pages/resources.rst @@ -14,7 +14,7 @@ Notebooks Articles -------- - +- `Bringing Customization to Pandas Profiling `_ (Ian Eaves, March 5, 2021) - `Beginner Friendly Data Science Projects Accepting Contributions `_ (Adam Ross Nelson, January 18, 2021) - `Pandas profiling and exploratory data analysis with line one of code! 
`_ (Magdalena Konkiewicz, Jun 10, 2020) - `The Covid 19 health issue `_ (Concillier Kitungulu, April 20, 2020) diff --git a/docsrc/source/pages/support.rst b/docsrc/source/pages/support.rst index 46ed2e1e4..3f35ac3bd 100644 --- a/docsrc/source/pages/support.rst +++ b/docsrc/source/pages/support.rst @@ -35,6 +35,10 @@ Users with a request for help on how to use `pandas-profiling` should consider a :alt: Questions: Stackoverflow "pandas-profiling" :target: https://stackoverflow.com/questions/tagged/pandas-profiling +Slack community +--------------- + +`Join the Slack community `_ and come into contact with other users and developers, that might be able to answer your questions. Reporting a bug --------------- diff --git a/examples/bank_marketing_data/banking_data.py b/examples/bank_marketing_data/banking_data.py index 9d5eb285c..139c5e964 100644 --- a/examples/bank_marketing_data/banking_data.py +++ b/examples/bank_marketing_data/banking_data.py @@ -5,12 +5,12 @@ import pandas as pd from pandas_profiling import ProfileReport -from pandas_profiling.utils.cache import cache_file +from pandas_profiling.utils.cache import cache_zipped_file if __name__ == "__main__": - file_name = cache_file( + file_name = cache_zipped_file( "bank-full.csv", - "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv", + "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip", ) # Download the UCI Bank Marketing Dataset diff --git a/examples/rdw/rdw.py b/examples/rdw/rdw.py new file mode 100644 index 000000000..3c500882c --- /dev/null +++ b/examples/rdw/rdw.py @@ -0,0 +1,14 @@ +import pandas as pd + +from pandas_profiling import ProfileReport +from pandas_profiling.utils.cache import cache_file + +if __name__ == "__main__": + file_name = cache_file( + "rdw.parquet", + "https://raw.githubusercontent.com/pandas-profiling/pandas-profiling-data/master/data/rdw.parquet", + ) + data = pd.read_parquet(file_name) + + profile = ProfileReport(data, title="RDW Dataset", minimal=True) + profile.to_file("rdw.html") diff --git a/requirements-test.txt b/requirements-test.txt index 89c4b5df1..e92c82343 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,6 +3,7 @@ coverage<5 codecov pytest-mypy pytest-cov +pytest-benchmark~=3.4.1 nbval pyarrow flake8 diff --git a/requirements.txt b/requirements.txt index 56149d813..e290cb734 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ htmlmin>=0.1.12 # Missing values missingno>=0.4.2 # Correlations -phik>=0.10.0 +phik>=0.11.1 # Text analysis tangled-up-in-unicode>=0.0.6 # Examples diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml index 42c1b1368..fa7009003 100644 --- a/src/pandas_profiling/config_default.yaml +++ b/src/pandas_profiling/config_default.yaml @@ -48,6 +48,7 @@ vars: chi_squared_threshold: 0.999 coerce_str_to_date: False redact: False + histogram_largest: 50 bool: n_obs: 3 # string to boolean mappings pairs (true, false) @@ -150,6 +151,7 @@ memory_deep: False # Configuration related to the duplicates duplicates: head: 10 + key: "# duplicates" # Configuration related to the samples area samples: diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml index e1aacad3d..16076c90f 100644 --- a/src/pandas_profiling/config_minimal.yaml +++ b/src/pandas_profiling/config_minimal.yaml @@ -14,7 +14,7 @@ variables: descriptions: {} # infer dtypes -infer_dtypes: True +infer_dtypes: False # Show the description at each variable (in 
addition to the overview tab) show_variable_description: True @@ -48,6 +48,7 @@ vars: chi_squared_threshold: 0.0 coerce_str_to_date: False redact: False + histogram_largest: 10 bool: n_obs: 3 # string to boolean mappings pairs (true, false) @@ -151,6 +152,7 @@ memory_deep: False # Configuration related to the duplicates duplicates: head: 0 + key: "# duplicates" # Configuration related to the samples area samples: diff --git a/src/pandas_profiling/model/correlations.py b/src/pandas_profiling/model/correlations.py index 99756ed71..ee64edc0d 100644 --- a/src/pandas_profiling/model/correlations.py +++ b/src/pandas_profiling/model/correlations.py @@ -155,6 +155,9 @@ def calculate_correlation( The correlation matrices for the given correlation measures. Return None if correlation is empty. """ + if len(df) == 0: + return None + correlation_measures = { "pearson": Pearson, "spearman": Spearman, diff --git a/src/pandas_profiling/model/describe.py b/src/pandas_profiling/model/describe.py index 80b53d253..22d8b1f39 100644 --- a/src/pandas_profiling/model/describe.py +++ b/src/pandas_profiling/model/describe.py @@ -47,9 +47,6 @@ def describe( if not isinstance(df, pd.DataFrame): warnings.warn("df is not of type pandas.DataFrame") - if df.empty: - raise ValueError("df can not be empty") - disable_progress_bar = not config["progress_bar"].get(bool) date_start = datetime.utcnow() @@ -134,7 +131,8 @@ def describe( # Duplicates pbar.set_postfix_str("Locating duplicates") - duplicates = get_duplicates(df, supported_columns) + metrics, duplicates = get_duplicates(df, supported_columns) + table_stats.update(metrics) pbar.update() # Messages diff --git a/src/pandas_profiling/model/duplicates.py b/src/pandas_profiling/model/duplicates.py index b81fc232b..90b1ad0fe 100644 --- a/src/pandas_profiling/model/duplicates.py +++ b/src/pandas_profiling/model/duplicates.py @@ -1,11 +1,13 @@ -from typing import Optional +from typing import Any, Dict, Optional, Tuple import pandas as pd from pandas_profiling.config import config -def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame]: +def get_duplicates( + df: pd.DataFrame, supported_columns +) -> Tuple[Dict[str, Any], Optional[pd.DataFrame]]: """Obtain the most occurring duplicate rows in the DataFrame. Args: @@ -17,12 +19,34 @@ def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame """ n_head = config["duplicates"]["head"].get(int) - if n_head > 0 and supported_columns: - return ( - df[df.duplicated(subset=supported_columns, keep=False)] - .groupby(supported_columns) - .size() - .reset_index(name="count") - .nlargest(n_head, "count") - ) - return None + metrics: Dict[str, Any] = {} + if n_head > 0: + if supported_columns and len(df) > 0: + duplicates_key = config["duplicates"]["key"].get(str) + if duplicates_key in df.columns: + raise ValueError( + f"Duplicates key ({duplicates_key}) may not be part of the DataFrame. Either change the " + f" column name in the DataFrame or change the 'duplicates.key' parameter." 
+ ) + + duplicated_rows = df.duplicated(subset=supported_columns, keep=False) + duplicated_rows = ( + df[duplicated_rows] + .groupby(supported_columns) + .size() + .reset_index(name=duplicates_key) + ) + + metrics["n_duplicates"] = len(duplicated_rows[duplicates_key]) + metrics["p_duplicates"] = metrics["n_duplicates"] / len(df) + + return ( + metrics, + duplicated_rows.nlargest(n_head, duplicates_key), + ) + else: + metrics["n_duplicates"] = 0 + metrics["p_duplicates"] = 0.0 + return metrics, None + else: + return metrics, None diff --git a/src/pandas_profiling/model/expectation_algorithms.py b/src/pandas_profiling/model/expectation_algorithms.py index 83e748a36..efac257b5 100644 --- a/src/pandas_profiling/model/expectation_algorithms.py +++ b/src/pandas_profiling/model/expectation_algorithms.py @@ -69,7 +69,10 @@ def path_expectations(name, summary, batch, *args): def datetime_expectations(name, summary, batch, *args): if any(k in summary for k in ["min", "max"]): batch.expect_column_values_to_be_between( - name, min_value=summary.get("min"), max_value=summary.get("max") + name, + min_value=summary.get("min"), + max_value=summary.get("max"), + parse_strings_as_datetimes=True, ) return name, summary, batch diff --git a/src/pandas_profiling/model/messages.py b/src/pandas_profiling/model/messages.py index 3330c049e..3e557cc63 100644 --- a/src/pandas_profiling/model/messages.py +++ b/src/pandas_profiling/model/messages.py @@ -56,6 +56,9 @@ class MessageType(Enum): UNIFORM = auto() """The variable is uniformly distributed""" + EMPTY = auto() + """The DataFrame is empty""" + class Message: """A message object (type, values, column).""" @@ -109,7 +112,7 @@ def check_table_messages(table: dict) -> List[Message]: A list of messages. """ messages = [] - if warning_value(table["n_duplicates"]): + if "n_duplicates" in table and warning_value(table["n_duplicates"]): messages.append( Message( message_type=MessageType.DUPLICATES, @@ -117,6 +120,14 @@ def check_table_messages(table: dict) -> List[Message]: fields={"n_duplicates"}, ) ) + if table["n"] == 0: + messages.append( + Message( + message_type=MessageType.EMPTY, + values=table, + fields={"n"}, + ) + ) return messages diff --git a/src/pandas_profiling/model/sample.py b/src/pandas_profiling/model/sample.py index 1df2acc78..50fac9397 100644 --- a/src/pandas_profiling/model/sample.py +++ b/src/pandas_profiling/model/sample.py @@ -1,3 +1,5 @@ +from typing import List + import attr import pandas as pd @@ -12,7 +14,7 @@ class Sample: caption = attr.ib(default=None) -def get_sample(df: pd.DataFrame) -> list: +def get_sample(df: pd.DataFrame) -> List[Sample]: """Obtains a sample from head and tail of the DataFrame Args: @@ -21,7 +23,10 @@ def get_sample(df: pd.DataFrame) -> list: Returns: a list of Sample objects """ - samples = [] + samples: List[Sample] = [] + if len(df) == 0: + return samples + n_head = config["samples"]["head"].get(int) if n_head > 0: samples.append(Sample("head", df.head(n=n_head), "First rows")) diff --git a/src/pandas_profiling/model/summary.py b/src/pandas_profiling/model/summary.py index a579275c1..ea14eae7e 100644 --- a/src/pandas_profiling/model/summary.py +++ b/src/pandas_profiling/model/summary.py @@ -4,7 +4,7 @@ import multiprocessing.pool import warnings from collections import Counter -from typing import Callable, Mapping, Optional, Tuple +from typing import Callable, Mapping, Tuple import numpy as np import pandas as pd @@ -16,7 +16,6 @@ check_variable_messages, ) from pandas_profiling.model.summarizer import 
BaseSummarizer -from pandas_profiling.model.typeset import Unsupported from pandas_profiling.visualisation.missing import ( missing_bar, missing_dendrogram, @@ -124,7 +123,7 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict: n = len(df) memory_size = df.memory_usage(deep=config["memory_deep"].get(bool)).sum() - record_size = float(memory_size) / n + record_size = float(memory_size) / n if n > 0 else 0 table_stats = { "n": n, @@ -143,21 +142,9 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict: if series_summary["n_missing"] == n: table_stats["n_vars_all_missing"] += 1 - table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / ( - table_stats["n"] * table_stats["n_var"] - ) - - supported_columns = [ - k for k, v in variable_stats.items() if v["type"] != Unsupported - ] - table_stats["n_duplicates"] = ( - sum(df.duplicated(subset=supported_columns)) - if len(supported_columns) > 0 - else 0 - ) - table_stats["p_duplicates"] = ( - (table_stats["n_duplicates"] / len(df)) - if (len(supported_columns) > 0 and len(df) > 0) + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"]) + if table_stats["n"] > 0 else 0 ) @@ -169,29 +156,6 @@ def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict: return table_stats -def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame]: - """Obtain the most occurring duplicate rows in the DataFrame. - - Args: - df: the Pandas DataFrame. - supported_columns: the columns to consider - - Returns: - A subset of the DataFrame, ordered by occurrence. - """ - n_head = config["duplicates"]["head"].get(int) - - if n_head > 0 and supported_columns: - return ( - df[df.duplicated(subset=supported_columns, keep=False)] - .groupby(supported_columns) - .size() - .reset_index(name="count") - .nlargest(n_head, "count") - ) - return None - - def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict: """Gets the rendered diagrams for missing values. @@ -203,6 +167,9 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict: A dictionary containing the base64 encoded plots for each diagram that is active in the config (matrix, bar, heatmap, dendrogram). """ + if len(df) == 0: + return {} + def warn_missing(missing_name, error): warnings.warn( f"""There was an attempt to generate the {missing_name} missing values diagrams, but this failed. 
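The `get_duplicates` refactor above changes the return value from a bare DataFrame to a `(metrics, duplicates)` pair, with the count column named by the new `duplicates.key` setting (`"# duplicates"` by default). A minimal usage sketch, mirroring the unit test added later in this diff (the toy column names are arbitrary):

```python
# Sketch of the new get_duplicates contract: it now returns a metrics dict
# plus the duplicate-row frame, whose count column is taken from the
# "duplicates.key" config entry ("# duplicates" by default).
import pandas as pd

from pandas_profiling.model.duplicates import get_duplicates

df = pd.DataFrame({"a": [1, 1, 2, 2, 3], "b": ["x", "x", "y", "y", "z"]})

metrics, duplicates = get_duplicates(df, list(df.columns))

# Two distinct rows occur more than once, out of five rows in total.
print(metrics)  # {'n_duplicates': 2, 'p_duplicates': 0.4}
if duplicates is not None:
    print(duplicates)  # columns: a, b, "# duplicates"
```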
diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index b5150228d..9a95f6be3 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -96,7 +96,7 @@ def describe_supported( stats = { "n_distinct": distinct_count, "p_distinct": distinct_count / count if count > 0 else 0, - "is_unique": unique_count == count, + "is_unique": unique_count == count and count > 0, "n_unique": unique_count, "p_unique": unique_count / count if count > 0 else 0, } @@ -120,7 +120,7 @@ def describe_generic(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict] summary.update( { "n": length, - "p_missing": summary["n_missing"] / length, + "p_missing": summary["n_missing"] / length if length > 0 else 0, "count": length - summary["n_missing"], "memory_size": series.memory_usage(deep=config["memory_deep"].get(bool)), } @@ -233,6 +233,16 @@ def describe_numeric_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, di stats["monotonic_decrease_strict"] = ( stats["monotonic_decrease"] and series.is_unique ) + if summary["monotonic_increase_strict"]: + stats["monotonic"] = 2 + elif summary["monotonic_decrease_strict"]: + stats["monotonic"] = -2 + elif summary["monotonic_increase"]: + stats["monotonic"] = 1 + elif summary["monotonic_decrease"]: + stats["monotonic"] = -1 + else: + stats["monotonic"] = 0 stats.update( histogram_compute( @@ -295,10 +305,16 @@ def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series # Only run if at least 1 non-missing value value_counts = summary["value_counts_without_nan"] + histogram_largest = config["vars"]["cat"]["histogram_largest"].get(int) + histogram_data = value_counts + if histogram_largest > 0: + histogram_data = histogram_data.nlargest(histogram_largest) summary.update( histogram_compute( - value_counts, summary["n_distinct"], name="histogram_frequencies" + histogram_data, + summary["n_distinct"], + name="histogram_frequencies", ) ) diff --git a/src/pandas_profiling/report/formatters.py b/src/pandas_profiling/report/formatters.py index 6a29e8bdd..558f4fa4e 100644 --- a/src/pandas_profiling/report/formatters.py +++ b/src/pandas_profiling/report/formatters.py @@ -78,7 +78,7 @@ def fmt_timespan(num_seconds, detailed=False, max_units=3): import math import numbers import re - from datetime import datetime, timedelta + from datetime import timedelta time_units = ( dict( @@ -206,8 +206,10 @@ def fmt_numeric(value: float, precision=10) -> str: fmtted = f"{{:.{precision}g}}".format(value) for v in ["e+", "e-"]: if v in fmtted: + sign = "-" if v in "e-" else "" fmtted = fmtted.replace(v, " × 10") + "" fmtted = fmtted.replace("0", "") + fmtted = fmtted.replace("", f"{sign}") return fmtted @@ -255,6 +257,21 @@ def fmt(value) -> str: return str(escape(value)) +def fmt_monotonic(value: int) -> str: + if value == 2: + return "Strictly increasing" + elif value == 1: + return "Increasing" + elif value == 0: + return "Not monotonic" + elif value == -1: + return "Decreasing" + elif value == -2: + return "Strictly decreasing" + else: + raise ValueError("Value should be integer ranging from -2 to 2.") + + def help(title, url=None) -> str: """Creat help badge @@ -281,6 +298,7 @@ def get_fmt_mapping() -> Dict[str, Callable]: "fmt_bytesize": fmt_bytesize, "fmt_timespan": fmt_timespan, "fmt_numeric": fmt_numeric, + "fmt_monotonic": fmt_monotonic, "fmt_number": fmt_number, "fmt_array": fmt_array, "fmt": fmt, diff --git 
a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html index 8820e1d50..59bb93c56 100644 --- a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html +++ b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_duplicates.html @@ -1 +1 @@ -Dataset has {{ message.values['n_duplicates'] }} ({{ message.values['p_duplicates'] | fmt_percent }}) duplicate rows \ No newline at end of file +Dataset has {{ message.values['n_duplicates'] }} ({{ message.values['p_duplicates'] | fmt_percent }}) duplicate rows \ No newline at end of file diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html new file mode 100644 index 000000000..a676c9577 --- /dev/null +++ b/src/pandas_profiling/report/presentation/flavours/html/templates/warnings/warning_empty.html @@ -0,0 +1 @@ +Dataset is empty diff --git a/src/pandas_profiling/report/presentation/flavours/widget/warnings.py b/src/pandas_profiling/report/presentation/flavours/widget/warnings.py index 97f830014..43b959f79 100644 --- a/src/pandas_profiling/report/presentation/flavours/widget/warnings.py +++ b/src/pandas_profiling/report/presentation/flavours/widget/warnings.py @@ -25,6 +25,7 @@ def render(self): "skewed": "info", "high_correlation": "", "duplicates": "", + "empty": "", } items = [] diff --git a/src/pandas_profiling/report/presentation/frequency_table_utils.py b/src/pandas_profiling/report/presentation/frequency_table_utils.py index bb53e1dae..0862a19b8 100644 --- a/src/pandas_profiling/report/presentation/frequency_table_utils.py +++ b/src/pandas_profiling/report/presentation/frequency_table_utils.py @@ -1,7 +1,9 @@ -from typing import Dict, Sequence +from typing import Any, Dict, List +import numpy as np -def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]: + +def freq_table(freqtable, n: int, max_number_to_print: int) -> List[Dict]: """Render the rows for a frequency table (value, count). Args: @@ -19,13 +21,13 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]: max_number_to_print = n if max_number_to_print < len(freqtable): - freq_other = sum(freqtable.iloc[max_number_to_print:]) + freq_other = np.sum(freqtable.iloc[max_number_to_print:]) min_freq = freqtable.values[max_number_to_print] else: freq_other = 0 min_freq = 0 - freq_missing = n - sum(freqtable) + freq_missing = n - np.sum(freqtable) # No values if len(freqtable) == 0: return [] @@ -79,39 +81,37 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> Sequence[Dict]: return rows -def extreme_obs_table(freqtable, number_to_print, n, ascending=True) -> list: +def extreme_obs_table(freqtable, number_to_print: int, n: int) -> List[Dict[str, Any]]: """Similar to the frequency table, for extreme observations. Args: - freqtable: The frequency table. + freqtable: The (sorted) frequency table. number_to_print: The number of observations to print. n: The total number of observations. - ascending: The ordering of the observations (Default value = True) Returns: The HTML rendering of the extreme observation table. """ + # If it's mixed between base types (str, int) convert to str. Pure "mixed" types are filtered during type # discovery # TODO: should be in cast? 
- if "mixed" in freqtable.index.inferred_type: - freqtable.index = freqtable.index.astype(str) - - sorted_freqtable = freqtable.sort_index(ascending=ascending) - obs_to_print = sorted_freqtable.iloc[:number_to_print] - max_freq = max(obs_to_print.values) - - rows = [] - for label, freq in obs_to_print.items(): - rows.append( - { - "label": label, - "width": freq / max_freq if max_freq != 0 else 0, - "count": freq, - "percentage": float(freq) / n, - "extra_class": "", - "n": n, - } - ) + # if "mixed" in freqtable.index.inferred_type: + # freqtable.index = freqtable.index.astype(str) + + obs_to_print = freqtable.iloc[:number_to_print] + max_freq = obs_to_print.max() + + rows = [ + { + "label": label, + "width": freq / max_freq if max_freq != 0 else 0, + "count": freq, + "percentage": float(freq) / n, + "extra_class": "", + "n": n, + } + for label, freq in obs_to_print.items() + ] return rows diff --git a/src/pandas_profiling/report/structure/overview.py b/src/pandas_profiling/report/structure/overview.py index e8751086a..2ab3bcb60 100644 --- a/src/pandas_profiling/report/structure/overview.py +++ b/src/pandas_profiling/report/structure/overview.py @@ -7,38 +7,46 @@ def get_dataset_overview(summary): - dataset_info = Table( + table_metrics = [ + { + "name": "Number of variables", + "value": summary["table"]["n_var"], + "fmt": "fmt_number", + }, + { + "name": "Number of observations", + "value": summary["table"]["n"], + "fmt": "fmt_number", + }, + { + "name": "Missing cells", + "value": summary["table"]["n_cells_missing"], + "fmt": "fmt_number", + }, + { + "name": "Missing cells (%)", + "value": summary["table"]["p_cells_missing"], + "fmt": "fmt_percent", + }, + ] + if "n_duplicates" in summary["table"]: + table_metrics.extend( + [ + { + "name": "Duplicate rows", + "value": summary["table"]["n_duplicates"], + "fmt": "fmt_number", + }, + { + "name": "Duplicate rows (%)", + "value": summary["table"]["p_duplicates"], + "fmt": "fmt_percent", + }, + ] + ) + + table_metrics.extend( [ - { - "name": "Number of variables", - "value": summary["table"]["n_var"], - "fmt": "fmt_number", - }, - { - "name": "Number of observations", - "value": summary["table"]["n"], - "fmt": "fmt_number", - }, - { - "name": "Missing cells", - "value": summary["table"]["n_cells_missing"], - "fmt": "fmt_number", - }, - { - "name": "Missing cells (%)", - "value": summary["table"]["p_cells_missing"], - "fmt": "fmt_percent", - }, - { - "name": "Duplicate rows", - "value": summary["table"]["n_duplicates"], - "fmt": "fmt_number", - }, - { - "name": "Duplicate rows (%)", - "value": summary["table"]["p_duplicates"], - "fmt": "fmt_percent", - }, { "name": "Total size in memory", "value": summary["table"]["memory_size"], @@ -49,7 +57,11 @@ def get_dataset_overview(summary): "value": summary["table"]["record_size"], "fmt": "fmt_bytesize", }, - ], + ] + ) + + dataset_info = Table( + table_metrics, name="Dataset statistics", ) diff --git a/src/pandas_profiling/report/structure/variables/render_common.py b/src/pandas_profiling/report/structure/variables/render_common.py index 426f258b1..e55d29536 100644 --- a/src/pandas_profiling/report/structure/variables/render_common.py +++ b/src/pandas_profiling/report/structure/variables/render_common.py @@ -9,6 +9,8 @@ def render_common(summary): n_extreme_obs = config["n_extreme_obs"].get(int) n_freq_table_max = config["n_freq_table_max"].get(int) + sorted_freqtable = summary["value_counts_without_nan"].sort_index(ascending=True) + template_variables = { # TODO: with nan "freq_table_rows": 
freq_table( @@ -17,16 +19,14 @@ def render_common(summary): max_number_to_print=n_freq_table_max, ), "firstn_expanded": extreme_obs_table( - freqtable=summary["value_counts_without_nan"], + freqtable=sorted_freqtable, number_to_print=n_extreme_obs, n=summary["n"], - ascending=True, ), "lastn_expanded": extreme_obs_table( - freqtable=summary["value_counts_without_nan"], + freqtable=sorted_freqtable[::-1], number_to_print=n_extreme_obs, n=summary["n"], - ascending=False, ), } diff --git a/src/pandas_profiling/report/structure/variables/render_real.py b/src/pandas_profiling/report/structure/variables/render_real.py index e7ce82412..6624548d5 100644 --- a/src/pandas_profiling/report/structure/variables/render_real.py +++ b/src/pandas_profiling/report/structure/variables/render_real.py @@ -152,17 +152,6 @@ def render_real(summary): name="Quantile statistics", ) - if summary["monotonic_increase_strict"]: - monotocity = "Strictly increasing" - elif summary["monotonic_decrease_strict"]: - monotocity = "Strictly decreasing" - elif summary["monotonic_increase"]: - monotocity = "Increasing" - elif summary["monotonic_decrease"]: - monotocity = "Decreasing" - else: - monotocity = "Not monotonic" - descriptive_statistics = Table( [ { @@ -190,7 +179,11 @@ def render_real(summary): }, {"name": "Sum", "value": summary["sum"], "fmt": "fmt_numeric"}, {"name": "Variance", "value": summary["variance"], "fmt": "fmt_numeric"}, - {"name": "Monotocity", "value": monotocity, "fmt": "fmt"}, + { + "name": "Monotonicity", + "value": summary["monotonic"], + "fmt": "fmt_monotonic", + }, ], name="Descriptive statistics", ) diff --git a/src/pandas_profiling/utils/cache.py b/src/pandas_profiling/utils/cache.py index 1699b2c22..356d6fea8 100644 --- a/src/pandas_profiling/utils/cache.py +++ b/src/pandas_profiling/utils/cache.py @@ -1,4 +1,5 @@ """Dataset cache utility functions""" +import zipfile from pathlib import Path import requests @@ -20,9 +21,44 @@ def cache_file(file_name: str, url: str) -> Path: data_path = get_data_path() data_path.mkdir(exist_ok=True) + file_path = data_path / file_name + + # If not exists, download and create file + if not file_path.exists(): + response = requests.get(url) + file_path.write_bytes(response.content) + + return file_path + + +def cache_zipped_file(file_name: str, url: str) -> Path: + """Check if file_name already is in the data path, otherwise download it from url. 
+ + Args: + file_name: the file name + url: the URL of the dataset + + Returns: + The relative path to the dataset + """ + + data_path = get_data_path() + data_path.mkdir(exist_ok=True) + + file_path = data_path / file_name + # If not exists, download and create file - if not (data_path / file_name).exists(): - data = requests.get(url) - (data_path / file_name).write_bytes(data.content) + if not file_path.exists(): + response = requests.get(url) + if response.status_code != 200: + raise FileNotFoundError("Could not download resource") + + tmp_path = data_path / "tmp.zip" + tmp_path.write_bytes(response.content) + + with zipfile.ZipFile(tmp_path, "r") as zip_file: + zip_file.extract(file_path.name, data_path) + + tmp_path.unlink() - return data_path / file_name + return file_path diff --git a/tests/benchmarks/bench.py b/tests/benchmarks/bench.py new file mode 100644 index 000000000..a1db2a0b0 --- /dev/null +++ b/tests/benchmarks/bench.py @@ -0,0 +1,59 @@ +from functools import partial + +import pandas as pd + +from pandas_profiling import ProfileReport +from pandas_profiling.utils.cache import cache_file + + +def func(df, **kwargs): + profile = ProfileReport(df, progress_bar=False, **kwargs) + report = profile.to_html() + return report + + +def test_titanic_explorative(benchmark): + file_name = cache_file( + "titanic.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet", + ) + + data = pd.read_parquet(file_name) + + kwargs = dict(explorative=True) + benchmark(partial(func, **kwargs), data) + + +def test_titanic_default(benchmark): + file_name = cache_file( + "titanic.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet", + ) + + data = pd.read_parquet(file_name) + + benchmark(partial(func), data) + + +def test_titanic_minimal(benchmark): + file_name = cache_file( + "titanic.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet", + ) + + data = pd.read_parquet(file_name) + + kwargs = dict(minimal=True) + benchmark(partial(func, **kwargs), data) + + +def test_rdw_minimal(benchmark): + file_name = cache_file( + "rdw_sample_100k.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw_sample_100k.parquet", + ) + + data = pd.read_parquet(file_name) + + kwargs = dict(minimal=True) + benchmark(partial(func, **kwargs), data) diff --git a/tests/issues/test_issue377.py b/tests/issues/test_issue377.py index 2ffa39a92..3362e812e 100644 --- a/tests/issues/test_issue377.py +++ b/tests/issues/test_issue377.py @@ -6,25 +6,35 @@ import pandas as pd import pytest +import requests -import pandas_profiling -from pandas_profiling.utils.cache import cache_file +from pandas_profiling import ProfileReport +from pandas_profiling.utils.cache import cache_zipped_file -@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") -def test_issue377(): - file_name = cache_file( - "bank-full.csv", - "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv", - ) +@pytest.fixture() +def df(): + try: + file_name = cache_zipped_file( + "bank-full.csv", + "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip", + ) + except (requests.exceptions.ConnectionError, FileNotFoundError): + return # Download the UCI Bank Marketing Dataset df = pd.read_csv(file_name, sep=";") + return df + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def 
test_issue377(df): + if df is None: + pytest.skip("dataset unavailable") + return original_order = tuple(df.columns.values) - profile = pandas_profiling.ProfileReport( - df, sort="None", pool_size=1, progress_bar=False - ) + profile = ProfileReport(df, sort="None", pool_size=1, progress_bar=False) new_order = tuple(profile.get_description()["variables"].keys()) assert original_order == new_order diff --git a/tests/issues/test_issue51.py b/tests/issues/test_issue51.py index 50617ca81..71815f23e 100644 --- a/tests/issues/test_issue51.py +++ b/tests/issues/test_issue51.py @@ -7,9 +7,6 @@ import pandas_profiling -# FIXME: correlations can be computed stand alone to speed up tests -from pandas_profiling.config import config - def test_issue51(get_data_file): # Categorical has empty ('') value diff --git a/tests/performance/time_inf.py b/tests/performance/time_inf.py deleted file mode 100644 index ba2aecaa4..000000000 --- a/tests/performance/time_inf.py +++ /dev/null @@ -1,25 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(10000) -series = pd.Series(vals) -series[series < 0.3] = np.nan -series[series < 0.2] = np.Inf - - - -def f1(series): - return len(series.loc[(~np.isfinite(series)) & series.notnull()]) - - -def f2(series): - return ((series == np.inf) | (series == -np.inf)).sum() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_kurtosis.py b/tests/performance/time_kurtosis.py deleted file mode 100644 index dfa106272..000000000 --- a/tests/performance/time_kurtosis.py +++ /dev/null @@ -1,36 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd -import scipy.stats - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - -def f1(series): - arr = series.values - return scipy.stats.kurtosis(arr, bias=False, nan_policy='omit') - - -def f2(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return scipy.stats.kurtosis(arr_without_nan, bias=False) - - -def f3(series): - return series.kurtosis() - - -def f4(series): - return series[series.notna()].kurtosis() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) -print(timeit.timeit("f3(series)", number=10, setup=testcode)) -print(timeit.timeit("f4(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_mad.py b/tests/performance/time_mad.py deleted file mode 100644 index 8c6107614..000000000 --- a/tests/performance/time_mad.py +++ /dev/null @@ -1,56 +0,0 @@ -import timeit - -testcode = ''' -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - - -def mad(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. - https://en.wikipedia.org/wiki/Median_absolute_deviation - """ - arr = np.ma.array(arr).compressed() # should be faster to not use masked arrays. - med = np.median(arr) - return np.median(np.abs(arr - med)) - - -def mad2(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. 
- https://en.wikipedia.org/wiki/Median_absolute_deviation - """ - med = np.median(arr) - return np.median(np.abs(arr - med)) - - -def f1(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return mad(arr_without_nan) - - -def f2(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return mad(arr_without_nan) - - -def f3(series): - return series.mad() - - -def f4(series): - return series[series.notna()].mad() -''' - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) -print(timeit.timeit("f3(series)", number=10, setup=testcode)) -print(timeit.timeit("f4(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_mean.py b/tests/performance/time_mean.py deleted file mode 100644 index f6149a4c0..000000000 --- a/tests/performance/time_mean.py +++ /dev/null @@ -1,36 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - - -def f1(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return np.mean(arr_without_nan) - - -def f2(series): - arr = series.values - return np.nanmean(arr) - - -def f3(series): - return series.mean() - - -def f4(series): - return series[series.notna()].mean() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) -print(timeit.timeit("f3(series)", number=10, setup=testcode)) -print(timeit.timeit("f4(series)", number=10, setup=testcode)) diff --git a/tests/performance/timings.py b/tests/performance/timings.py deleted file mode 100644 index acde9360d..000000000 --- a/tests/performance/timings.py +++ /dev/null @@ -1,113 +0,0 @@ -import timeit -from itertools import product -from string import ascii_lowercase - -import numpy as np -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt - -from pandas_profiling import ProfileReport - - -def generate_column_names(n): - column_names = [] - iters = 1 - while len(column_names) < n: - column_names += list( - "".join(combo) for combo in product(ascii_lowercase, repeat=iters) - ) - iters += 1 - return column_names - - -def make_sample_data(cols, rows): - column_names = generate_column_names(cols) - - df = pd.DataFrame( - np.random.randint(0, 1000000, size=(rows, cols)), columns=column_names[0:cols] - ) - df = df.astype(str) - - assert df.shape == (rows, cols) - return df.copy() - - -def make_report_minimal(df): - report = ProfileReport( - df, - minimal=True, - pool_size=0, - sort="None", - title="Dataset with Numeric Categories", - ) - html = report.to_html() - assert type(html) == str and '

Dataset info
' in html - - -def make_report(df): - report = ProfileReport( - df, - minimal=False, - pool_size=0, - sort="None", - title="Dataset with Numeric Categories", - ) - html = report.to_html() - assert type(html) == str and '

Dataset info
' in html - - -def wrap_func(function): - def inner(df): - def double_inner(): - return function(df) - - return double_inner - - return inner - - -def time_report(func, cols, rows, runs=5): - df = make_sample_data(cols, rows) - print(df.shape) - test = wrap_func(func)(df.copy()) - return timeit.timeit(test, number=runs) / runs - - -def plot_col_run_time(): - cols = [2, 4, 10, 50] - row = 1000 - default_times = [time_report(make_report, col, row) for col in cols] - minimal_times = [time_report(make_report_minimal, col, row) for col in cols] - - ax1 = sns.scatterplot(cols, default_times) - ax2 = sns.scatterplot(cols, minimal_times) - _ = ax1.set( - xlabel=f"Number of columns (row={row})", - ylabel="time (s)", - title="Run Time Complexity", - ) - plt.show() - - -def plot_row_run_time(): - # 10, 100 - # https://github.com/pandas-profiling/pandas-profiling/issues/270 - rows = [1000, 10000, 100000] - col = 10 - default_times = [time_report(make_report, col, row) for row in rows] - minimal_times = [time_report(make_report_minimal, col, row) for row in rows] - - ax1 = sns.scatterplot(rows, default_times) - ax2 = sns.scatterplot(rows, minimal_times) - _ = ax1.set( - xlabel=f"Number of rows (col={col})", - ylabel="time (s)", - title="Run Time Complexity", - ) - plt.show() - - -if __name__ == "__main__": - plot_col_run_time() - plot_row_run_time() diff --git a/tests/unit/test_custom_sample.py b/tests/unit/test_custom_sample.py index 81a0bd551..4aab90280 100644 --- a/tests/unit/test_custom_sample.py +++ b/tests/unit/test_custom_sample.py @@ -1,5 +1,3 @@ -from pathlib import Path - import pandas as pd from pandas_profiling import ProfileReport diff --git a/tests/unit/test_decorator.py b/tests/unit/test_decorator.py index 25d0a58bc..57f3dc5e0 100644 --- a/tests/unit/test_decorator.py +++ b/tests/unit/test_decorator.py @@ -1,5 +1,4 @@ import pandas as pd -import pytest import pandas_profiling @@ -16,11 +15,3 @@ def test_decorator(get_data_file): missing_diagrams={"heatmap": False, "dendrogram": False}, ) assert "Coursera Test Report" in report.to_html(), "Title is not found" - - -def test_empty_decorator(): - df = pd.DataFrame().profile_report(progress_bar=False) - with pytest.raises(ValueError) as e: - df.get_description() - - assert e.value.args[0] == "df can not be empty" diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 1d6589df3..cbec917f8 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -571,12 +571,6 @@ def test_describe_df(column, describe_data, expected_results, summarizer, typese ), f"Histogram missing for column {column}" -def test_describe_empty(summarizer, typeset): - empty_frame = pd.DataFrame() - with pytest.raises(ValueError): - describe("", empty_frame, summarizer, typeset) - - def test_describe_list(summarizer, typeset): with pytest.raises(AttributeError): with pytest.warns(UserWarning): diff --git a/tests/unit/test_duplicates.py b/tests/unit/test_duplicates.py new file mode 100644 index 000000000..b3043edce --- /dev/null +++ b/tests/unit/test_duplicates.py @@ -0,0 +1,30 @@ +"""Test for the duplicates functionality""" +import numpy as np +import pandas as pd +import pytest + +from pandas_profiling.model.duplicates import get_duplicates + + +@pytest.fixture(scope="module") +def test_data(): + np.random.seed(5) + df = pd.DataFrame( + np.random.randint(1, 100, (100, 5)), + columns=["a", "b", "c", "duplicates", "count"], + ) + df = pd.concat([df, df], axis=0) + return df + + +def test_issue725(test_data): + metrics, duplicates = 
get_duplicates(test_data, list(test_data.columns)) + assert metrics["n_duplicates"] == 100 + assert metrics["p_duplicates"] == 0.5 + assert set(duplicates.columns) == set(test_data.columns).union({"# duplicates"}) + + +def test_issue725_existing(test_data): + test_data = test_data.rename(columns={"count": "# duplicates"}) + with pytest.raises(ValueError): + _, _ = get_duplicates(test_data, list(test_data.columns)) diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py index cbb72a6ee..8b1487543 100644 --- a/tests/unit/test_example.py +++ b/tests/unit/test_example.py @@ -50,3 +50,16 @@ def test_example(get_data_file, test_output_dir): and len(profile.get_description().items()) == 10 ), "Unexpected result" assert "12" in profile.to_html() + + +def test_example_empty(): + df = pd.DataFrame({"A": [], "B": []}) + profile = ProfileReport(df) + description = profile.get_description() + + assert len(description["correlations"]) == 0 + assert len(description["missing"]) == 0 + assert len(description["sample"]) == 0 + + html = profile.to_html() + assert "Dataset is empty" in html diff --git a/tests/unit/test_formatters.py b/tests/unit/test_formatters.py index 09711cd47..4f6f46faf 100644 --- a/tests/unit/test_formatters.py +++ b/tests/unit/test_formatters.py @@ -6,6 +6,7 @@ fmt_bytesize, fmt_class, fmt_color, + fmt_monotonic, fmt_numeric, ) @@ -79,7 +80,36 @@ def test_fmt_array(array, threshold, expected): (81.000000, 10, "81"), (81, 10, "81"), (81.999861123123123123, 10, "81.99986112"), + (1e20, 10, "1 × 1020"), + (1e-20, 10, "1 × 10-20"), + (1e8, 3, "1 × 108"), ], ) def test_fmt_numeric(value, precision, expected): assert fmt_numeric(value, precision) == expected + + +@pytest.mark.parametrize( + "value, expected", + [ + (-2, "Strictly decreasing"), + (-1, "Decreasing"), + (0, "Not monotonic"), + (1, "Increasing"), + (2, "Strictly increasing"), + ], +) +def test_fmt_monotonic(value, expected): + assert fmt_monotonic(value) == expected + + +@pytest.mark.parametrize( + "value", + [ + -3, + 3, + ], +) +def test_fmt_monotonic_err(value): + with pytest.raises(ValueError): + fmt_monotonic(value) diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py index 40f3850ca..443f2f1aa 100644 --- a/tests/unit/test_ge_integration_expectations.py +++ b/tests/unit/test_ge_integration_expectations.py @@ -123,6 +123,7 @@ def test_datetime_expectations(batch): "column", min_value=0, max_value=100, + parse_strings_as_datetimes=True, ) diff --git a/tests/unit/test_interactions.py b/tests/unit/test_interactions.py index ac658bb99..e02dac497 100644 --- a/tests/unit/test_interactions.py +++ b/tests/unit/test_interactions.py @@ -1,5 +1,3 @@ -from pathlib import Path - import numpy as np import pandas as pd diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py new file mode 100644 index 000000000..f44de68b0 --- /dev/null +++ b/tests/unit/test_summary.py @@ -0,0 +1,10 @@ +import pandas as pd + +from pandas_profiling.model.summary import get_table_stats + + +def test_get_table_stats_empty_df(): + df = pd.DataFrame({"A": [], "B": []}) + table_stats = get_table_stats(df, {}) + assert table_stats["n"] == 0 + assert table_stats["p_cells_missing"] == 0 diff --git a/tests/unit/test_summary_algos.py b/tests/unit/test_summary_algos.py index 98460bec6..ec5846670 100644 --- a/tests/unit/test_summary_algos.py +++ b/tests/unit/test_summary_algos.py @@ -1,7 +1,12 @@ import numpy as np import pandas as pd +import pytest -from 
pandas_profiling.model.summary_algorithms import describe_counts +from pandas_profiling.model.summary_algorithms import ( + describe_counts, + describe_generic, + describe_supported, +) def test_count_summary_sorted(): @@ -24,3 +29,25 @@ def test_count_summary_category(): ) sn, r = describe_counts(s, {}) assert len(r["value_counts_without_nan"].index) == 2 + + +@pytest.fixture(scope="class") +def empty_data() -> pd.DataFrame: + return pd.DataFrame({"A": []}) + + +def test_summary_supported_empty_df(empty_data): + series, summary = describe_counts(empty_data["A"], {}) + assert summary["n_missing"] == 0 + assert "p_missing" not in summary + + series, summary = describe_generic(series, summary) + assert summary["n_missing"] == 0 + assert summary["p_missing"] == 0 + assert summary["count"] == 0 + + _, summary = describe_supported(series, summary) + assert summary["n_distinct"] == 0 + assert summary["p_distinct"] == 0 + assert summary["n_unique"] == 0 + assert not summary["is_unique"]
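Finally, a short end-to-end sketch of the empty-DataFrame behaviour these tests pin down: profiling an empty frame no longer raises, correlations, missing-value diagrams and samples are skipped, and the report carries the new "Dataset is empty" warning. It uses only `ProfileReport` options already exercised elsewhere in this diff.

```python
# Sketch patterned on tests/unit/test_example.py::test_example_empty above:
# an empty DataFrame now produces a report instead of raising ValueError.
import pandas as pd

from pandas_profiling import ProfileReport

df = pd.DataFrame({"A": [], "B": []})

profile = ProfileReport(df, title="Empty dataset", progress_bar=False)
description = profile.get_description()

# Expensive sections are simply skipped for empty input.
assert len(description["correlations"]) == 0
assert len(description["missing"]) == 0
assert len(description["sample"]) == 0

# The rendered report carries the new EMPTY warning.
assert "Dataset is empty" in profile.to_html()
```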