Merge branch 'develop' into fix/spark_column_conflict

ydataai · May 6, 2024 · 2d7f8bb · 2d7f8bb
2 parents 925bcee + 12376b3
commit 2d7f8bb
Show file tree

Hide file tree

Showing 13 changed files with 58 additions and 33 deletions.
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
@@ -16,7 +16,7 @@ jobs:
       with:
         fetch-depth: 0
 
-    - uses: wagoid/commitlint-github-action@v5
+    - uses: wagoid/commitlint-github-action@v6
 
   lint:
     if: github.actor != 'renovate[bot]'

diff --git a/.github/workflows/sonarqube.yaml b/.github/workflows/sonarqube.yaml
@@ -25,7 +25,7 @@ jobs:
         echo "sonar.projectKey=${{ github.event.repository.name }}" > sonar-project.properties
 
     - name: SonarQube Scan
-      uses: sonarsource/sonarqube-scan-action@v2.0.1
+      uses: sonarsource/sonarqube-scan-action@v2.0.2
       env:
         SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
         SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
diff --git a/README.md b/README.md
@@ -220,7 +220,7 @@ pip install -e .
 
 The profiling report is written in HTML and CSS, which means a modern browser is required. 
 
-You need [Python 3](https://python3statement.org/) to run the package. Other dependencies can be found in the requirements files:
+You need [Python 3](https://python3statement.github.io/) to run the package. Other dependencies can be found in the requirements files:
 
 | Filename | Requirements|
 |----------|-------------|

diff --git a/docs/index.md b/docs/index.md
@@ -7,18 +7,12 @@ YData-profiling is a leading tool in the data understanding step of the data sci
 complete with statistics and visualizations. The significance of the package lies in how it streamlines the process of
 understanding and preparing data for analysis in a single line of code! If you're ready to get started see the [quickstart](getting-started/quickstart.md)!
 
-!!! tip "Advent of Code - Get featured on ydata-profiling"
-
-    *“I want to get into open source, but I don’t know how.”* - Does this sound familiar to you? Have you been wanting to
-    get more involved with open-source software, but no one’s given you an entry point?
+!!! tip "Profiling at scale and for databases"
 
-    That's why we joined [The Advent of code this year](https://zilliz.com/advent-of-code). Contribute to ydata-profiling and win some 🐼🐼 swag!
-
-    How can you be part of it?
+    Take your data profiling to the next level - try ydata-profiling at scale and for databases! 
 
-    - Give us some love with a Github ⭐
-    - Write an article or create a tutorial like other [members the community already did.](https://medium.com/@seckindinc/data-profiling-with-python-36497d3a1261)
-    - Feeling adventurous? Contribute with a PR. We have a list of [great issues to get you started.](https://github.com/ydataai/ydata-profiling/issues?q=label%3A%22getting+started+%E2%98%9D%22+)
+    Experience enterprise-level scalability and database support while enjoying the familiar open-source features you love. 
+    Dive into large datasets with ease and ensure data quality like never before. Try the [YData Fabric community version](https://ydata.ai/register)! 
 
 ![ydata-profiling report](_static/img/ydata-profiling.gif)
 

diff --git a/requirements-docs.txt b/requirements-docs.txt
@@ -2,6 +2,6 @@ mkdocs>=1.4.2,<1.5.2
 mkdocs-material>=9.0.12,<10.0.0
 mkdocs-material-extensions>=1.1.1,<2.0.0
 mkdocs-table-reader-plugin<=2.0.1
-mike>=2.0.0,<2.1.0
+mike>=2.1.1,<2.2.0
 mkdocstrings[python]>=0.20.0,<1.0.0
 mkdocs-badges
diff --git a/requirements.txt b/requirements.txt
@@ -15,7 +15,7 @@ phik>=0.11.1,<0.13
 requests>=2.24.0, <3
 # Progress bar
 tqdm>=4.48.2, <5
-seaborn>=0.10.1, <0.13
+seaborn>=0.10.1, <0.14
 multimethod>=1.4, <2
 # metrics
 statsmodels>=0.13.2, <1

diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py
@@ -1,5 +1,6 @@
 from typing import Tuple
 
+import numpy as np
 import pandas as pd
 
 from ydata_profiling.config import Settings
@@ -26,9 +27,17 @@ def pandas_describe_boolean_1d(
         A dict containing calculated series description values.
     """
 
-    value_counts = summary["value_counts_without_nan"]
-    summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
-
-    summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
+    value_counts: pd.Series = summary["value_counts_without_nan"]
+    if not value_counts.empty:
+        summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
+        summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
+    else:
+        summary.update(
+            {
+                "top": np.nan,
+                "freq": 0,
+                "imbalance": 0,
+            }
+        )
 
     return config, series, summary
diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -195,10 +195,14 @@ def length_summary_vc(vc: pd.Series) -> dict:
 
     summary = {
         "max_length": np.max(length_counts.index),
-        "mean_length": np.average(length_counts.index, weights=length_counts.values),
+        "mean_length": np.average(length_counts.index, weights=length_counts.values)
+        if not length_counts.empty
+        else np.nan,
         "median_length": weighted_median(
             length_counts.index.values, weights=length_counts.values
-        ),
+        )
+        if not length_counts.empty
+        else np.nan,
         "min_length": np.min(length_counts.index),
         "length_histogram": length_counts,
     }

diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py
@@ -29,16 +29,26 @@ def pandas_describe_date_1d(
     Returns:
         A dict containing calculated series description values.
     """
-    summary.update(
-        {
-            "min": pd.Timestamp.to_pydatetime(series.min()),
-            "max": pd.Timestamp.to_pydatetime(series.max()),
-        }
-    )
-
-    summary["range"] = summary["max"] - summary["min"]
-
-    values = series.values.astype(np.int64) // 10**9
+    if summary["value_counts_without_nan"].empty:
+        values = series.values
+        summary.update(
+            {
+                "min": pd.NaT,
+                "max": pd.NaT,
+                "range": 0,
+            }
+        )
+    else:
+        summary.update(
+            {
+                "min": pd.Timestamp.to_pydatetime(series.min()),
+                "max": pd.Timestamp.to_pydatetime(series.max()),
+            }
+        )
+
+        summary["range"] = summary["max"] - summary["min"]
+
+        values = series.values.astype(np.int64) // 10**9
 
     if config.vars.num.chi_squared_threshold > 0.0:
         summary["chi_squared"] = chi_square(values)

diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
@@ -34,6 +34,8 @@ def histogram_compute(
     weights: Optional[np.ndarray] = None,
 ) -> dict:
     stats = {}
+    if len(finite_values) == 0:
+        return {name: []}
     hist_config = config.plot.histogram
     bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
     bins = np.histogram_bin_edges(finite_values, bins=bins_arg)
@@ -54,6 +56,8 @@ def chi_square(
     if histogram is None:
         bins = np.histogram_bin_edges(values, bins="auto")
         histogram, _ = np.histogram(values, bins=bins)
+    if len(histogram) == 0 or np.sum(histogram) == 0:
+        return {"statistic": 0, "pvalue": 0}
     return dict(chisquare(histogram)._asdict())
 
 

diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html
@@ -1 +1 @@
-<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has constant value "{{ alert.values['mode'] }}"
+<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has constant value "{{ alert.values['value_counts_without_nan'].index[0] }}"
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/sections.html b/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/sections.html
@@ -5,6 +5,9 @@
             <div class="row header">
                 <a class="anchor-pos" id="{{ section.content['anchor_id'] }}"></a>
                 <h1 class="page-header">{{ section.content['name'] }}</h1>
+                {% if section.content['name'] == 'Overview' %}
+                <p class="text-muted text-right">Brought to you by <a href="https://ydata.ai/?utm_source=opensource&utm_medium=ydataprofiling&utm_campaign=report">YData</a></p>
+                {% endif %}
             </div>
             <div class="section-items">
                 {{ html }}

diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py
@@ -103,13 +103,14 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
         )
 
     # Bottom
+    n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
     bottom = Container(
         [
             Image(
                 hist_data,
                 image_format=image_format,
                 alt="Histogram",
-                caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
+                caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
                 name="Histogram",
                 anchor_id=f"{varid}histogram",
             )
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has constant value "{{ alert.values['mode'] }}"
		<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has constant value "{{ alert.values['value_counts_without_nan'].index[0] }}"