From e4983c51ea52c46eaec444a9f1b6ca7a5f6c11cd Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Fri, 7 Oct 2022 20:33:07 +0300
Subject: [PATCH] fix: drop joblib dependency (#1090)

Joblib was only used to hash DataFrames via `joblib.hash`; pandas' own `hash_pandas_object` can do that instead.

Tangentially refs #1056
---
 requirements.txt                        |  3 +--
 src/pandas_profiling/utils/dataframe.py | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 08c18ff8b..493bf006e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
-joblib~=1.2.0 # 1.1.0
-scipy>=1.4.1, <1.10 
+scipy>=1.4.1, <1.10
 pandas>1.1, <1.6, !=1.4.0
 matplotlib>=3.2, <3.6
 pydantic>=1.8.1, <1.11
diff --git a/src/pandas_profiling/utils/dataframe.py b/src/pandas_profiling/utils/dataframe.py
index 88d216674..bf6ad3473 100644
--- a/src/pandas_profiling/utils/dataframe.py
+++ b/src/pandas_profiling/utils/dataframe.py
@@ -1,12 +1,13 @@
 """Utils for pandas DataFrames."""
+import hashlib
 import re
 import unicodedata
 import warnings
 from pathlib import Path
 from typing import Any, Optional
 
-import joblib
 import pandas as pd
+from pandas.core.util.hashing import hash_pandas_object
 
 
 def warn_read(extension: str) -> None:
@@ -180,8 +181,12 @@ def expand_mixed(df: pd.DataFrame, types: Any = None) -> pd.DataFrame:
     return df
 
 
+# Change this if `hash_dataframe`'s implementation changes.
+HASH_PREFIX = "2@"
+
+
 def hash_dataframe(df: pd.DataFrame) -> str:
-    """Hash a DataFrame (wrapper around joblib.hash, might change in the future)
+    """Hash a DataFrame (implementation might change in the future)
 
     Args:
         df: the DataFrame
@@ -189,7 +194,13 @@ def hash_dataframe(df: pd.DataFrame) -> str:
     Returns:
         The DataFrame's hash
     """
-    return joblib.hash(df)
+    # hash_pandas_object returns a series of uint64s. Using their
+    # binary representation would be more efficient, but it's not
+    # necessarily portable across architectures. Using the human-readable
+    # string values should be good enough.
+    hash_values = "\n".join(hash_pandas_object(df).values.astype(str))
+    digest = hashlib.sha256(hash_values.encode("utf-8")).hexdigest()
+    return f"{HASH_PREFIX}{digest}"
 
 
 def slugify(value: str, allow_unicode: bool = False) -> str: