From e4983c51ea52c46eaec444a9f1b6ca7a5f6c11cd Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 7 Oct 2022 20:33:07 +0300 Subject: [PATCH] fix: drop joblib dependency (#1090) Joblib was only used for `joblib.hash` for dataframes, but there's `hash_pandas_object` for that. Tangentially refs #1056 --- requirements.txt | 3 +-- src/pandas_profiling/utils/dataframe.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 08c18ff8b..493bf006e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -joblib~=1.2.0 # 1.1.0 -scipy>=1.4.1, <1.10 +scipy>=1.4.1, <1.10 pandas>1.1, <1.6, !=1.4.0 matplotlib>=3.2, <3.6 pydantic>=1.8.1, <1.11 diff --git a/src/pandas_profiling/utils/dataframe.py b/src/pandas_profiling/utils/dataframe.py index 88d216674..bf6ad3473 100644 --- a/src/pandas_profiling/utils/dataframe.py +++ b/src/pandas_profiling/utils/dataframe.py @@ -1,12 +1,13 @@ """Utils for pandas DataFrames.""" +import hashlib import re import unicodedata import warnings from pathlib import Path from typing import Any, Optional -import joblib import pandas as pd +from pandas.core.util.hashing import hash_pandas_object def warn_read(extension: str) -> None: @@ -180,8 +181,12 @@ def expand_mixed(df: pd.DataFrame, types: Any = None) -> pd.DataFrame: return df +# Change this if `hash_dataframe`'s implementation changes. +HASH_PREFIX = "2@" + + def hash_dataframe(df: pd.DataFrame) -> str: - """Hash a DataFrame (wrapper around joblib.hash, might change in the future) + """Hash a DataFrame (implementation might change in the future) Args: df: the DataFrame @@ -189,7 +194,13 @@ def hash_dataframe(df: pd.DataFrame) -> str: Returns: The DataFrame's hash """ - return joblib.hash(df) + # hash_pandas_object returns a series of uint64s. Using their + # binary representation would be more efficient, but it's not + # necessarily portable across architectures. Using the human-readable + # string values should be good enough. + hash_values = "\n".join(hash_pandas_object(df).values.astype(str)) + digest = hashlib.sha256(hash_values.encode("utf-8")).hexdigest() + return f"{HASH_PREFIX}{digest}" def slugify(value: str, allow_unicode: bool = False) -> str: