From 0427602f4cf26b196f418f537cf681d273ea497d Mon Sep 17 00:00:00 2001 From: Daniel Obraczka Date: Thu, 11 Apr 2024 11:17:03 +0200 Subject: [PATCH] Fall back to existing ClusterHelper file if no parquet file is present (#41) * Use fallback for cache loading if cluster helper file exists * Use constant for column names --- sylloge/base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sylloge/base.py b/sylloge/base.py index 3c2c441..b6c2c90 100644 --- a/sylloge/base.py +++ b/sylloge/base.py @@ -310,6 +310,16 @@ def _read_ch_or_df_links( raise ValueError( "Need to supply read_parquet_fn if not using ClusterHelper" ) + parquet_path = f"{path}_parquet" + if not os.path.exists(parquet_path) and os.path.exists(path): + logger.info( + f"Did not find {parquet_path}, but ClusterHelper file. Creating parquet file from ClusterHelper (no intra-dataset links are used!)" + ) + assert ds_prefixes is not None + ch = PrefixedClusterHelper.from_file(path, ds_prefixes=ds_prefixes) # type: ignore[return-value] + pd.DataFrame(list(ch.all_pairs_no_intra()), columns=EA_SIDES).to_parquet( + parquet_path + ) return read_parquet_fn(f"{path}_parquet", **kwargs) # type: ignore[return-value] @classmethod