From f738a3a84d7dd4e19f0a51f0ff3e2556f407d05f Mon Sep 17 00:00:00 2001 From: Dhruv Anand <105786647+dhruv-anand-aintech@users.noreply.github.com> Date: Thu, 16 May 2024 13:41:51 +0530 Subject: [PATCH] support json, jsonl, xlsx input files --- latentscope/scripts/ingest.py | 10 ++++++---- web/src/components/Home.jsx | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/latentscope/scripts/ingest.py b/latentscope/scripts/ingest.py index 9ed1df5..45195ba 100644 --- a/latentscope/scripts/ingest.py +++ b/latentscope/scripts/ingest.py @@ -10,7 +10,7 @@ def main(): parser = argparse.ArgumentParser(description='Ingest a dataset') parser.add_argument('id', type=str, help='Dataset id (directory name in data folder)') - parser.add_argument('--path', type=str, help='Path to csv or parquet file, otherwise assumes input.csv in dataset directory') + parser.add_argument('--path', type=str, help='Path to csv/parquet/json/jsonl/xlsx file, otherwise assumes input.csv in dataset directory') parser.add_argument('--text_column', type=str, help='Column to use as text for the scope') args = parser.parse_args() ingest_file(args.id, args.path, args.text_column) @@ -40,9 +40,11 @@ def ingest_file(dataset_id, file_path, text_column = None): elif file_type == "parquet": df = pd.read_parquet(file) elif file_type == "jsonl": - with open(file, 'r') as f: - lines = f.readlines() - df = pd.DataFrame([json.loads(line) for line in lines]) + df = pd.read_json(file, lines=True) + elif file_type == "json": + df = pd.read_json(file) + elif file_type == "xlsx": + df = pd.read_excel(file) else: raise ValueError(f"Unsupported file type: {file_type}") diff --git a/web/src/components/Home.jsx b/web/src/components/Home.jsx index 1112f6d..30c4dd8 100644 --- a/web/src/components/Home.jsx +++ b/web/src/components/Home.jsx @@ -115,7 +115,7 @@ function Home() {