From f738a3a84d7dd4e19f0a51f0ff3e2556f407d05f Mon Sep 17 00:00:00 2001
From: Dhruv Anand <105786647+dhruv-anand-aintech@users.noreply.github.com>
Date: Thu, 16 May 2024 13:41:51 +0530
Subject: [PATCH] support json, jsonl, xlsx input files

---
 latentscope/scripts/ingest.py | 10 ++++++----
 web/src/components/Home.jsx   |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/latentscope/scripts/ingest.py b/latentscope/scripts/ingest.py
index 9ed1df5..45195ba 100644
--- a/latentscope/scripts/ingest.py
+++ b/latentscope/scripts/ingest.py
@@ -10,7 +10,7 @@
 def main():
     parser = argparse.ArgumentParser(description='Ingest a dataset')
     parser.add_argument('id', type=str, help='Dataset id (directory name in data folder)')
-    parser.add_argument('--path', type=str, help='Path to csv or parquet file, otherwise assumes input.csv in dataset directory')
+    parser.add_argument('--path', type=str, help='Path to csv/parquet/json/jsonl/xlsx file, otherwise assumes input.csv in dataset directory')
     parser.add_argument('--text_column', type=str, help='Column to use as text for the scope')
     args = parser.parse_args()
     ingest_file(args.id, args.path, args.text_column)
@@ -40,9 +40,11 @@ def ingest_file(dataset_id, file_path, text_column = None):
     elif file_type == "parquet":
         df = pd.read_parquet(file)
     elif file_type == "jsonl":
-        with open(file, 'r') as f:
-            lines = f.readlines()
-            df = pd.DataFrame([json.loads(line) for line in lines])
+        df = pd.read_json(file, lines=True)
+    elif file_type == "json":
+        df = pd.read_json(file)
+    elif file_type == "xlsx":
+        df = pd.read_excel(file)
     else:
         raise ValueError(f"Unsupported file type: {file_type}")
 
diff --git a/web/src/components/Home.jsx b/web/src/components/Home.jsx
index 1112f6d..30c4dd8 100644
--- a/web/src/components/Home.jsx
+++ b/web/src/components/Home.jsx
@@ -115,7 +115,7 @@ function Home() {
         <h3>Create new dataset</h3>
         <form onSubmit={handleNewDataset} onDragOver={handleDragOver} onDrop={handleDrop}>
           <label htmlFor="upload-button">
-            <span>Import a CSV or Parquet file to create a new dataset</span>
+            <span>Import a CSV/Parquet/JSON/JSONL/XLSX file to create a new dataset</span>
           </label>
           <input
             hidden