Skip to content

Commit

Permalink
create export page for saving data. enhance scope datastructure for e…
Browse files Browse the repository at this point in the history
…xport. address #32
  • Loading branch information
enjalot committed Mar 13, 2024
1 parent 9f2fe16 commit 89057ba
Show file tree
Hide file tree
Showing 9 changed files with 319 additions and 60 deletions.
52 changes: 50 additions & 2 deletions latentscope/scripts/scope.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import json
import argparse
import pandas as pd
from latentscope.util import get_data_dir


Expand All @@ -14,8 +15,12 @@ def main():
parser.add_argument('cluster_labels_id', type=str, help='Cluster labels id')
parser.add_argument('label', type=str, help='Label for the scope')
parser.add_argument('description', type=str, help='Description of the scope')
parser.add_argument('--scope_id', type=str, help='Scope id to overwrite existing scope', default=None)

def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description):
args = parser.parse_args()
scope(**vars(args))

def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description, scope_id=None):
DATA_DIR = get_data_dir()
print("DATA DIR", DATA_DIR)
directory = os.path.join(DATA_DIR, dataset_id, "scopes")
Expand All @@ -33,7 +38,11 @@ def get_next_scopes_number(dataset):

next_scopes_number = get_next_scopes_number(dataset_id)
# make the scope id from the number, zero padded to 3 digits
id = f"scopes-{next_scopes_number:03d}"
if not scope_id:
id = f"scopes-{next_scopes_number:03d}"
else:
id = scope_id

print("RUNNING:", id)

scope = {
Expand All @@ -45,6 +54,45 @@ def get_next_scopes_number(dataset):
"label": label,
"description": description
}

# read each json file and add its contents to the scope file
embedding_file = os.path.join(DATA_DIR, dataset_id, "embeddings", embedding_id + ".json")
with open(embedding_file) as f:
embedding = json.load(f)
scope["embedding"] = embedding

umap_file = os.path.join(DATA_DIR, dataset_id, "umaps", umap_id + ".json")
with open(umap_file) as f:
umap = json.load(f)
scope["umap"] = umap

cluster_file = os.path.join(DATA_DIR, dataset_id, "clusters", cluster_id + ".json")
with open(cluster_file) as f:
cluster = json.load(f)
scope["cluster"] = cluster

if cluster_labels_id == "default":
cluster_labels_id = cluster_id + "-labels-default"
scope["cluster_labels"] = {"id": cluster_labels_id, "cluster_id": cluster_id}
else:
cluster_labels_file = os.path.join(DATA_DIR, dataset_id, "clusters", cluster_labels_id + ".json")
with open(cluster_labels_file) as f:
cluster_labels = json.load(f)
scope["cluster_labels"] = cluster_labels

# create a scope parquet by combining the parquets from umap and cluster, as well as getting the labels from cluster_labels
# then write the parquet to the scopes directory
umap_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "umaps", umap_id + ".parquet"))
cluster_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "clusters", cluster_id + ".parquet"))
cluster_labels_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "clusters", cluster_labels_id + ".parquet"))
# create a column where we lookup the label from cluster_labels_df for the index found in the cluster_df
cluster_df["label"] = cluster_df["cluster"].apply(lambda x: cluster_labels_df.loc[x]["label"])
scope_parquet = pd.concat([umap_df, cluster_df], axis=1)
scope_parquet.to_parquet(os.path.join(directory, id + ".parquet"))

scope["rows"] = len(scope_parquet)
scope["columns"] = scope_parquet.columns.tolist()
scope["size"] = os.path.getsize(os.path.join(directory, id + ".parquet"))

file_path = os.path.join(directory, id + ".json")
with open(file_path, 'w') as f:
Expand Down
50 changes: 22 additions & 28 deletions latentscope/server/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,31 +197,25 @@ def get_dataset_scope(dataset, scope):
json_contents = json.load(json_file)
return jsonify(json_contents)

@datasets_write_bp.route('/<dataset>/scopes/save', methods=['POST'])
def save_dataset_scope(dataset):
    """Write the scope posted as JSON to the dataset's scopes directory.

    The request body supplies the scope fields listed below plus an
    optional ``id``. When no ``id`` is given, a new one is generated as
    ``scopes-NNN`` from ``get_next_scopes_number(dataset)``.

    Returns the saved scope as JSON, or a 400 error payload when the
    request carries no (or an empty) JSON body.
    """
    if not request.json:
        return jsonify({"error": "Invalid data format, JSON expected"}), 400

    body = request.json
    # Fields copied verbatim from the request; absent ones are stored as None.
    field_names = (
        "embedding_id",
        "umap_id",
        "cluster_id",
        "cluster_labels_id",
        "label",
        "description",
    )
    scope = {name: body.get(name) for name in field_names}

    scope_id = body.get('id')
    if not scope_id:
        # No id supplied: build the scope id from the next number,
        # zero padded to 3 digits.
        scope_id = f"scopes-{get_next_scopes_number(dataset):03d}"
    scope["id"] = scope_id

    file_path = os.path.join(DATA_DIR, dataset, "scopes", scope_id + ".json")
    with open(file_path, 'w') as f:
        json.dump(scope, f, indent=2)
    return jsonify(scope)
@datasets_bp.route('/<dataset>/export/list', methods=['GET'])
def get_dataset_export_list(dataset):
    """List every file under the dataset directory for the export page.

    Walks ``DATA_DIR/<dataset>`` recursively, skipping directories named
    ``jobs`` and ``.DS_Store`` files.

    Returns a JSON list with one entry per file:
    ``(file_name, directory, relative_path, full_path, size)``, where
    ``directory`` and ``relative_path`` are relative to the dataset
    directory and ``size`` comes from ``os.path.getsize``.
    """
    directory_path = os.path.join(DATA_DIR, dataset)
    print("dataset", dataset, directory_path)

    file_list = []
    for root, dirs, files in os.walk(directory_path):
        # Prune "jobs" directories in place so os.walk never descends into
        # them. Matching on the directory name (rather than testing whether
        # the absolute path contains "jobs") keeps datasets or data
        # directories that merely have "jobs" in their path from being
        # reported as empty.
        dirs[:] = [d for d in dirs if d != "jobs"]

        directory = os.path.relpath(root, directory_path)
        for file_name in files:
            if file_name == ".DS_Store":
                continue
            full_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(full_path, directory_path)
            size = os.path.getsize(full_path)
            file_list.append((file_name, directory, relative_path, full_path, size))

    return jsonify(file_list)
20 changes: 20 additions & 0 deletions latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,23 @@ def run_cluster_label():
command = f'ls-label {dataset} "{text_column}" {cluster_id} {chat_id} "{context}"'
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

@jobs_write_bp.route('/scope')
def run_scope():
    """Start a background ``ls-scope`` job from the request's query parameters.

    Query parameters:
        dataset, embedding_id, umap_id, cluster_id, cluster_labels_id:
            required identifiers, passed positionally to ``ls-scope``.
        label, description: free text for the scope; default to "".
        scope_id: optional; when present it is passed as ``--scope_id``.

    Returns ``{"job_id": ...}`` once the job thread has been started, or a
    400 error payload when a required identifier is missing.
    """
    import shlex  # local import: only needed to quote this command line

    required = ('dataset', 'embedding_id', 'umap_id', 'cluster_id', 'cluster_labels_id')
    params = {name: request.args.get(name) for name in required}
    missing = [name for name, value in params.items() if not value]
    if missing:
        # Without this check a missing value would be interpolated into the
        # command as the literal string "None".
        return jsonify({"error": f"Missing required parameters: {', '.join(missing)}"}), 400

    dataset = params['dataset']
    embedding_id = params['embedding_id']
    umap_id = params['umap_id']
    cluster_id = params['cluster_id']
    cluster_labels_id = params['cluster_labels_id']
    label = request.args.get('label', '')
    description = request.args.get('description', '')
    scope_id = request.args.get('scope_id')
    print("run scope", dataset, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description, scope_id)

    job_id = str(uuid.uuid4())
    # Every value comes straight from the request, so quote each one: a
    # label such as `my "best" scope` or an id containing `;` must reach
    # ls-scope as a single argument instead of altering the command.
    # NOTE(review): shlex.quote emits POSIX quoting; this assumes run_job
    # hands the string to a POSIX shell or a shlex-style splitter - confirm.
    positional = [dataset, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description]
    command = 'ls-scope ' + ' '.join(shlex.quote(value) for value in positional)
    if scope_id:
        command += f' --scope_id={shlex.quote(scope_id)}'
    threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
    return jsonify({"job_id": job_id})

3 changes: 3 additions & 0 deletions web/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import Compare from './pages/Compare';
import Setup from './pages/Setup';
import Jobs from './pages/Jobs';
import Job from './pages/Job';
import Export from './pages/Export';
import Nav from './components/Nav';
import './App.css';

Expand All @@ -28,6 +29,8 @@ function App() {
<Route path="/settings" element={<Settings />} />
<Route path="/datasets/:dataset/explore/:scope" element={<Explore />} />
<Route path="/datasets/:dataset/compare/" element={<Compare/>} />
<Route path="/datasets/:dataset/export" element={<Export />} />
<Route path="/datasets/:dataset/export/:scope" element={<Export />} />

{readonly ? null : <Route path="/datasets/:dataset/setup" element={<Setup />} />}
{readonly ? null : <Route path="/datasets/:dataset/setup/:scope" element={<Setup />} />}
Expand Down
1 change: 1 addition & 0 deletions web/src/components/Home.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ function Home() {
<span className="scope-description">{scope.description}</span>
<br/>
{readonly ? null : <Link to={`/datasets/${dataset.id}/setup/${scope.id}`}>Configure</Link> }
{readonly ? null : <> | <Link to={`/datasets/${dataset.id}/export/${scope.id}`}>Export</Link> </>}
</div>
))}
</div>
Expand Down
86 changes: 56 additions & 30 deletions web/src/components/Setup/Scope.jsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// Scope.jsx
import { useState, useEffect, useCallback} from 'react';
import { Link, useNavigate } from 'react-router-dom';
import { useStartJobPolling } from '../Job/Run';
import JobProgress from '../Job/Progress';

const apiUrl = import.meta.env.VITE_API_URL


Expand All @@ -19,21 +22,37 @@ Scope.propTypes = {
};

function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew, onChange}) {
// const[scopes, setScopes] = useState([]);
const navigate = useNavigate();

const [scopeJob, setScopeJob] = useState(null);
const { startJob: startScopeJob} = useStartJobPolling(dataset, setScopeJob, `${apiUrl}/jobs/scope`);

useEffect(() => {
if(dataset) {
console.log("fetching scopes")
fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
fetchScopes(dataset.id, onNew)
}
}, [dataset]);

function fetchScopes(datasetId, onNew) {
fetch(`${apiUrl}/datasets/${datasetId}/scopes`)
.then(response => response.json())
.then(data => {
const sorted = data.sort((a,b) => a.id.localeCompare(b.id))
// setScopes(sorted)
onNew(sorted)
});
}

useEffect(() => {
if(scopeJob?.status == "completed") {
fetchScopes(dataset.id, (scopes) => {
setScopeJob(null)
onNew(scopes)
// onChange(scopes.find(d => d.id == scopeJob.run_id))
navigate(`/datasets/${dataset.id}/setup/${scopeJob.run_id}`);
})
}
}, [dataset]);
}, [scopeJob, dataset, navigate, onNew, onChange]);


const handleSaveScope = useCallback((event) => {
Expand All @@ -53,32 +72,33 @@ function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew
const action = data.get('action')
console.log("action", action)
if(action == "save") {
payload.id = scope.id
payload.scope_id = scope.id
}
startScopeJob(payload)

fetch(`${apiUrl}/datasets/${dataset.id}/scopes/save`, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(payload)
})
.then(response => response.json())
.then(data => {
const tscope = data
fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
.then(response => response.json())
.then(data => {
// setScopes(data)
onNew(data)
onChange(data.find(s => s.id == tscope.id))
});
navigate(`/datasets/${dataset.id}/setup/${data.id}`);
})
.catch(error => {
console.error('Error saving scope:', error);
});
}, [dataset, scope, cluster, clusterLabelId, umap, embedding , navigate, onChange, onNew]);
// fetch(`${apiUrl}/datasets/${dataset.id}/scopes/save`, {
// method: 'POST',
// headers: {
// 'Content-Type': 'application/json'
// },
// body: JSON.stringify(payload)
// })
// .then(response => response.json())
// .then(data => {
// const tscope = data
// fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
// .then(response => response.json())
// .then(data => {
// // setScopes(data)
// onNew(data)
// onChange(data.find(s => s.id == tscope.id))
// });
// navigate(`/datasets/${dataset.id}/setup/${data.id}`);
// })
// .catch(error => {
// console.error('Error saving scope:', error);
// });
}, [dataset, scope, cluster, clusterLabelId, umap, embedding]);

const [isDifferent, setIsDifferent] = useState(false);
useEffect(() => {
Expand Down Expand Up @@ -134,15 +154,21 @@ function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew
Labels: { scope.cluster_labels_id }<br/>

</div> : null }
{scope && isDifferent ?

<JobProgress job={scopeJob} clearJob={()=>setScopeJob(null)} />

{scope && !scopeJob ?
<button type="submit" disabled={cluster ? false : true } onClick={() => {
document.querySelector('input[name="action"]').value = 'save';
}}>Overwrite {scope.name}</button> : null }
{ isDifferent ? <button type="submit" disabled={cluster ? false : true } onClick={() => {
{ isDifferent && !scopeJob ? <button type="submit" disabled={cluster ? false : true } onClick={() => {
document.querySelector('input[name="action"]').value = 'new';
}}>New scope</button> : null }
</form>


{ scope ? <Link to={`/datasets/${dataset?.id}/explore/${scope?.id}`}> Explore {scope.label} ({scope.id}) <br/></Link> : null }
{ scope ? <Link to={`/datasets/${dataset?.id}/export/${scope?.id}`}> Export data ({scope.id}) <br/></Link> : null }

</div>
</div>
Expand Down
Loading

0 comments on commit 89057ba

Please sign in to comment.