feat(wren-ai-service): spider metrics in evaluation #761

Merged
merged 36 commits into main from feat/spider-metrics
Oct 16, 2024
326def6
restructure eval files
cyyeh Oct 14, 2024
9f94d26
add spider_database
cyyeh Oct 14, 2024
d559465
ignore spider database
cyyeh Oct 14, 2024
61e670d
update
cyyeh Oct 14, 2024
58d380a
generate mdl and question sql pairs by db
cyyeh Oct 14, 2024
a7ecfa6
refactor
cyyeh Oct 14, 2024
f77db0d
update
cyyeh Oct 15, 2024
695961c
add comments
cyyeh Oct 15, 2024
277f247
allow data curation app to connect to duckdb
cyyeh Oct 15, 2024
d848ee0
fix bug
cyyeh Oct 15, 2024
eb7894b
revert
cyyeh Oct 15, 2024
d2a947c
fix relationship name
cyyeh Oct 15, 2024
8c875aa
feat: implement exact match accuracy metric of spider eval
paopa Oct 15, 2024
8902f7a
chore: move the metric to an independent module
paopa Oct 15, 2024
45ce662
chore: remove redundant module
paopa Oct 15, 2024
e775c21
chore: fix typo
paopa Oct 15, 2024
9befa19
chore: update lock file
paopa Oct 15, 2024
5cbe200
chore: change the path of kmap and db dir
paopa Oct 15, 2024
00f1dce
feat: put the catalog name into additional metadata in every prediction
paopa Oct 15, 2024
39e6d5c
fix: get catalog from the wrong place
paopa Oct 15, 2024
bdb2cec
feat: support duckdb on prediction
paopa Oct 15, 2024
5e72436
feat: modify the metric config to use duckdb in eval
paopa Oct 15, 2024
e011bf1
update README and command
cyyeh Oct 16, 2024
fa862ad
update
cyyeh Oct 16, 2024
c88a182
refine env name
cyyeh Oct 16, 2024
5548ac1
fix eval bug using duckdb
cyyeh Oct 16, 2024
89d55b9
fix bug
cyyeh Oct 16, 2024
6c049cd
fix: wrong additional metadata
paopa Oct 16, 2024
84dec12
feat: implement Spider Execution Accuracy
paopa Oct 16, 2024
036a6c7
feat: add execution accuracy into generation and end-to-end eval pipe…
paopa Oct 16, 2024
6583eed
fix: rewrite sql
paopa Oct 16, 2024
3643a79
fix bug
cyyeh Oct 16, 2024
c698550
fix mdl table construction bug
cyyeh Oct 16, 2024
679f3a5
Merge branch 'main' into feat/spider-metrics
cyyeh Oct 16, 2024
f3f40a3
fix: don't rewrite sql in execution accuracy metric
paopa Oct 16, 2024
7f95b7d
try fix github actions issue
cyyeh Oct 16, 2024
2 changes: 1 addition & 1 deletion .github/workflows/ai-service-test.yaml
@@ -28,7 +28,7 @@ defaults:
jobs:
pytest:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ci/ai-service') || github.event_name == 'push' }}
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ wren-ai-service/src/eval/wren-engine/etc/archived
wren-ai-service/src/eval/data
wren-ai-service/**/outputs/
wren-ai-service/**/spider/
!wren-ai-service/**/metrics/spider/
!wren-ai-service/tests/data
!wren-ai-service/src/eval/data/book_2*.json
!wren-ai-service/src/eval/data/baseball_1*.json
3 changes: 3 additions & 0 deletions wren-ai-service/Justfile
@@ -10,6 +10,9 @@ start:
curate_eval_data:
poetry run streamlit run eval/data_curation/app.py

prep:
poetry run python -m eval.preparation

predict dataset pipeline='ask':
@poetry run python -u eval/prediction.py --file {{dataset}} --pipeline {{pipeline}}

8 changes: 5 additions & 3 deletions wren-ai-service/eval/.env.example
@@ -3,8 +3,10 @@ OPENAI_API_KEY=
WREN_IBIS_ENDPOINT=http://localhost:8000
WREN_ENGINE_ENDPOINT=http://localhost:8080
WREN_IBIS_TIMEOUT=10
BATCH_SIZE=4
BATCH_INTERVAL=1

DATA_SOURCE=bigquery
bigquery.project-id=
bigquery.dataset-id=
bigquery.credentials-key=
BATCH_SIZE=4
BATCH_INTERVAL=1
bigquery.credentials-key=
19 changes: 19 additions & 0 deletions wren-ai-service/eval/README.md
@@ -15,6 +15,25 @@ The dataset curation process is used to prepare the evaluation dataset for the W
- copy `.env.example` to `.env` and fill in the environment variables
- execute the command under the `wren-ai-service` folder: `just curate_eval_data`

## Eval Dataset Preparation (if using the Spider 1.0 dataset)

This command does two things:
1. download the Spider 1.0 dataset into `wren-ai-service/tools/dev/spider1.0`, which contains two folders: `database` and `spider_data`
    - `database`: contains the test data, downloaded from [this repo](https://github.com/taoyds/test-suite-sql-eval).
    - `spider_data`: contains table schemas, ground truths (question-SQL pairs), etc. For more information, please refer to [this repo](https://github.com/taoyds/spider).
2. prepare the evaluation datasets and put them in `wren-ai-service/eval/dataset`. The file name of an eval dataset for Spider looks like this: `spider_<db_name>_eval_dataset.toml`

```cli
just prep
```
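The per-database output naming in step 2 can be sketched as follows. This is a minimal illustration only: the real logic lives in the `eval.preparation` module, and the paths and helper name here mirror the README text above, not the actual code.

```python
from pathlib import Path

# Paths follow the README above; the real implementation is eval.preparation.
SPIDER_ROOT = Path("wren-ai-service/tools/dev/spider1.0")
DATASET_DIR = Path("wren-ai-service/eval/dataset")


def eval_dataset_path(db_name: str) -> Path:
    """Per-database eval dataset file, named spider_<db_name>_eval_dataset.toml."""
    return DATASET_DIR / f"spider_{db_name}_eval_dataset.toml"


print(eval_dataset_path("book_2").name)  # spider_book_2_eval_dataset.toml
```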

## Evaluation Dataset Schema

- dataset_id (UUID)
- date
- mdl
- eval dataset
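
A dataset following the schema above could serialize to a TOML file along these lines. The field names come from the schema list; the placeholder values and the question/SQL pair layout are illustrative guesses, not the actual file format.

```toml
# Illustrative only -- values are placeholders and the pair layout is assumed.
dataset_id = "00000000-0000-0000-0000-000000000000"  # UUID
date = "2024-10-16"
mdl = "<serialized MDL>"

[[eval_dataset]]
question = "How many books are there?"
sql = "SELECT COUNT(*) FROM book"
```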

## Prediction Process

The prediction process produces results for the evaluation dataset using the Wren AI service. It creates traces and a session on Langfuse to make the results available to the user. You can use the following command to run predictions on an evaluation dataset under the `eval/dataset` directory:
33 changes: 26 additions & 7 deletions wren-ai-service/eval/data_curation/app.py
@@ -12,16 +12,23 @@
from streamlit_tags import st_tags
from utils import (
DATA_SOURCES,
WREN_ENGINE_ENDPOINT,
WREN_IBIS_ENDPOINT,
get_contexts_from_sqls,
get_data_from_wren_engine_with_sqls,
get_documents_given_contexts,
get_eval_dataset_in_toml_string,
get_openai_client,
get_question_sql_pairs,
is_sql_valid,
prettify_sql,
)

from eval.utils import (
get_documents_given_contexts,
get_eval_dataset_in_toml_string,
prepare_duckdb_init_sql,
prepare_duckdb_session_sql,
)

st.set_page_config(layout="wide")
st.title("WrenAI Data Curation App")

@@ -101,11 +108,17 @@ def on_click_setup_uploaded_file():
uploaded_file.getvalue().decode("utf-8")
)

st.session_state["connection_info"] = {
"project_id": os.getenv("bigquery.project-id"),
"dataset_id": os.getenv("bigquery.dataset-id"),
"credentials": os.getenv("bigquery.credentials-key"),
}
if data_source == "bigquery":
st.session_state["connection_info"] = {
"project_id": os.getenv("bigquery.project-id"),
"dataset_id": os.getenv("bigquery.dataset-id"),
"credentials": os.getenv("bigquery.credentials-key"),
}
elif data_source == "duckdb":
prepare_duckdb_session_sql(WREN_ENGINE_ENDPOINT)
prepare_duckdb_init_sql(
WREN_ENGINE_ENDPOINT, st.session_state["mdl_json"]["catalog"]
)
else:
st.session_state["data_source"] = None
st.session_state["mdl_json"] = None
@@ -126,6 +139,9 @@ def on_change_sql(i: int, key: str):
st.session_state["data_source"],
st.session_state["mdl_json"],
st.session_state["connection_info"],
WREN_ENGINE_ENDPOINT
if st.session_state["data_source"] == "duckdb"
else WREN_IBIS_ENDPOINT,
)
)
if valid:
@@ -388,6 +404,9 @@ def on_click_remove_candidate_dataset_button(i: int):
st.session_state["data_source"],
st.session_state["mdl_json"],
st.session_state["connection_info"],
WREN_ENGINE_ENDPOINT
if st.session_state["data_source"] == "duckdb"
else WREN_IBIS_ENDPOINT,
)
)[0]
st.dataframe(
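The endpoint selection that this diff adds at both call sites can be factored into a small helper. This is a sketch of the routing logic only: the constant values are the defaults from `eval/.env.example` above, and the helper name is illustrative, not part of the PR.

```python
# Defaults taken from eval/.env.example in this PR.
WREN_ENGINE_ENDPOINT = "http://localhost:8080"
WREN_IBIS_ENDPOINT = "http://localhost:8000"


def endpoint_for(data_source: str) -> str:
    """Route duckdb queries straight to the wren engine; other sources
    (e.g. bigquery) go through wren ibis, mirroring the diff above."""
    return WREN_ENGINE_ENDPOINT if data_source == "duckdb" else WREN_IBIS_ENDPOINT


print(endpoint_for("duckdb"))    # http://localhost:8080
print(endpoint_for("bigquery"))  # http://localhost:8000
```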