Simplify template variables setting

superduper-io · Nov 7, 2024 · 5e24f0c · 5e24f0c
1 parent 677f560
commit 5e24f0c
Show file tree

Hide file tree

Showing 9 changed files with 273 additions and 34 deletions.
diff --git a/.github/workflows/ci_code.yml b/.github/workflows/ci_code.yml
@@ -61,6 +61,7 @@ jobs:
     - name: Install superduper-framework
       run: |
         # Install core and testsuite dependencies on the cached python environment.
+        cp -r templates/* superduper/templates/
         python -m pip install '.[test]'
         # TODO: We currently need a default plugin to run tests using MongoDB. 
         # Once the local file database is complete, we may need to update this section.

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Before you create a Pull Request, remember to update the Changelog with your changes.**
 
-## Changes Since Last Release 
+## Changes Since Last Release
 
 #### Changed defaults / behaviours
 

diff --git a/superduper/base/document.py b/superduper/base/document.py
@@ -191,8 +191,9 @@ def decode(
         :param db: The datalayer to use.
         """
         if '_variables' in r:
+            variables = {**r['_variables'], 'output_prefix': CFG.output_prefix}
             r = _replace_variables(
-                {k: v for k, v in r.items() if k != '_variables'}, **r['_variables']
+                {k: v for k, v in r.items() if k != '_variables'}, **variables
             )
         schema = schema or r.get(KEY_SCHEMA)
         schema = get_schema(db, schema)
@@ -216,6 +217,7 @@ def decode(
 
         if not isinstance(getters, _Getters):
             getters = _Getters(getters)
+        assert isinstance(getters, _Getters)
 
         # Prioritize using the local artifact storage getter,
         # and then use the DB read getter.

diff --git a/superduper/cli/main.py b/superduper/cli/main.py
@@ -176,6 +176,7 @@ def _apply(name: str, variables: str | None = None, data_backend: str | None = N
     variables = variables or '{}'
     variables = json.loads(variables)
 
+    # TODO remove all of this template logic
     def _build_from_template(t):
         assert variables is not None, 'Variables must be provided for templates'
         all_values = variables.copy()

diff --git a/superduper/components/template.py b/superduper/components/template.py
@@ -80,7 +80,6 @@ def default_values(self):
     def form_template(self):
         """Form to be diplayed to user."""
         return {
-            'identifier': '<enter-a-unique-identifier>',
             '_variables': {
                 k: (
                     f'<value-{i}>'

diff --git a/superduper/rest/build.py b/superduper/rest/build.py
@@ -18,7 +18,6 @@
 from superduper import logging
 from superduper.backends.base.query import Query
 from superduper.base.document import Document
-from superduper.components.component import Component
 from superduper.components.template import Template
 from superduper.rest.base import DatalayerDependency, SuperDuperApp
 
@@ -169,39 +168,14 @@ def _process_db_apply(db, component, id: str | None = None):
         else:
             db.apply(component, force=True)
 
-    def _process_apply_info(db, info):
-        if '_variables' in info:
-            assert {'_variables', 'identifier'}.issubset(info.keys())
-            variables = info.pop('_variables')
-            for k in variables:
-                if isinstance(variables[k], str):
-                    assert '<' not in variables[k]
-                    assert '>' not in variables[k]
-
-            identifier = info.pop('identifier')
-            template_name = info.pop('_template_name', None)
-
-            component = Component.from_template(
-                identifier=identifier,
-                template_body=info,
-                template_name=template_name,
-                db=db,
-                **variables,
-            )
-            return component
-        component = Document.decode(info, db=db).unpack()
-        # TODO this shouldn't be necessary to do twice
-        component.unpack()
-        return component
-
     @app.add('/db/apply', method='post')
     async def db_apply(
         info: t.Dict,
         background_tasks: BackgroundTasks,
         id: str | None = 'test',
         db: 'Datalayer' = DatalayerDependency(),
     ):
-        component = _process_apply_info(db, info)
+        component = Document.decode(info, db=db).unpack()
         background_tasks.add_task(_process_db_apply, db, component, id)
         return {'status': 'ok'}
 

diff --git a/templates/pdf_rag/streamlit.py b/templates/pdf_rag/streamlit.py
@@ -42,10 +42,7 @@ def init_db():
 
 
 def load_questions():
-    return [
-        "What is sparse-vector retrieval?",
-        "How to perform Query Optimization?"
-    ]
+    return ["What is sparse-vector retrieval?", "How to perform Query Optimization?"]
 
 
 db, model_rag = st.cache_resource(init_db)()

diff --git a/test/integration/usecase/test_build_interface.py b/test/integration/usecase/test_build_interface.py
@@ -0,0 +1,19 @@
+import json
+
+from superduper import Application, Document
+
+
+def test_build_from_template(db):
+    from superduper import templates
+
+    db.apply(templates.simple_rag)
+
+    with open('test/material/sample_app/component.json') as f:
+        component = json.load(f)  # NOTE(review): this loaded value is never read; `component` is rebound below
+
+    component = templates.simple_rag.form_template  # rebinds `component`, discarding the JSON fixture loaded above
+    component['_variables']['output_prefix'] = '_output__'  # explicit value for the `output_prefix` template variable
+
+    c = Document.decode(component, db=db).unpack()
+
+    assert isinstance(c, Application)  # the decoded and unpacked template must be an Application
diff --git a/test/material/sample_app/component.json b/test/material/sample_app/component.json
@@ -0,0 +1,246 @@
+{
+  "_variables": {
+    "table_name": "sample_simple_rag",
+    "id_field": "_id",
+    "databackend": "mongodb",
+    "base_url": null,
+    "api_key": null,
+    "embedding_model": "text-embedding-ada-002",
+    "llm_model": "gpt-3.5-turbo"
+  },
+  "types": {
+    "id_field": {
+      "type": "str",
+      "default": "_id"
+    },
+    "embedding_model": {
+      "type": "str",
+      "default": "text-embedding-ada-002"
+    },
+    "llm_model": {
+      "type": "str",
+      "default": "gpt-3.5-turbo"
+    },
+    "table_name": {
+      "type": "str",
+      "default": "sample_simple_rag"
+    },
+    "databackend": {
+      "type": "str",
+      "default": "mongodb"
+    },
+    "base_url": {
+      "type": "str",
+      "optional": true,
+      "default": null
+    },
+    "api_key": {
+      "type": "str",
+      "optional": true,
+      "default": null
+    }
+  },
+  "_base": "?simple-rag-app",
+  "_builds": {
+    "datatype:dill": {
+      "_path": "superduper.components.datatype.get_serializer",
+      "method": "dill",
+      "encodable": "artifact"
+    },
+    "727d3bb560939e1211f9cac189d56e07e9622eeb": {
+      "_path": "superduper.components.datatype.Artifact",
+      "datatype": "?datatype:dill",
+      "uri": null,
+      "blob": "&:blob:727d3bb560939e1211f9cac189d56e07e9622eeb"
+    },
+    "model:chunker": {
+      "_object": "?727d3bb560939e1211f9cac189d56e07e9622eeb",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "signature": "singleton",
+      "datatype": null,
+      "output_schema": null,
+      "model_update_kwargs": {},
+      "predict_kwargs": {},
+      "compute_kwargs": {},
+      "validation": null,
+      "metric_values": {},
+      "num_workers": 0,
+      "serve": false,
+      "trainer": null,
+      "deploy": false,
+      "chunk_size": 200
+    },
+    "var-table-name-select-var-id-field-x": {
+      "_path": "superduper_<var:databackend>.query.parse_query",
+      "documents": [],
+      "query": "<var:table_name>.select(\"<var:id_field>\", \"x\")"
+    },
+    "listener:chunker": {
+      "_path": "superduper.components.listener.Listener",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "cdc_table": "<var:table_name>",
+      "key": "x",
+      "model": "?model:chunker",
+      "predict_kwargs": {},
+      "select": "?var-table-name-select-var-id-field-x",
+      "flatten": true
+    },
+    "datatype:sqlvector[1536]": {
+      "_path": "superduper.components.vector_index.sqlvector",
+      "shape": [
+        1536
+      ]
+    },
+    "model:<var:embedding_model>": {
+      "_path": "superduper_openai.model.OpenAIEmbedding",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "signature": "singleton",
+      "datatype": "?datatype:sqlvector[1536]",
+      "output_schema": null,
+      "model_update_kwargs": {},
+      "predict_kwargs": {},
+      "compute_kwargs": {},
+      "validation": null,
+      "metric_values": {},
+      "num_workers": 0,
+      "serve": false,
+      "trainer": null,
+      "deploy": false,
+      "model": "<var:embedding_model>",
+      "max_batch_size": 8,
+      "openai_api_key": null,
+      "openai_api_base": null,
+      "client_kwargs": {
+        "base_url": null,
+        "api_key": null
+      },
+      "shape": [
+        1536
+      ],
+      "batch_size": 100
+    },
+    "outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)": {
+      "_path": "superduper_<var:databackend>.query.parse_query",
+      "documents": [],
+      "query": "<var:output_prefix>chunker__?(listener:chunker.uuid).select(\"id\", \"_source\", \"<var:output_prefix>chunker__?(listener:chunker.uuid)\")"
+    },
+    "listener:embeddinglistener": {
+      "_path": "superduper.components.listener.Listener",
+      "upstream": [
+        "?listener:chunker",
+        "?listener:chunker"
+      ],
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "cdc_table": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
+      "key": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
+      "model": "?model:<var:embedding_model>",
+      "predict_kwargs": {},
+      "select": "?outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)",
+      "flatten": false
+    },
+    "vector_index:vectorindex": {
+      "_path": "superduper.components.vector_index.VectorIndex",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "cdc_table": "<var:output_prefix>embeddinglistener__?(listener:embeddinglistener.uuid)",
+      "indexing_listener": "?listener:embeddinglistener",
+      "compatible_listener": null,
+      "measure": "cosine",
+      "metric_values": {}
+    },
+    "outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5": {
+      "_path": "superduper_<var:databackend>.query.parse_query",
+      "documents": [
+        {
+          "<var:output_prefix>chunker__?(listener:chunker.uuid)": "<var:query>"
+        }
+      ],
+      "query": "<var:output_prefix>chunker__?(listener:chunker.uuid).select().like(documents[0], vector_index=\"vectorindex\", n=5)"
+    },
+    "model:llm-model": {
+      "_path": "superduper_openai.model.OpenAIChatCompletion",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "signature": "singleton",
+      "datatype": null,
+      "output_schema": null,
+      "model_update_kwargs": {},
+      "predict_kwargs": {},
+      "compute_kwargs": {},
+      "validation": null,
+      "metric_values": {},
+      "num_workers": 0,
+      "serve": false,
+      "trainer": null,
+      "deploy": false,
+      "model": "<var:llm_model>",
+      "max_batch_size": 8,
+      "openai_api_key": null,
+      "openai_api_base": null,
+      "client_kwargs": {
+        "base_url": null,
+        "api_key": null
+      },
+      "batch_size": 1,
+      "prompt": ""
+    },
+    "model:simple_rag": {
+      "_path": "superduper.components.model.RAGModel",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "signature": "singleton",
+      "datatype": null,
+      "output_schema": null,
+      "model_update_kwargs": {},
+      "predict_kwargs": {},
+      "compute_kwargs": {},
+      "validation": null,
+      "metric_values": {},
+      "num_workers": 0,
+      "serve": false,
+      "trainer": null,
+      "deploy": false,
+      "prompt_template": "Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\nThese snippets are samples from our internal data-repositories, and should be used exclusively and as a matter of priority to answer the question\n\n{context}\n\nHere's the question: {query}",
+      "select": "?outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5",
+      "key": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
+      "llm": "?model:llm-model"
+    },
+    "simple-rag-app": {
+      "_path": "superduper.components.application.Application",
+      "upstream": null,
+      "plugins": null,
+      "cache": true,
+      "status": null,
+      "components": [
+        "?listener:chunker",
+        "?vector_index:vectorindex",
+        "?model:simple_rag"
+      ],
+      "namespace": null,
+      "link": null,
+      "_literals": [
+        "template"
+      ]
+    }
+  },
+  "_blobs": {},
+  "_files": {},
+  "_template_name": "simple_rag"
+}