Skip to content

Commit

Permalink
Simplify template variables setting
Browse files Browse the repository at this point in the history
  • Loading branch information
blythed committed Nov 7, 2024
1 parent 677f560 commit 5e24f0c
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 34 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci_code.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ jobs:
- name: Install superduper-framework
run: |
# Install core and testsuite dependencies on the cached python environment.
cp -r templates/* superduper/templates/
python -m pip install '.[test]'
# TODO: We currently need a default plugin to run tests using MongoDB.
# Once the local file database is complete, we may need to update this section.
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

**Before you create a Pull Request, remember to update the Changelog with your changes.**

## Changes Since Last Release
## Changes Since Last Release

#### Changed defaults / behaviours

Expand Down
4 changes: 3 additions & 1 deletion superduper/base/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ def decode(
:param db: The datalayer to use.
"""
if '_variables' in r:
variables = {**r['_variables'], 'output_prefix': CFG.output_prefix}
r = _replace_variables(
{k: v for k, v in r.items() if k != '_variables'}, **r['_variables']
{k: v for k, v in r.items() if k != '_variables'}, **variables
)
schema = schema or r.get(KEY_SCHEMA)
schema = get_schema(db, schema)
Expand All @@ -216,6 +217,7 @@ def decode(

if not isinstance(getters, _Getters):
getters = _Getters(getters)
assert isinstance(getters, _Getters)

# Prioritize using the local artifact storage getter,
# and then use the DB read getter.
Expand Down
1 change: 1 addition & 0 deletions superduper/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def _apply(name: str, variables: str | None = None, data_backend: str | None = N
variables = variables or '{}'
variables = json.loads(variables)

# TODO remove all of this template logic
def _build_from_template(t):
assert variables is not None, 'Variables must be provided for templates'
all_values = variables.copy()
Expand Down
1 change: 0 additions & 1 deletion superduper/components/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def default_values(self):
def form_template(self):
"""Form to be displayed to user."""
return {
'identifier': '<enter-a-unique-identifier>',
'_variables': {
k: (
f'<value-{i}>'
Expand Down
28 changes: 1 addition & 27 deletions superduper/rest/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from superduper import logging
from superduper.backends.base.query import Query
from superduper.base.document import Document
from superduper.components.component import Component
from superduper.components.template import Template
from superduper.rest.base import DatalayerDependency, SuperDuperApp

Expand Down Expand Up @@ -169,39 +168,14 @@ def _process_db_apply(db, component, id: str | None = None):
else:
db.apply(component, force=True)

def _process_apply_info(db, info):
if '_variables' in info:
assert {'_variables', 'identifier'}.issubset(info.keys())
variables = info.pop('_variables')
for k in variables:
if isinstance(variables[k], str):
assert '<' not in variables[k]
assert '>' not in variables[k]

identifier = info.pop('identifier')
template_name = info.pop('_template_name', None)

component = Component.from_template(
identifier=identifier,
template_body=info,
template_name=template_name,
db=db,
**variables,
)
return component
component = Document.decode(info, db=db).unpack()
# TODO this shouldn't be necessary to do twice
component.unpack()
return component

@app.add('/db/apply', method='post')
async def db_apply(
info: t.Dict,
background_tasks: BackgroundTasks,
id: str | None = 'test',
db: 'Datalayer' = DatalayerDependency(),
):
component = _process_apply_info(db, info)
component = Document.decode(info, db=db).unpack()
background_tasks.add_task(_process_db_apply, db, component, id)
return {'status': 'ok'}

Expand Down
5 changes: 1 addition & 4 deletions templates/pdf_rag/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@ def init_db():


def load_questions():
return [
"What is sparse-vector retrieval?",
"How to perform Query Optimization?"
]
return ["What is sparse-vector retrieval?", "How to perform Query Optimization?"]


db, model_rag = st.cache_resource(init_db)()
Expand Down
19 changes: 19 additions & 0 deletions test/integration/usecase/test_build_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json

from superduper import Application, Document


def test_build_from_template(db):
    """Check that the ``simple_rag`` template form decodes into an ``Application``.

    :param db: The datalayer the template is applied to and decoded against.
    """
    from superduper import templates

    db.apply(templates.simple_rag)

    # The sample component file is still opened and parsed, so the test keeps
    # failing if the file is missing or does not contain valid JSON.
    with open('test/material/sample_app/component.json') as sample_file:
        form = json.load(sample_file)

    # NOTE(review): the parsed sample is discarded here in favour of the
    # template's own form -- confirm which of the two is meant to be decoded.
    form = templates.simple_rag.form_template
    form['_variables']['output_prefix'] = '_output__'

    decoded = Document.decode(form, db=db)
    built = decoded.unpack()

    assert isinstance(built, Application)
246 changes: 246 additions & 0 deletions test/material/sample_app/component.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
{
"_variables": {
"table_name": "sample_simple_rag",
"id_field": "_id",
"databackend": "mongodb",
"base_url": null,
"api_key": null,
"embedding_model": "text-embedding-ada-002",
"llm_model": "gpt-3.5-turbo"
},
"types": {
"id_field": {
"type": "str",
"default": "_id"
},
"embedding_model": {
"type": "str",
"default": "text-embedding-ada-002"
},
"llm_model": {
"type": "str",
"default": "gpt-3.5-turbo"
},
"table_name": {
"type": "str",
"default": "sample_simple_rag"
},
"databackend": {
"type": "str",
"default": "mongodb"
},
"base_url": {
"type": "str",
"optional": true,
"default": null
},
"api_key": {
"type": "str",
"optional": true,
"default": null
}
},
"_base": "?simple-rag-app",
"_builds": {
"datatype:dill": {
"_path": "superduper.components.datatype.get_serializer",
"method": "dill",
"encodable": "artifact"
},
"727d3bb560939e1211f9cac189d56e07e9622eeb": {
"_path": "superduper.components.datatype.Artifact",
"datatype": "?datatype:dill",
"uri": null,
"blob": "&:blob:727d3bb560939e1211f9cac189d56e07e9622eeb"
},
"model:chunker": {
"_object": "?727d3bb560939e1211f9cac189d56e07e9622eeb",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"signature": "singleton",
"datatype": null,
"output_schema": null,
"model_update_kwargs": {},
"predict_kwargs": {},
"compute_kwargs": {},
"validation": null,
"metric_values": {},
"num_workers": 0,
"serve": false,
"trainer": null,
"deploy": false,
"chunk_size": 200
},
"var-table-name-select-var-id-field-x": {
"_path": "superduper_<var:databackend>.query.parse_query",
"documents": [],
"query": "<var:table_name>.select(\"<var:id_field>\", \"x\")"
},
"listener:chunker": {
"_path": "superduper.components.listener.Listener",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"cdc_table": "<var:table_name>",
"key": "x",
"model": "?model:chunker",
"predict_kwargs": {},
"select": "?var-table-name-select-var-id-field-x",
"flatten": true
},
"datatype:sqlvector[1536]": {
"_path": "superduper.components.vector_index.sqlvector",
"shape": [
1536
]
},
"model:<var:embedding_model>": {
"_path": "superduper_openai.model.OpenAIEmbedding",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"signature": "singleton",
"datatype": "?datatype:sqlvector[1536]",
"output_schema": null,
"model_update_kwargs": {},
"predict_kwargs": {},
"compute_kwargs": {},
"validation": null,
"metric_values": {},
"num_workers": 0,
"serve": false,
"trainer": null,
"deploy": false,
"model": "<var:embedding_model>",
"max_batch_size": 8,
"openai_api_key": null,
"openai_api_base": null,
"client_kwargs": {
"base_url": null,
"api_key": null
},
"shape": [
1536
],
"batch_size": 100
},
"outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)": {
"_path": "superduper_<var:databackend>.query.parse_query",
"documents": [],
"query": "<var:output_prefix>chunker__?(listener:chunker.uuid).select(\"id\", \"_source\", \"<var:output_prefix>chunker__?(listener:chunker.uuid)\")"
},
"listener:embeddinglistener": {
"_path": "superduper.components.listener.Listener",
"upstream": [
"?listener:chunker",
"?listener:chunker"
],
"plugins": null,
"cache": true,
"status": null,
"cdc_table": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
"key": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
"model": "?model:<var:embedding_model>",
"predict_kwargs": {},
"select": "?outputs-chunker-?(listener:chunker.uuid)-select-id-source-outputs-chunker-?(listener:chunker.uuid)",
"flatten": false
},
"vector_index:vectorindex": {
"_path": "superduper.components.vector_index.VectorIndex",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"cdc_table": "<var:output_prefix>embeddinglistener__?(listener:embeddinglistener.uuid)",
"indexing_listener": "?listener:embeddinglistener",
"compatible_listener": null,
"measure": "cosine",
"metric_values": {}
},
"outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5": {
"_path": "superduper_<var:databackend>.query.parse_query",
"documents": [
{
"<var:output_prefix>chunker__?(listener:chunker.uuid)": "<var:query>"
}
],
"query": "<var:output_prefix>chunker__?(listener:chunker.uuid).select().like(documents[0], vector_index=\"vectorindex\", n=5)"
},
"model:llm-model": {
"_path": "superduper_openai.model.OpenAIChatCompletion",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"signature": "singleton",
"datatype": null,
"output_schema": null,
"model_update_kwargs": {},
"predict_kwargs": {},
"compute_kwargs": {},
"validation": null,
"metric_values": {},
"num_workers": 0,
"serve": false,
"trainer": null,
"deploy": false,
"model": "<var:llm_model>",
"max_batch_size": 8,
"openai_api_key": null,
"openai_api_base": null,
"client_kwargs": {
"base_url": null,
"api_key": null
},
"batch_size": 1,
"prompt": ""
},
"model:simple_rag": {
"_path": "superduper.components.model.RAGModel",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"signature": "singleton",
"datatype": null,
"output_schema": null,
"model_update_kwargs": {},
"predict_kwargs": {},
"compute_kwargs": {},
"validation": null,
"metric_values": {},
"num_workers": 0,
"serve": false,
"trainer": null,
"deploy": false,
"prompt_template": "Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\nThese snippets are samples from our internal data-repositories, and should be used exclusively and as a matter of priority to answer the question\n\n{context}\n\nHere's the question: {query}",
"select": "?outputs-chunker-?(listener:chunker.uuid)-select-like-outputs-chunker-?(listener:chunker.uuid)-var-query-vector-index-vectorindex-n-5",
"key": "<var:output_prefix>chunker__?(listener:chunker.uuid)",
"llm": "?model:llm-model"
},
"simple-rag-app": {
"_path": "superduper.components.application.Application",
"upstream": null,
"plugins": null,
"cache": true,
"status": null,
"components": [
"?listener:chunker",
"?vector_index:vectorindex",
"?model:simple_rag"
],
"namespace": null,
"link": null,
"_literals": [
"template"
]
}
},
"_blobs": {},
"_files": {},
"_template_name": "simple_rag"
}

0 comments on commit 5e24f0c

Please sign in to comment.