
Commit

Merge pull request #217 from SylphAI-Inc/li
add rag optimization documentation
Sylph-AI authored Sep 17, 2024
2 parents 030c128 + 54f5228 commit 7752729
Showing 6 changed files with 443 additions and 18 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -166,7 +166,9 @@ To optimize your pipeline, simply define a ``Parameter`` and pass it to our ``Ge
Whether you need to optimize task instructions or few-shot demonstrations,
our unified framework offers an easy way to **diagnose**, **visualize**, **debug**, and **train** your pipeline.

This [Trace Graph](https://adalflow.sylph.ai/tutorials/trace_graph.html) demonstrates how our auto-differentiation works.
This [Dynamic Computation Graph](https://adalflow.sylph.ai/tutorials/trace_graph.html) demonstrates how our auto-differentiation and the dynamic computation graph work.

No need to manually define nodes and edges; AdalFlow will automatically trace the computation graph for you.

### **Trainable Task Pipeline**

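The README passage above says that wrapping a prompt piece in a ``Parameter`` and passing it to a ``Generator`` is enough for AdalFlow to trace the computation graph. A minimal sketch of that idea, assuming `adal.Parameter`/`adal.Generator` constructor arguments and the `OpenAIClient` import path roughly as used elsewhere in this PR (illustrative, not taken from this diff):

import adalflow as adal
from adalflow.components.model_client import OpenAIClient  # assumed import path

# Wrapping the instruction in a Parameter marks it as trainable and lets
# AdalFlow trace it into the dynamic computation graph.
system_prompt = adal.Parameter(
    data="Answer the question in one short sentence.",
    role_desc="Task instruction for the QA generator",
    requires_opt=True,
)

generator = adal.Generator(
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},
    prompt_kwargs={"system_prompt": system_prompt},
)

# In eval mode this returns a GeneratorOutput; in training mode the same call
# returns an adal.Parameter whose predecessors form the traced graph, with no
# manual node or edge definitions.
output = generator(prompt_kwargs={"input_str": "What is 2 + 2?"})
print(output.data)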
6 changes: 3 additions & 3 deletions adalflow/adalflow/datasets/hotpot_qa.py
@@ -163,12 +163,12 @@ def __len__(self):


if __name__ == "__main__":
dataset = HotPotQA(root="BBH_object_counting", split="train", size=20)
dataset = HotPotQA(split="train", size=20)
print(dataset[0], type(dataset[0]))
print(len(dataset))
valdataset = HotPotQA(root="BBH_object_counting", split="val", size=50)
valdataset = HotPotQA(split="val", size=50)
print(len(valdataset))
testdataset = HotPotQA(root="BBH_object_counting", split="test", size=50)
testdataset = HotPotQA(split="test", size=50)
print(len(testdataset))
print(f"valdataset[0]: {valdataset[0]}")
print(f"testdataset[0]: {testdataset[0]}")
61 changes: 50 additions & 11 deletions benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py
@@ -1,6 +1,6 @@
"""We will use dspy's retriever to keep that the same and only use our generator and optimizer"""

from typing import List, Optional
from typing import List, Optional, Union
from dataclasses import dataclass, field
import dspy

@@ -166,26 +166,26 @@ def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None):
use_cache=True,
)

# user should just treat it as a call function
# and we will handle the connection between the components
# they should directly pass the retriever_output along with
# each output's successor_map_fn.
# what if it is passed to two different components?
# we can create a copy

def call(self, question: str, id: str = None) -> adal.GeneratorOutput:
if self.training:
raise ValueError(
"This component is not supposed to be called in training mode"
)
# user should just treat it as a call function
# and we will handle the connection between the components
# they should directly pass the retriever_output along with
# each output's successor_map_fn.
# what if it is passed to two different components?
# we can create a copy

retriever_out = self.retriever.call(input=question)

successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x[0].documents) if x and x[0] and x[0].documents else ""
)
retrieved_context = successor_map_fn(retriever_out)

# print(f"retrieved_context: {retrieved_context}")
# print(f"retriever_out: {retriever_out}")
prompt_kwargs = {
"context": retrieved_context,
"question": question,
@@ -196,12 +196,14 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput:
id=id,
)
# self.llm.print_prompt(**prompt_kwargs)
# print(f"retrieved_context: {retrieved_context}")
# print(f"retriever_out: {retriever_out}")
return output

# TODO: add id in the retriever output
def forward(self, question: str, id: str = None) -> adal.Parameter:
if not self.training:
raise ValueError("This component is not supposed to be called in eval mode")
# TODO: add id in the retriever output
retriever_out = self.retriever.forward(input=question)
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x.data[0].documents)
@@ -214,6 +216,42 @@ def forward(self, question: str, id: str = None) -> adal.Parameter:
)
return generator_out

def bicall(
self, question: str, id: str = None
) -> Union[adal.GeneratorOutput, adal.Parameter]:
"""You can also combine both the forward and call in the same function.
Supports both training and eval mode by using __call__ for GradComponents
like Retriever and Generator
"""
retriever_out = self.retriever(input=question)
if isinstance(retriever_out, adal.Parameter):
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x.data[0].documents)
if x.data and x.data[0] and x.data[0].documents
else ""
)
retriever_out.add_successor_map_fn(
successor=self.llm, map_fn=successor_map_fn
)
else:
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x[0].documents) if x and x[0] and x[0].documents else ""
)
retrieved_context = successor_map_fn(retriever_out)
prompt_kwargs = {
"context": retrieved_context,
"question": question,
}
output = self.llm(prompt_kwargs=prompt_kwargs, id=id)
return output


def test_retriever():
question = "How many storeys are in the castle that David Gregory inherited?"
retriever = DspyRetriever(top_k=3)
retriever_out = retriever(input=question)
print(f"retriever_out: {retriever_out}")


def test_vailla_rag():

@@ -248,4 +286,5 @@ def test_vailla_rag():


if __name__ == "__main__":
test_vailla_rag()
test_retriever()
# test_vailla_rag()
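For orientation, a sketch of how the task class defined in this file might be driven in both modes. The class name `VanillaRAG`, the `OpenAIClient` wiring, and the PyTorch-style `train()` toggle are assumptions; the diff only shows the `__init__` signature, the `call`/`forward`/`bicall` methods, and the `self.training` checks.

from adalflow.components.model_client import OpenAIClient  # assumed import path

# Hypothetical class name; only the __init__ signature is visible in this hunk.
task = VanillaRAG(
    passages_per_hop=3,
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},
)

question = "How many storeys are in the castle that David Gregory inherited?"

# Eval mode: call() returns an adal.GeneratorOutput holding the final answer.
answer = task.call(question=question, id="q-0")
print(answer.data)

# Training mode: forward() returns an adal.Parameter that carries the traced
# retriever -> generator graph for the optimizer; bicall() handles both modes
# by checking whether the retriever returned a Parameter.
task.train()  # assumed PyTorch-style mode switch
param = task.forward(question=question, id="q-0")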
2 changes: 1 addition & 1 deletion benchmarks/hotpot_qa/adal_exp/train_vanilla.py
@@ -42,7 +42,7 @@ def __init__(
def handle_one_task_sample(
self, sample: HotPotQAData
) -> Tuple[Callable[..., Any], Dict]:
if self.task.training: # TODO: make the components more clear
if self.task.training:
return self.task.forward, {"question": sample.question, "id": sample.id}
else:
return self.task.call, {"question": sample.question, "id": sample.id}
4 changes: 3 additions & 1 deletion docs/source/use_cases/index.rst
@@ -47,7 +47,9 @@ Optimization
* - :doc:`classification`
- Classification with `gpt-3.5-turbo`. The optimized task pipeline performs on-par with `gpt-4o`.
* - :doc:`rag_opt`
- RAG and multi-hop question answering with hotpotqa dataset, two generators, and one retriever, optimizing zero-shot and few-shot learning (coming soon).
- Different from previous tasks, where we used only one generator component, this use case on the HotpotQA dataset demonstrates how to optimize a pipeline composed of multiple `GradComponent`s (`Retriever` & `Generator`) in a standard RAG architecture.

.. - RAG and multi-hop question answering with hotpotqa dataset, two generators, and one retriever, optimizing zero-shot and few-shot learning (coming soon).
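The row above describes optimizing that multi-component RAG pipeline end to end. A rough sketch of the training entry point this implies, reusing the `HotPotQA` splits from this PR; the `adal.Trainer` arguments and the `VanillaRAGAdal`/`OpenAIClient` names are assumptions rather than code from this diff:

import adalflow as adal
from adalflow.datasets.hotpot_qa import HotPotQA
from adalflow.components.model_client import OpenAIClient  # assumed import path

trainset = HotPotQA(split="train", size=20)
valset = HotPotQA(split="val", size=50)
testset = HotPotQA(split="test", size=50)

# Hypothetical name for the AdalComponent in train_vanilla.py; its
# handle_one_task_sample (shown above) routes each sample to task.forward in
# training mode and task.call in eval mode.
adal_task = VanillaRAGAdal(
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},
)

trainer = adal.Trainer(adaltask=adal_task, max_steps=12)  # arguments are assumptions
trainer.fit(train_dataset=trainset, val_dataset=valset, test_dataset=testset)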
[Diff for the remaining changed file did not load on the page.]
