
Add RAG optimization documentation #217

Merged
merged 1 commit into from
Sep 17, 2024
4 changes: 3 additions & 1 deletion README.md
@@ -166,7 +166,9 @@ To optimize your pipeline, simply define a ``Parameter`` and pass it to our ``Ge
Whether you need to optimize task instructions or few-shot demonstrations,
our unified framework offers an easy way to **diagnose**, **visualize**, **debug**, and **train** your pipeline.

This [Trace Graph](https://adalflow.sylph.ai/tutorials/trace_graph.html) demonstrates how our auto-differentiation works.
This [Dynamic Computation Graph](https://adalflow.sylph.ai/tutorials/trace_graph.html) demonstrates how our auto-differentiation works on the dynamic computation graph.

No need to manually define nodes and edges; AdalFlow will automatically trace the computation graph for you.
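
A minimal sketch of that workflow, assuming the `adal.Parameter` and `adal.Generator` constructor arguments shown here (`role_desc`, `requires_opt`, `template`, `prompt_kwargs`) match your installed AdalFlow version:

```python
import adalflow as adal
from adalflow.components.model_client import OpenAIClient  # assumes an OpenAI API key is configured

# A trainable prompt Parameter; requires_opt=True marks it for the optimizer.
system_prompt = adal.Parameter(
    data="Answer the question using only the provided context.",
    role_desc="Task instruction for the QA generator",
    requires_opt=True,
)

# Passing the Parameter through prompt_kwargs is what lets AdalFlow trace it
# into the computation graph; no nodes or edges are defined by hand.
template = r"""<SYS>{{system_prompt}}</SYS>
Context: {{context}}
Question: {{question}}"""

llm = adal.Generator(
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},
    template=template,
    prompt_kwargs={"system_prompt": system_prompt},
)

# Eval-style call; in training mode the same Generator returns a traced Parameter.
output = llm(prompt_kwargs={"context": "Paris is the capital of France.",
                            "question": "What is the capital of France?"})
print(output)
```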

### **Trainable Task Pipeline**

6 changes: 3 additions & 3 deletions adalflow/adalflow/datasets/hotpot_qa.py
@@ -163,12 +163,12 @@ def __len__(self):


if __name__ == "__main__":
dataset = HotPotQA(root="BBH_object_counting", split="train", size=20)
dataset = HotPotQA(split="train", size=20)
print(dataset[0], type(dataset[0]))
print(len(dataset))
valdataset = HotPotQA(root="BBH_object_counting", split="val", size=50)
valdataset = HotPotQA(split="val", size=50)
print(len(valdataset))
testdataset = HotPotQA(root="BBH_object_counting", split="test", size=50)
testdataset = HotPotQA(split="test", size=50)
print(len(testdataset))
print(f"valdataset[0]: {valdataset[0]}")
print(f"testdataset[0]: {testdataset[0]}")
61 changes: 50 additions & 11 deletions benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py
@@ -1,6 +1,6 @@
"""We will use dspy's retriever to keep that the same and only use our generator and optimizer"""

from typing import List, Optional
from typing import List, Optional, Union
from dataclasses import dataclass, field
import dspy

@@ -166,26 +166,26 @@ def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None):
use_cache=True,
)

# user should just treat it as a call function
# and we will handle the connection between the components
# they should directly pass the retriever_output along with
# each output's successor_map_fn.
# what if it is passed to two different components?
# we can create a copy

def call(self, question: str, id: str = None) -> adal.GeneratorOutput:
if self.training:
raise ValueError(
"This component is not supposed to be called in training mode"
)
# user should just treat it as a call function
# and we will handle the connection between the components
# they should directly pass the retriever_output along with
# each output's successor_map_fn.
# what if it is passed to two different components?
# we can create a copy

retriever_out = self.retriever.call(input=question)

successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x[0].documents) if x and x[0] and x[0].documents else ""
)
retrieved_context = successor_map_fn(retriever_out)

# print(f"retrieved_context: {retrieved_context}")
# print(f"retriever_out: {retriever_out}")
prompt_kwargs = {
"context": retrieved_context,
"question": question,
@@ -196,12 +196,14 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput:
id=id,
)
# self.llm.print_prompt(**prompt_kwargs)
# print(f"retrieved_context: {retrieved_context}")
# print(f"retriever_out: {retriever_out}")
return output

# TODO: add id in the retriever output
def forward(self, question: str, id: str = None) -> adal.Parameter:
if not self.training:
raise ValueError("This component is not supposed to be called in eval mode")
# TODO: add id in the retriever output
retriever_out = self.retriever.forward(input=question)
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x.data[0].documents)
@@ -214,6 +216,42 @@ def forward(self, question: str, id: str = None) -> adal.Parameter:
)
return generator_out

def bicall(
self, question: str, id: str = None
) -> Union[adal.GeneratorOutput, adal.Parameter]:
"""You can also combine both the forward and call in the same function.
Supports both training and eval mode by using __call__ for GradComponents
like Retriever and Generator
"""
retriever_out = self.retriever(input=question)
if isinstance(retriever_out, adal.Parameter):
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x.data[0].documents)
if x.data and x.data[0] and x.data[0].documents
else ""
)
retriever_out.add_successor_map_fn(
successor=self.llm, map_fn=successor_map_fn
)
else:
successor_map_fn = lambda x: ( # noqa E731
"\n\n".join(x[0].documents) if x and x[0] and x[0].documents else ""
)
retrieved_context = successor_map_fn(retriever_out)
prompt_kwargs = {
"context": retrieved_context,
"question": question,
}
output = self.llm(prompt_kwargs=prompt_kwargs, id=id)
return output


def test_retriever():
question = "How many storeys are in the castle that David Gregory inherited?"
retriever = DspyRetriever(top_k=3)
retriever_out = retriever(input=question)
print(f"retriever_out: {retriever_out}")


def test_vailla_rag():

@@ -248,4 +286,5 @@ def test_vailla_rag():


if __name__ == "__main__":
test_vailla_rag()
test_retriever()
# test_vailla_rag()
2 changes: 1 addition & 1 deletion benchmarks/hotpot_qa/adal_exp/train_vanilla.py
@@ -42,7 +42,7 @@ def __init__(
def handle_one_task_sample(
self, sample: HotPotQAData
) -> Tuple[Callable[..., Any], Dict]:
if self.task.training: # TODO: make the components more clear
if self.task.training:
return self.task.forward, {"question": sample.question, "id": sample.id}
else:
return self.task.call, {"question": sample.question, "id": sample.id}
4 changes: 3 additions & 1 deletion docs/source/use_cases/index.rst
@@ -47,7 +47,9 @@ Optimization
* - :doc:`classification`
- Classification with `gpt-3.5-turbo`. The optimized task pipeline performs on-par with `gpt-4o`.
* - :doc:`rag_opt`
- RAG and multi-hop question answering with hotpotqa dataset, two generators, and one retriever, optimizing zero-shot and few-shot learning (coming soon).
- Different from previous tasks where we only used one generator component, this use case on the HotPotQA dataset demonstrates how to optimize a pipeline composed of multiple `GradComponent` instances (a `Retriever` and a `Generator`) in a standard RAG architecture.

.. - RAG and multi-hop question answering with hotpotqa dataset, two generators, and one retriever, optimizing zero-shot and few-shot learning (coming soon).
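
For readers of the new `rag_opt` entry, here is a condensed, illustrative skeleton of the pipeline it describes, mirroring `build_vanilla_rag.py` above; the base class and the way the traced retriever output is passed through `prompt_kwargs` are assumptions, so treat the full file in this PR as authoritative:

```python
import adalflow as adal


class VanillaRAG(adal.Component):  # base class is an assumption; the PR's class may differ
    """Retriever + Generator pipeline with separate eval (call) and training (forward) paths."""

    def __init__(self, retriever, llm):
        super().__init__()
        self.retriever = retriever  # e.g. the DspyRetriever wrapper from this PR
        self.llm = llm              # an adal.Generator

    def call(self, question: str, id: str = None) -> adal.GeneratorOutput:
        # Eval path: plain data flows between components.
        retriever_out = self.retriever.call(input=question)
        context = "\n\n".join(retriever_out[0].documents) if retriever_out else ""
        return self.llm.call(
            prompt_kwargs={"context": context, "question": question}, id=id
        )

    def forward(self, question: str, id: str = None) -> adal.Parameter:
        # Training path: outputs are Parameters, so AdalFlow can trace and train the graph.
        retriever_out = self.retriever.forward(input=question)
        retriever_out.add_successor_map_fn(
            successor=self.llm,
            map_fn=lambda x: "\n\n".join(x.data[0].documents) if x.data else "",
        )
        return self.llm.forward(
            prompt_kwargs={"context": retriever_out, "question": question}, id=id
        )
```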


