Merge pull request #216 from SylphAI-Inc/main
[v0.2.3.beta.1] Optimize any llm task pipeline & RAG playbook v1
Sylph-AI committed Sep 17, 2024
2 parents 82a7878 + 030c128 commit 9b7038b
Showing 57 changed files with 2,516 additions and 1,063 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -36,3 +36,4 @@ index.faiss
*.svg
# ignore the softlink to adalflow cache
*.adalflow
extend/
11 changes: 9 additions & 2 deletions README.md
Expand Up @@ -29,6 +29,7 @@
<a href="https://adalflow.sylph.ai/apis/components/components.model_client.html">Models</a> |
<a href="https://adalflow.sylph.ai/apis/components/components.retriever.html">Retrievers</a> |
<a href="https://adalflow.sylph.ai/apis/components/components.agent.html">Agents</a> |
<a href="https://adalflow.sylph.ai/tutorials/evaluation.html"> LLM evaluation</a> |
<a href="https://adalflow.sylph.ai/use_cases/question_answering.html">Trainer & Optimizers</a>
<p>
</h4>
@@ -212,7 +213,7 @@ AdalFlow full documentation available at [adalflow.sylph.ai](https://adalflow.sylph.ai/)

# AdalFlow: A Tribute to Ada Lovelace

AdalFlow is named in honor of [Ada Lovelace](https://en.wikipedia.org/wiki/Ada_Lovelace), the pioneering female mathematician who first recognized that machines could do more than just calculations. As a female-led team, we aim to inspire more women to enter the AI field.
AdalFlow is named in honor of [Ada Lovelace](https://en.wikipedia.org/wiki/Ada_Lovelace), the pioneering female mathematician who first recognized that machines could do more than just calculations. As a team led by a female founder, we aim to inspire more women to enter the AI field.

# Contributors

@@ -238,6 +239,12 @@ Many existing works greatly inspired AdalFlow library! Here is a non-exhaustive
month = {7},
year = {2024},
doi = {10.5281/zenodo.12639531},
url = {https://github.com/SylphAI-Inc/LightRAG}
url = {https://github.com/SylphAI-Inc/AdalFlow}
}
```

# Star History

[![Star History Chart](https://api.star-history.com/svg?repos=SylphAI-Inc/AdalFlow&type=Date)](https://star-history.com/#SylphAI-Inc/AdalFlow&Date)
<!--
<a href="https://trendshift.io/repositories/11559" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11559" alt="SylphAI-Inc%2FAdalFlow | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> -->
13 changes: 13 additions & 0 deletions adalflow/CHANGELOG.md
@@ -1,3 +1,16 @@
## [0.2.3.beta.1] - 2024-09-17
### Removed
- Removed `/reasoning`, as chain-of-thought (CoT) is too simple to warrant a separate module.
### Fixed
- datasets/hotpotqa.py
- eval/answer_match_acc: applied `lower()` to both the ground truth and the prediction in the fuzzy match. On HotpotQA, accuracy goes from 0.15 to 0.4 on one test.
- eval/functional: fixed `confidence_interval` so the confidence level can be customized.
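The lowercasing fix can be sketched roughly as follows; `fuzzy_match` and its containment heuristic are hypothetical stand-ins, since the actual `answer_match_acc` implementation is not shown in this diff:

```python
def fuzzy_match(pred: str, gt: str) -> bool:
    # Normalize case on both sides first -- this is the fix:
    # without lower(), "Paris" vs "paris" counts as a miss.
    pred, gt = pred.strip().lower(), gt.strip().lower()
    # Hypothetical containment heuristic for "fuzzy" matching.
    return gt in pred or pred in gt

print(fuzzy_match("The answer is Paris.", "Paris"))  # True
print(fuzzy_match("Paris", "London"))                # False
```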

### Added
Auto-grad system extended to support the retriever and any other component:
- `GradComponent` has a default `forward` that wraps `call` to handle auto-grad automatically for any component subclassing `GradComponent`.
- Clarified `ParamType` to include `input`, `output`, and `hyperparam`, instead of following PyTorch's tensor and Parameter design pattern.
- Updated the `TraceGraph` of the `Parameter` in `draw_graph` to support `ParamType`.
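The `GradComponent` wrapping pattern described above can be illustrated with a minimal sketch; the tracing details here are simplified stand-ins, not AdalFlow's actual auto-grad machinery:

```python
class GradComponent:
    """Sketch: a default forward() that wraps the subclass's call().

    A subclass only implements call(); forward() adds the (heavily
    simplified) auto-grad bookkeeping around it.
    """

    def call(self, *args, **kwargs):
        raise NotImplementedError("subclasses implement call()")

    def forward(self, *args, **kwargs):
        output = self.call(*args, **kwargs)  # run the user's logic
        # Stand-in for wrapping the output in a traced Parameter node.
        return {"data": output, "requires_opt": True}


class ToyRetriever(GradComponent):
    def call(self, query: str):
        return [f"doc about {query}"]


print(ToyRetriever().forward("llms"))
# {'data': ['doc about llms'], 'requires_opt': True}
```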
## [0.2.2] - 2024-09-09
### Added
- Added `get_cache_path`: instead of printing the cache path every time, call ``get_cache_path`` to retrieve it.
4 changes: 3 additions & 1 deletion adalflow/adalflow/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.2.2"
__version__ = "0.2.3.beta.1"

from adalflow.core.component import Component, fun_to_component
from adalflow.core.container import Sequential
@@ -24,6 +24,7 @@
ListParser,
BooleanParser,
)
from adalflow.core.retriever import Retriever
from adalflow.components.output_parsers import (
YamlOutputParser,
JsonOutputParser,
@@ -70,6 +71,7 @@
"ModelClient",
"Generator",
"Embedder",
"Retriever",
"Parameter",
"AdalComponent",
"Trainer",
1 change: 0 additions & 1 deletion adalflow/adalflow/components/reasoning/__init__.py

This file was deleted.

98 changes: 0 additions & 98 deletions adalflow/adalflow/components/reasoning/chain_of_thought.py

This file was deleted.

18 changes: 0 additions & 18 deletions adalflow/adalflow/core/README.md

This file was deleted.

20 changes: 13 additions & 7 deletions adalflow/adalflow/core/base_data_class.py
@@ -27,7 +27,15 @@
represent_ordereddict,
)


__all__ = [
"DataClass",
"DataClassFormatType",
"required_field",
"ExcludeType",
"IncludeType",
"check_adal_dataclass",
"DynamicDataClassFactory",
]
logger = logging.getLogger(__name__)


@@ -125,7 +133,8 @@ class DataClass:
Overall, we have a unified class method :meth:`format_str` to generate formatted output based on the type of operation and class/instance context.
note::
You do not need to use our format; overwrite any method in the subclass to fit your needs.
1. Avoid using Optional[Type] for field types, as dataclass already distinguishes between optional and required fields via the default value.
2. If you need to customize, subclass and overwrite any method to fit your needs.
Loading data:
@@ -176,8 +185,8 @@ class MyOutputs(DataClass):
# name: John Doe
"""
__input_fields__ = []
__output_fields__ = []
__input_fields__: List[str] = []
__output_fields__: List[str] = []

def __post_init__(self):

@@ -687,9 +696,6 @@ def format_example_str(
else:
raise ValueError(f"Unsupported format type: {format_type}")

# TODO:support Generic[Type[T]] for the type of fields
# it will automatically use __type_var_map__ attribute


def check_adal_dataclass(data_class: Type) -> None:
"""Check if the provided class is a valid dataclass for the AdalFlow framework.
7 changes: 2 additions & 5 deletions adalflow/adalflow/core/db.py
@@ -23,7 +23,7 @@
# TODO: DB clarity can be further improved
@dataclass
class LocalDB(Generic[T], Component):
__doc__ = r"""LocalDB with in-memory CRUD operations, data transformation/processing pipelines, and persistence.
__doc__ = """LocalDB with in-memory CRUD operations, data transformation/processing pipelines, and persistence.
LocalDB is highly flexible.
1. It can store any type of data items in the `items` attribute.
@@ -126,10 +126,7 @@ def length(self):
def get_transformer_keys(self) -> List[str]:
return list(self.transformed_items.keys())

# def get_transformed_data(self, key: str) -> List[U]:
# """Get the transformed items by key."""
# return self.transformed_items[key]

# TODO: combine this to fetch_transformed_items
def get_transformed_data(
self, key: str, filter_fn: Callable[[Any], bool] = lambda x: True
) -> List[U]:
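The new `filter_fn` parameter on `get_transformed_data` can be exercised like this; `LocalDB` itself is stood in by a plain dict, so this is only a behavioral sketch:

```python
from typing import Any, Callable, List

def get_transformed_data(
    transformed_items: dict,
    key: str,
    filter_fn: Callable[[Any], bool] = lambda x: True,
) -> List[Any]:
    # Mirror of the new signature: fetch items transformed under `key`,
    # keeping only those that pass filter_fn (the default keeps everything).
    return [item for item in transformed_items[key] if filter_fn(item)]

db = {"split_docs": [{"tokens": 5}, {"tokens": 50}]}
print(get_transformed_data(db, "split_docs"))                              # both items
print(get_transformed_data(db, "split_docs", lambda d: d["tokens"] > 10))  # long one only
```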
1 change: 1 addition & 0 deletions adalflow/adalflow/core/embedder.py
@@ -15,6 +15,7 @@
from adalflow.core.component import Component
import adalflow.core.functional as F

__all__ = ["Embedder", "BatchEmbedder"]

log = logging.getLogger(__name__)

29 changes: 19 additions & 10 deletions adalflow/adalflow/core/generator.py
@@ -41,6 +41,9 @@
OBJECTIVE_INSTRUCTION_CHAIN,
)

__all__ = ["Generator", "BackwardEngine", "create_teacher_generator"]


log = logging.getLogger(__name__)

PromptArgType = Dict[str, Union[str, Parameter]]
@@ -66,7 +69,8 @@ class Generator(GradComponent, CachedEngine, CallbackManager):
trainable_params (Optional[List[str]], optional): The list of trainable parameters. Defaults to [].
Note:
The output_processors will be applied to the string output of the model completion. And the result will be stored in the data field of the output. And we encourage you to only use it to parse the response to data format you will use later.
The output_processors will be applied to the string output of the model completion, and the result will be stored in the data field of the output.
We encourage you to use it only to parse the response into the data format you will use later.
"""

model_type: ModelType = ModelType.LLM
@@ -264,6 +268,7 @@ def _compose_model_kwargs(self, **model_kwargs) -> Dict:
combined_model_kwargs.update(model_kwargs)
return combined_model_kwargs

# TODO: use prompt_kwargs as users are already familiar with it
def print_prompt(self, **kwargs) -> str:
return self.prompt.print_prompt(**kwargs)

@@ -334,7 +339,8 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any:
raise e

##############################################################################################################
### Forward and backwards, and teacher generator are for training
### Forward, backward, the teacher generator, and demo data instance creation
# are for training and backpropagation
##############################################################################################################

def create_demo_data_instance(
@@ -343,6 +349,10 @@
output: GeneratorOutput,
id: Optional[str] = None,
):
r"""Automatically create a demo data instance from the input and output of the generator.
Used to trace the demos for the demo parameter in the prompt_kwargs.
Part of few-shot learning.
"""
from adalflow.core.base_data_class import DynamicDataClassFactory

# map the input fields
@@ -352,7 +362,10 @@
)

for k, v in input_prompt_kwargs.items():
demo_data[k] = v
if isinstance(v, Parameter):
demo_data[k] = v.map_to_successor(self)
else:
demo_data[k] = v
# map the output fields
for key, value in demo_data_class_output_mapping.items():
demo_data[key] = value(output)
@@ -473,15 +486,10 @@ def forward(
raise ValueError(
"ID is required for tracing. Please pass it to your Generator call."
)
input_prompt_kwargs = {
k: v.data if isinstance(v, Parameter) else v
for k, v in prompt_kwargs.items()
}

demo = self.create_demo_data_instance(
input_prompt_kwargs,
prompt_kwargs,
output,
# self._demo_data_class_output_mapping,
id=id,
)
demo_param.add_to_trace(demo, is_teacher=self.teacher_mode)
@@ -842,7 +850,8 @@ def _extra_repr(self) -> str:

def to_dict(self) -> Dict[str, Any]:
r"""Convert the generator to a dictionary."""
# exclude default functions
# TODO: exclude default functions
return super().to_dict()

@staticmethod
def failure_message_to_backward_engine(
