Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Knowledge #1567

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: uv python install 3.11.9

- name: Install the project
run: uv sync --dev
run: uv sync --dev --all-extras

- name: Run tests
run: uv run pytest tests
32 changes: 32 additions & 0 deletions path/to/src/crewai/knowledge/source/base_knowledge_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder


class BaseKnowledgeSource(ABC):
    """Common interface shared by every knowledge-source implementation.

    Concrete subclasses load raw content, split it into chunks using the
    configured chunk size/overlap, and expose semantic search over it.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Chunking configuration applied when the loaded content is split.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Text chunks produced from the source content; filled by subclasses.
        self.chunks: List[str] = []

    @abstractmethod
    def load_content(self):
        """Load and preprocess content from the source."""
        ...

    @abstractmethod
    def add(self, embedder: BaseEmbedder) -> None:
        """Add content to the knowledge base, chunk it, and compute embeddings."""
        ...

    @abstractmethod
    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """Query the knowledge base using semantic search."""
        ...
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ Repository = "https://github.com/crewAIInc/crewAI"
[project.optional-dependencies]
tools = ["crewai-tools>=0.14.0"]
agentops = ["agentops>=0.3.0"]
fastembed = ["fastembed>=0.4.1"]
pdfplumber = [
"pdfplumber>=0.11.4",
]
pandas = [
"pandas>=2.2.3",
]
openpyxl = [
"openpyxl>=3.1.5",
]
mem0 = ["mem0ai>=0.1.29"]

[tool.uv]
Expand Down
14 changes: 13 additions & 1 deletion src/crewai/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import warnings

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
from crewai.pipeline import Pipeline
from crewai.process import Process
Expand All @@ -15,4 +17,14 @@
module="pydantic.main",
)
__version__ = "0.79.4"
__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
__all__ = [
"Agent",
"Crew",
"Process",
"Task",
"Pipeline",
"Router",
"LLM",
"Flow",
"Knowledge",
]
25 changes: 24 additions & 1 deletion src/crewai/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.cli.constants import ENV_VARS
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools import BaseTool
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.token_counter_callback import TokenCalcHandler
Expand Down Expand Up @@ -52,6 +54,7 @@ class Agent(BaseAgent):
role: The role of the agent.
goal: The objective of the agent.
backstory: The backstory of the agent.
knowledge: The knowledge base of the agent.
config: Dict representation of agent configuration.
llm: The language model that will run the agent.
function_calling_llm: The language model that will handle the tool calling for this agent, it overrides the crew function_calling_llm.
Expand Down Expand Up @@ -85,6 +88,10 @@ class Agent(BaseAgent):
llm: Union[str, InstanceOf[LLM], Any] = Field(
description="Language model that will run the agent.", default=None
)
knowledge_sources: Optional[List[BaseKnowledgeSource]] = Field(
default=None,
description="Knowledge sources for the agent.",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to declare this on the Crew class rather than defining it here at the agent level: the task prompt would query the relevant knowledge at the crew level, trickling the results down to the agent.

)
function_calling_llm: Optional[Any] = Field(
description="Language model that will run the agent.", default=None
)
Expand Down Expand Up @@ -119,6 +126,8 @@ class Agent(BaseAgent):
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
)
# TODO: Lorenze add knowledge_embedder. Support direct class or config dict.
_knowledge: Optional[Knowledge] = PrivateAttr(default=None)

@model_validator(mode="after")
def post_init_setup(self):
Expand Down Expand Up @@ -227,6 +236,12 @@ def post_init_setup(self):
if self.allow_code_execution:
self._validate_docker_installation()

# Initialize the Knowledge object if knowledge_sources are provided
if self.knowledge_sources:
self._knowledge = Knowledge(sources=self.knowledge_sources)
else:
self._knowledge = None

return self

def _setup_agent_executor(self):
Expand Down Expand Up @@ -272,6 +287,14 @@ def execute_task(
if memory.strip() != "":
task_prompt += self.i18n.slice("memory").format(memory=memory)

# Integrate the knowledge base
if self._knowledge:
# Query the knowledge base for relevant information
knowledge_snippets = self._knowledge.query(query=task.prompt())
if knowledge_snippets:
formatted_knowledge = "\n".join(knowledge_snippets)
task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}"

tools = tools or self.tools or []
self.create_agent_executor(tools=tools, task=task)

Expand Down
5 changes: 3 additions & 2 deletions src/crewai/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,15 @@ def log_tasks_outputs() -> None:
@click.option("-l", "--long", is_flag=True, help="Reset LONG TERM memory")
@click.option("-s", "--short", is_flag=True, help="Reset SHORT TERM memory")
@click.option("-e", "--entities", is_flag=True, help="Reset ENTITIES memory")
@click.option("-kn", "--knowledge", is_flag=True, help="Reset KNOWLEDGE")
@click.option(
"-k",
"--kickoff-outputs",
is_flag=True,
help="Reset LATEST KICKOFF TASK OUTPUTS",
)
@click.option("-a", "--all", is_flag=True, help="Reset ALL memories")
def reset_memories(long, short, entities, kickoff_outputs, all):
def reset_memories(long, short, entities, knowledge, kickoff_outputs, all):
"""
Reset the crew memories (long, short, entity, latest_crew_kickoff_ouputs). This will delete all the data saved.
"""
Expand All @@ -153,7 +154,7 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
"Please specify at least one memory type to reset using the appropriate flags."
)
return
reset_memories_command(long, short, entities, kickoff_outputs, all)
reset_memories_command(long, short, entities, knowledge, kickoff_outputs, all)
except Exception as e:
click.echo(f"An error occurred while resetting memories: {e}", err=True)

Expand Down
10 changes: 9 additions & 1 deletion src/crewai/cli/reset_memories_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
from crewai.memory.long_term.long_term_memory import LongTermMemory
from crewai.memory.short_term.short_term_memory import ShortTermMemory
from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage


def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
def reset_memories_command(
long, short, entity, kickoff_outputs, all, knowledge
) -> None:
"""
Reset the crew memories.

Expand All @@ -17,6 +20,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
entity (bool): Whether to reset the entity memory.
kickoff_outputs (bool): Whether to reset the latest kickoff task outputs.
all (bool): Whether to reset all memories.
knowledge (bool): Whether to reset the knowledge.
"""

try:
Expand All @@ -25,6 +29,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
EntityMemory().reset()
LongTermMemory().reset()
TaskOutputStorageHandler().reset()
KnowledgeStorage().reset()
click.echo("All memories have been reset.")
else:
if long:
Expand All @@ -40,6 +45,9 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
if kickoff_outputs:
TaskOutputStorageHandler().reset()
click.echo("Latest Kickoff outputs stored has been reset.")
if knowledge:
KnowledgeStorage().reset()
click.echo("Knowledge has been reset.")

except subprocess.CalledProcessError as e:
click.echo(f"An error occurred while resetting the memories: {e}", err=True)
Expand Down
Empty file.
Empty file.
55 changes: 55 additions & 0 deletions src/crewai/knowledge/embedder/base_embedder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from typing import List

import numpy as np


class BaseEmbedder(ABC):
    """
    Contract for text-embedding backends.

    Implementations turn strings into numeric vectors and report the
    dimensionality of those vectors.
    """

    @abstractmethod
    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            Array of embeddings
        """
        ...

    @abstractmethod
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to embed

        Returns:
            Array of embeddings
        """
        ...

    @abstractmethod
    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        ...
93 changes: 93 additions & 0 deletions src/crewai/knowledge/embedder/fastembed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from pathlib import Path
from typing import List, Optional, Union

import numpy as np

from .base_embedder import BaseEmbedder

try:
from fastembed_gpu import TextEmbedding # type: ignore

FASTEMBED_AVAILABLE = True
except ImportError:
try:
from fastembed import TextEmbedding

FASTEMBED_AVAILABLE = True
except ImportError:
FASTEMBED_AVAILABLE = False


class FastEmbed(BaseEmbedder):
    """
    A wrapper class for text embedding models using FastEmbed
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        cache_dir: Optional[Union[str, Path]] = None,
    ):
        """
        Initialize the embedding model

        Args:
            model_name: Name of the model to use
            cache_dir: Directory to cache the model

        Raises:
            ImportError: If neither fastembed nor fastembed-gpu is installed.
        """
        if not FASTEMBED_AVAILABLE:
            raise ImportError(
                "FastEmbed is not installed. Please install it with: "
                "uv pip install fastembed or uv pip install fastembed-gpu for GPU support"
            )

        self.model = TextEmbedding(
            model_name=model_name,
            cache_dir=str(cache_dir) if cache_dir else None,
        )
        # Cached embedding dimension; computed lazily on first access of
        # `dimension` so we only pay for a probe embedding once.
        self._dimension: Optional[int] = None

    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            List of embeddings
        """
        # Chunks are plain strings, so this is exactly embed_texts;
        # delegate instead of duplicating the call to the model.
        return self.embed_texts(chunks)

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings
        """
        embeddings = list(self.model.embed(texts))
        return embeddings

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        return self.embed_texts([text])[0]

    @property
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        # Embed a probe string once and cache the result: the model's
        # output dimension never changes, so repeated accesses should not
        # re-run the embedder.
        if self._dimension is None:
            self._dimension = len(self.embed_text("test"))
        return self._dimension
Loading