Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Knowledge #1567

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: uv python install 3.11.9

- name: Install the project
run: uv sync --dev
run: uv sync --dev --all-extras

- name: Run tests
run: uv run pytest tests
32 changes: 32 additions & 0 deletions path/to/src/crewai/knowledge/source/base_knowledge_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder


class BaseKnowledgeSource(ABC):
    """Common interface shared by every knowledge-source implementation.

    Concrete subclasses load raw content, split it into chunks using the
    configured chunk size/overlap, and expose semantic search over it.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Chunking configuration applied when the loaded content is split.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Text chunks produced from the source content; filled by subclasses.
        self.chunks: List[str] = []

    @abstractmethod
    def load_content(self):
        """Load and preprocess content from the source."""
        ...

    @abstractmethod
    def add(self, embedder: BaseEmbedder) -> None:
        """Add content to the knowledge base, chunk it, and compute embeddings."""
        ...

    @abstractmethod
    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """Query the knowledge base using semantic search."""
        ...
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ Repository = "https://github.com/crewAIInc/crewAI"
[project.optional-dependencies]
tools = ["crewai-tools>=0.14.0"]
agentops = ["agentops>=0.3.0"]
fastembed = ["fastembed>=0.4.1"]
pdfplumber = [
"pdfplumber>=0.11.4",
]
pandas = [
"pandas>=2.2.3",
]
openpyxl = [
"openpyxl>=3.1.5",
]
mem0 = ["mem0ai>=0.1.29"]

[tool.uv]
Expand Down
14 changes: 13 additions & 1 deletion src/crewai/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import warnings

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
from crewai.pipeline import Pipeline
from crewai.process import Process
Expand All @@ -15,4 +17,14 @@
module="pydantic.main",
)
__version__ = "0.79.4"
__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
__all__ = [
"Agent",
"Crew",
"Process",
"Task",
"Pipeline",
"Router",
"LLM",
"Flow",
"Knowledge",
]
25 changes: 24 additions & 1 deletion src/crewai/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.cli.constants import ENV_VARS
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools import BaseTool
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.token_counter_callback import TokenCalcHandler
Expand Down Expand Up @@ -52,6 +54,7 @@ class Agent(BaseAgent):
role: The role of the agent.
goal: The objective of the agent.
backstory: The backstory of the agent.
knowledge: The knowledge base of the agent.
config: Dict representation of agent configuration.
llm: The language model that will run the agent.
function_calling_llm: The language model that will handle the tool calling for this agent, it overrides the crew function_calling_llm.
Expand Down Expand Up @@ -85,6 +88,10 @@ class Agent(BaseAgent):
llm: Union[str, InstanceOf[LLM], Any] = Field(
description="Language model that will run the agent.", default=None
)
knowledge_sources: Optional[List[BaseKnowledgeSource]] = Field(
default=None,
description="Knowledge sources for the agent.",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to declare this on the Crew class rather than defining it here at the agent level: the task prompt would query the relevant knowledge at the crew level, trickling the results down to the agent.

)
function_calling_llm: Optional[Any] = Field(
description="Language model that will run the agent.", default=None
)
Expand Down Expand Up @@ -119,6 +126,8 @@ class Agent(BaseAgent):
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
)
# TODO: Lorenze add knowledge_embedder. Support direct class or config dict.
_knowledge: Optional[Knowledge] = PrivateAttr(default=None)

@model_validator(mode="after")
def post_init_setup(self):
Expand Down Expand Up @@ -227,6 +236,12 @@ def post_init_setup(self):
if self.allow_code_execution:
self._validate_docker_installation()

# Initialize the Knowledge object if knowledge_sources are provided
if self.knowledge_sources:
self._knowledge = Knowledge(sources=self.knowledge_sources)
else:
self._knowledge = None

return self

def _setup_agent_executor(self):
Expand Down Expand Up @@ -272,6 +287,14 @@ def execute_task(
if memory.strip() != "":
task_prompt += self.i18n.slice("memory").format(memory=memory)

# Integrate the knowledge base
if self._knowledge:
# Query the knowledge base for relevant information
knowledge_snippets = self._knowledge.query(query=task.prompt())
if knowledge_snippets:
formatted_knowledge = "\n".join(knowledge_snippets)
task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}"

tools = tools or self.tools or []
self.create_agent_executor(tools=tools, task=task)

Expand Down
5 changes: 3 additions & 2 deletions src/crewai/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,15 @@ def log_tasks_outputs() -> None:
@click.option("-l", "--long", is_flag=True, help="Reset LONG TERM memory")
@click.option("-s", "--short", is_flag=True, help="Reset SHORT TERM memory")
@click.option("-e", "--entities", is_flag=True, help="Reset ENTITIES memory")
@click.option("-kn", "--knowledge", is_flag=True, help="Reset KNOWLEDGE")
@click.option(
"-k",
"--kickoff-outputs",
is_flag=True,
help="Reset LATEST KICKOFF TASK OUTPUTS",
)
@click.option("-a", "--all", is_flag=True, help="Reset ALL memories")
def reset_memories(long, short, entities, kickoff_outputs, all):
def reset_memories(long, short, entities, knowledge, kickoff_outputs, all):
"""
Reset the crew memories (long, short, entity, latest_crew_kickoff_ouputs). This will delete all the data saved.
"""
Expand All @@ -153,7 +154,7 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
"Please specify at least one memory type to reset using the appropriate flags."
)
return
reset_memories_command(long, short, entities, kickoff_outputs, all)
reset_memories_command(long, short, entities, knowledge, kickoff_outputs, all)
except Exception as e:
click.echo(f"An error occurred while resetting memories: {e}", err=True)

Expand Down
10 changes: 9 additions & 1 deletion src/crewai/cli/reset_memories_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
from crewai.memory.long_term.long_term_memory import LongTermMemory
from crewai.memory.short_term.short_term_memory import ShortTermMemory
from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage


def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
def reset_memories_command(
long, short, entity, kickoff_outputs, all, knowledge
) -> None:
"""
Reset the crew memories.

Expand All @@ -17,6 +20,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
entity (bool): Whether to reset the entity memory.
kickoff_outputs (bool): Whether to reset the latest kickoff task outputs.
all (bool): Whether to reset all memories.
knowledge (bool): Whether to reset the knowledge.
"""

try:
Expand All @@ -25,6 +29,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
EntityMemory().reset()
LongTermMemory().reset()
TaskOutputStorageHandler().reset()
KnowledgeStorage().reset()
click.echo("All memories have been reset.")
else:
if long:
Expand All @@ -40,6 +45,9 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
if kickoff_outputs:
TaskOutputStorageHandler().reset()
click.echo("Latest Kickoff outputs stored has been reset.")
if knowledge:
KnowledgeStorage().reset()
click.echo("Knowledge has been reset.")

except subprocess.CalledProcessError as e:
click.echo(f"An error occurred while resetting the memories: {e}", err=True)
Expand Down
Empty file.
Empty file.
55 changes: 55 additions & 0 deletions src/crewai/knowledge/embedder/base_embedder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from typing import List

import numpy as np


class BaseEmbedder(ABC):
    """
    Contract for text-embedding backends.

    Implementations turn strings into numeric vectors and report the
    dimensionality of those vectors.
    """

    @abstractmethod
    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            Array of embeddings
        """
        ...

    @abstractmethod
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to embed

        Returns:
            Array of embeddings
        """
        ...

    @abstractmethod
    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        ...
93 changes: 93 additions & 0 deletions src/crewai/knowledge/embedder/fastembed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from pathlib import Path
from typing import List, Optional, Union

import numpy as np

from .base_embedder import BaseEmbedder

try:
from fastembed_gpu import TextEmbedding # type: ignore

FASTEMBED_AVAILABLE = True
except ImportError:
try:
from fastembed import TextEmbedding

FASTEMBED_AVAILABLE = True
except ImportError:
FASTEMBED_AVAILABLE = False


class FastEmbed(BaseEmbedder):
    """
    A wrapper class for text embedding models using FastEmbed
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        cache_dir: Optional[Union[str, Path]] = None,
    ):
        """
        Initialize the embedding model

        Args:
            model_name: Name of the model to use
            cache_dir: Directory to cache the model

        Raises:
            ImportError: If neither fastembed nor fastembed-gpu is installed.
        """
        if not FASTEMBED_AVAILABLE:
            raise ImportError(
                "FastEmbed is not installed. Please install it with: "
                "uv pip install fastembed or uv pip install fastembed-gpu for GPU support"
            )

        self.model = TextEmbedding(
            model_name=model_name,
            cache_dir=str(cache_dir) if cache_dir else None,
        )
        # Cached embedding dimension; computed lazily on first access of
        # `dimension` so we only pay for a probe embedding once.
        self._dimension: Optional[int] = None

    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            List of embeddings
        """
        # Chunks are plain strings, so this is exactly embed_texts;
        # delegate instead of duplicating the call to the model.
        return self.embed_texts(chunks)

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings
        """
        embeddings = list(self.model.embed(texts))
        return embeddings

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        return self.embed_texts([text])[0]

    @property
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        # Embed a probe string once and cache the result: the model's
        # output dimension never changes, so repeated accesses should not
        # re-run the embedder.
        if self._dimension is None:
            self._dimension = len(self.embed_text("test"))
        return self._dimension
Loading