-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
154ca4c
commit f837dc1
Showing
7 changed files
with
260 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
"""
Basic example of a scraping pipeline using SmartScraperMultiCondGraph with Groq.

Scrapes a list of URLs and answers a prompt; the graph internally decides
whether to merge or concatenate the per-URL answers.
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperMultiCondGraph

# Load GROQ_APIKEY (and any other variables) from a local .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0  # deterministic output
    },
    "headless": False  # show the browser window while scraping
}

# *******************************************************
# Create the SmartScraperMultiCondGraph instance and run it
# *******************************************************

multiple_search_graph = SmartScraperMultiCondGraph(
    prompt="Who is Marco Perini?",
    source=[
        "https://perinim.github.io/",
        "https://perinim.github.io/cv/"
    ],
    schema=None,
    config=graph_config
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,3 +18,4 @@ undetected-playwright>=0.3.0 | |
google>=3.0.0 | ||
semchunk>=1.0.1 | ||
langchain-ollama>=0.1.3 | ||
simpleeval>=0.9.13 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
""" | ||
SmartScraperMultiCondGraph Module with ConditionalNode | ||
""" | ||
from copy import deepcopy | ||
from typing import List, Optional | ||
from pydantic import BaseModel | ||
from .base_graph import BaseGraph | ||
from .abstract_graph import AbstractGraph | ||
from .smart_scraper_graph import SmartScraperGraph | ||
from ..nodes import ( | ||
GraphIteratorNode, | ||
MergeAnswersNode, | ||
ConcatAnswersNode, | ||
ConditionalNode | ||
) | ||
from ..utils.copy import safe_deepcopy | ||
|
||
class SmartScraperMultiCondGraph(AbstractGraph):
    """
    SmartScraperMultiCondGraph is a scraping pipeline that scrapes a
    list of URLs and generates answers to a given prompt.

    It adds a ConditionalNode after the per-URL scraping step: when more
    than two results are collected the answers are merged with an LLM,
    otherwise they are simply concatenated.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model
            (presumably set by AbstractGraph — not assigned here).
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph (list of URLs to scrape).
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> search_graph = SmartScraperMultiCondGraph(
        ...     "What is Chioggia famous for?",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str],
                 config: dict, schema: Optional[BaseModel] = None):
        # Cap on results to process; defaults to 3 when not configured.
        self.max_results = config.get("max_results", 3)
        # Deep-copy config and schema so the per-URL sub-graphs cannot
        # mutate the caller's objects.
        self.copy_config = safe_deepcopy(config)
        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching,
        including a ConditionalNode to decide between merging or concatenating the results.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        # Node that runs a SmartScraperGraph per URL and collects the
        # individual answers into "results".
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": SmartScraperGraph,
                "scraper_config": self.copy_config,
            },
            schema=self.copy_schema,
            node_name="GraphIteratorNode"
        )

        # ConditionalNode evaluating 'len(results) > 2' to pick the branch.
        conditional_node = ConditionalNode(
            input="results",
            output=["results"],
            node_name="ConditionalNode",
            node_config={
                'key_name': 'results',
                'condition': 'len(results) > 2'
            }
        )

        # True branch: LLM-merge the collected answers into one.
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema
            },
            node_name="MergeAnswersNode"
        )

        # False branch: plain concatenation, no LLM call.
        concat_node = ConcatAnswersNode(
            input="results",
            output=["answer"],
            node_config={},
            node_name="ConcatNode"
        )

        # Build the graph. NOTE(review): the order of the two edges leaving
        # the ConditionalNode appears to encode which one is the True branch
        # and which the False branch — do not reorder them.
        return BaseGraph(
            nodes=[
                graph_iterator_node,
                conditional_node,
                merge_answers_node,
                concat_node,
            ],
            edges=[
                (graph_iterator_node, conditional_node),
                (conditional_node, merge_answers_node),  # True node (len(results) > 2)
                (conditional_node, concat_node),  # False node (len(results) <= 2)
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the web scraping and searching process.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            graph produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters