feat: Serper API integration for Google search
aziz-ullah-khan committed Nov 2, 2024
1 parent 7e3598d commit c218546
Showing 3 changed files with 38 additions and 6 deletions.
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/search_graph.py
@@ -66,7 +66,8 @@ def _create_graph(self) -> BaseGraph:
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
                 "loader_kwargs": self.loader_kwargs,
-                "search_engine": self.copy_config.get("search_engine")
+                "search_engine": self.copy_config.get("search_engine"),
+                "serper_api_key": self.copy_config.get("serper_api_key")
             }
         )
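With the key now forwarded through copy_config, the Serper backend can be selected from the graph configuration. A minimal usage sketch, assuming the usual SearchGraph(prompt, config) constructor; the llm block and the prompt are illustrative placeholders, not part of this commit:

from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_LLM_KEY"},  # placeholder LLM config
    "search_engine": "serper",            # read via self.copy_config.get("search_engine")
    "serper_api_key": "YOUR_SERPER_KEY",  # read via self.copy_config.get("serper_api_key")
}

search_graph = SearchGraph(
    prompt="List the top open-source web scraping libraries",  # placeholder prompt
    config=graph_config,
)
print(search_graph.run())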
9 changes: 8 additions & 1 deletion scrapegraphai/nodes/search_internet_node.py
@@ -47,6 +47,13 @@ def __init__(
             if node_config.get("search_engine")
             else "google"
         )
+
+        self.serper_api_key = (
+            node_config["serper_api_key"]
+            if node_config.get("serper_api_key")
+            else None
+        )
+
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -95,7 +102,7 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy)
+                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
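Side note on the node change: the guarded lookup added in __init__ is behaviorally identical to a single dict lookup with a falsy-to-None fallback. A one-line sketch for illustration only, not part of the commit:

# Equivalent to the added conditional: yields the configured key,
# or None when "serper_api_key" is absent or falsy.
serper_api_key = node_config.get("serper_api_key") or None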
32 changes: 28 additions & 4 deletions scrapegraphai/utils/research_web.py
@@ -7,18 +7,20 @@
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
+import json
 
 def search_on_web(query: str, search_engine: str = "Google",
                   max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None) -> List[str]:
+                  timeout: int = 10, proxy: str | dict = None,
+                  serper_api_key: str = None) -> List[str]:
     """Search web function with improved error handling and validation"""
 
     # Input validation
     if not query or not isinstance(query, str):
         raise ValueError("Query must be a non-empty string")
 
     search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
     if search_engine not in valid_engines:
         raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
@@ -42,7 +44,10 @@ def search_on_web(query: str, search_engine: str = "Google",
 
         elif search_engine == "searxng":
             results = _search_searxng(query, max_results, port, timeout)
-
+
+        elif search_engine.lower() == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
         return filter_pdf_links(results)
 
     except requests.Timeout:
@@ -76,6 +81,25 @@ def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
     response.raise_for_status()
     return [result['url'] for result in response.json().get("results", [])[:max_results]]
 
+def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
+    """Helper function for serper api"""
+    if not serper_api_key:
+        raise ValueError("API key is required for serper api.")
+
+    url = "https://google.serper.dev/search"
+    payload = json.dumps({
+        "q": query,
+        "num": max_results
+    })
+    headers = {
+        'X-API-KEY': serper_api_key,
+        'Content-Type': 'application/json'
+    }
+    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
+    response.raise_for_status()
+    return [result.get("link") for result in response.json().get("organic", [])]
+
+
 def format_proxy(proxy):
     if isinstance(proxy, dict):
         server = proxy.get('server')
@@ -102,4 +126,4 @@ def filter_pdf_links(links: List[str]) -> List[str]:
     Returns:
         List[str]: A list of URLs excluding any that end with '.pdf'.
     """
-    return [link for link in links if not link.lower().endswith('.pdf')]
\ No newline at end of file
+    return [link for link in links if not link.lower().endswith('.pdf')]
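End to end, the utility can now be exercised directly. A short sketch against the committed search_on_web signature; the query and key values are placeholders:

from scrapegraphai.utils.research_web import search_on_web

# Internally, _search_serper POSTs {"q": ..., "num": max_results} to
# https://google.serper.dev/search with the X-API-KEY header, then
# collects the "link" field from each entry of the "organic" array;
# filter_pdf_links finally drops any *.pdf URLs.
links = search_on_web(
    query="web scraping with LLMs",    # placeholder query
    search_engine="serper",
    max_results=5,
    serper_api_key="YOUR_SERPER_KEY",  # required: leaving it unset raises ValueError
)
print(links)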
