feat: Serper API integration for Google search
aziz-ullah-khan committed Nov 2, 2024
1 parent 7e3598d commit c218546
Showing 3 changed files with 38 additions and 6 deletions.
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/search_graph.py
@@ -66,7 +66,8 @@ def _create_graph(self) -> BaseGraph:
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
                 "loader_kwargs": self.loader_kwargs,
-                "search_engine": self.copy_config.get("search_engine")
+                "search_engine": self.copy_config.get("search_engine"),
+                "serper_api_key": self.copy_config.get("serper_api_key")
             }
         )
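With the key now forwarded through copy_config, the Serper backend can be selected from the graph configuration. A minimal usage sketch, assuming the usual SearchGraph(prompt, config) constructor; the llm block and the prompt are illustrative placeholders, not part of this commit:

from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_LLM_KEY"},  # placeholder LLM config
    "search_engine": "serper",            # read via self.copy_config.get("search_engine")
    "serper_api_key": "YOUR_SERPER_KEY",  # read via self.copy_config.get("serper_api_key")
}

search_graph = SearchGraph(
    prompt="List the top open-source web scraping libraries",  # placeholder prompt
    config=graph_config,
)
print(search_graph.run())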
9 changes: 8 additions & 1 deletion scrapegraphai/nodes/search_internet_node.py
@@ -47,6 +47,13 @@ def __init__(
             if node_config.get("search_engine")
             else "google"
         )
+
+        self.serper_api_key = (
+            node_config["serper_api_key"]
+            if node_config.get("serper_api_key")
+            else None
+        )
+
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -95,7 +102,7 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy)
+                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
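Side note on the node change: the guarded lookup added in __init__ is behaviorally identical to a single dict lookup with a falsy-to-None fallback. A one-line sketch for illustration only, not part of the commit:

# Equivalent to the added conditional: yields the configured key,
# or None when "serper_api_key" is absent or falsy.
serper_api_key = node_config.get("serper_api_key") or None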
32 changes: 28 additions & 4 deletions scrapegraphai/utils/research_web.py
@@ -7,18 +7,20 @@
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
+import json
 
 def search_on_web(query: str, search_engine: str = "Google",
                   max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None) -> List[str]:
+                  timeout: int = 10, proxy: str | dict = None,
+                  serper_api_key: str = None) -> List[str]:
     """Search web function with improved error handling and validation"""
 
     # Input validation
     if not query or not isinstance(query, str):
         raise ValueError("Query must be a non-empty string")
 
     search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
     if search_engine not in valid_engines:
         raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
@@ -42,7 +44,10 @@ def search_on_web(query: str, search_engine: str = "Google",
 
         elif search_engine == "searxng":
             results = _search_searxng(query, max_results, port, timeout)
-
+
+        elif search_engine.lower() == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
         return filter_pdf_links(results)
 
     except requests.Timeout:
@@ -76,6 +81,25 @@ def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
     response.raise_for_status()
     return [result['url'] for result in response.json().get("results", [])[:max_results]]
 
+def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
+    """Helper function for serper api"""
+    if not serper_api_key:
+        raise ValueError("API key is required for serper api.")
+
+    url = "https://google.serper.dev/search"
+    payload = json.dumps({
+        "q": query,
+        "num": max_results
+    })
+    headers = {
+        'X-API-KEY': serper_api_key,
+        'Content-Type': 'application/json'
+    }
+    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
+    response.raise_for_status()
+    return [result.get("link") for result in response.json().get("organic", [])]
+
+
 def format_proxy(proxy):
     if isinstance(proxy, dict):
         server = proxy.get('server')
@@ -102,4 +126,4 @@ def filter_pdf_links(links: List[str]) -> List[str]:
     Returns:
         List[str]: A list of URLs excluding any that end with '.pdf'.
     """
-    return [link for link in links if not link.lower().endswith('.pdf')]
\ No newline at end of file
+    return [link for link in links if not link.lower().endswith('.pdf')]
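End to end, the utility can now be exercised directly. A short sketch against the committed search_on_web signature; the query and key values are placeholders:

from scrapegraphai.utils.research_web import search_on_web

# Internally, _search_serper POSTs {"q": ..., "num": max_results} to
# https://google.serper.dev/search with the X-API-KEY header, then
# collects the "link" field from each entry of the "organic" array;
# filter_pdf_links finally drops any *.pdf URLs.
links = search_on_web(
    query="web scraping with LLMs",    # placeholder query
    search_engine="serper",
    max_results=5,
    serper_api_key="YOUR_SERPER_KEY",  # required: leaving it unset raises ValueError
)
print(links)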
