From 40d206abe51769abb73d5203cd00a0bf0a7ba9e3 Mon Sep 17 00:00:00 2001 From: Serper Date: Fri, 16 Jun 2023 00:42:21 +0100 Subject: [PATCH 1/2] Added Serper.dev search engine provider (Google Search API) --- pipelines/examples/agents/ReAct_example.py | 4 +- pipelines/examples/agents/ReAct_example_cn.py | 8 +- pipelines/pipelines/nodes/retriever/web.py | 2 +- .../nodes/search_engine/providers.py | 109 ++++++++++++++++++ .../pipelines/nodes/search_engine/web.py | 2 +- 5 files changed, 117 insertions(+), 8 deletions(-) diff --git a/pipelines/examples/agents/ReAct_example.py b/pipelines/examples/agents/ReAct_example.py index 95ad00aba0d2..4496ad66eda6 100644 --- a/pipelines/examples/agents/ReAct_example.py +++ b/pipelines/examples/agents/ReAct_example.py @@ -82,7 +82,7 @@ # yapf: disable parser = argparse.ArgumentParser() -parser.add_argument("--search_api_key", default=None, type=str, help="The SerpAPI key.") +parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev or SerpAPI key.") parser.add_argument('--llm_name', choices=['THUDM/chatglm-6b', "THUDM/chatglm-6b-v1.1", "gpt-3.5-turbo", "gpt-4"], default="THUDM/chatglm-6b-v1.1", help="The chatbot models ") parser.add_argument("--api_key", default=None, type=str, help="The API Key.") args = parser.parse_args() @@ -97,7 +97,7 @@ def search_and_action_example(): default_prompt_template="question-answering-with-document-scores", ) - # https://serpapi.com/dashboard + # https://serper.dev web_retriever = WebRetriever(api_key=args.search_api_key, top_search_results=2) pipeline = WebQAPipeline(retriever=web_retriever, prompt_node=pn) diff --git a/pipelines/examples/agents/ReAct_example_cn.py b/pipelines/examples/agents/ReAct_example_cn.py index 801816db8987..14bc9d3bde7d 100644 --- a/pipelines/examples/agents/ReAct_example_cn.py +++ b/pipelines/examples/agents/ReAct_example_cn.py @@ -60,7 +60,7 @@ parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run 
dense_qa system, defaults to gpu.") parser.add_argument("--index_name", default='dureader_index', type=str, help="The ann index name of ANN.") parser.add_argument("--search_engine", choices=['faiss', 'milvus'], default="faiss", help="The type of ANN search engine.") -parser.add_argument("--retriever", choices=['dense', 'SerpAPI'], default="dense", help="The type of Retriever.") +parser.add_argument("--retriever", choices=['dense', 'SerperDev'], default="dense", help="The type of Retriever.") parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.") parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.") parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.") @@ -68,7 +68,7 @@ parser.add_argument("--passage_embedding_model", default="rocketqa-zh-base-query-encoder", type=str, help="The passage_embedding_model path") parser.add_argument("--params_path", default="checkpoints/model_40/model_state.pdparams", type=str, help="The checkpoint path") parser.add_argument("--embedding_dim", default=768, type=int, help="The embedding_dim of index") -parser.add_argument("--search_api_key", default=None, type=str, help="The SerpAPI key.") +parser.add_argument("--search_api_key", default=None, type=str, help="The Serper.dev or SerpAPI key.") parser.add_argument('--embed_title', default=False, type=bool, help="The title to be embedded into embedding") parser.add_argument('--model_type', choices=['ernie_search', 'ernie', 'bert', 'neural_search'], default="ernie", help="the ernie model types") parser.add_argument('--llm_name', choices=['ernie-bot', 'THUDM/chatglm-6b', "gpt-3.5-turbo", "gpt-4"], default="THUDM/chatglm-6b", help="The chatbot models ") @@ -213,6 +213,6 @@ def search_and_action_example(web_retriever): use_gpu = True if 
args.device == "gpu" else False web_retriever = get_faiss_retriever(use_gpu) else: - # https://serpapi.com/dashboard - web_retriever = WebRetriever(api_key=args.search_api_key, engine="bing", top_search_results=2) + # https://serper.dev + web_retriever = WebRetriever(api_key=args.search_api_key, engine="google", top_search_results=2) search_and_action_example(web_retriever) diff --git a/pipelines/pipelines/nodes/retriever/web.py b/pipelines/pipelines/nodes/retriever/web.py index b5a66d90ec60..8d5a1034a55a 100644 --- a/pipelines/pipelines/nodes/retriever/web.py +++ b/pipelines/pipelines/nodes/retriever/web.py @@ -69,7 +69,7 @@ class WebRetriever(BaseRetriever): def __init__( self, api_key: str, - search_engine_provider: Union[str, SearchEngine] = "SerpAPI", + search_engine_provider: Union[str, SearchEngine] = "SerperDev", engine: Optional[str] = "google", top_search_results: Optional[int] = 10, search_engine_kwargs: Optional[Dict[str, Any]] = None, diff --git a/pipelines/pipelines/nodes/search_engine/providers.py b/pipelines/pipelines/nodes/search_engine/providers.py index f2eb382e8a98..9e8833968bbe 100644 --- a/pipelines/pipelines/nodes/search_engine/providers.py +++ b/pipelines/pipelines/nodes/search_engine/providers.py @@ -130,3 +130,112 @@ def search(self, query: str, **kwargs) -> List[Document]: logger.debug("SerpAPI returned %s documents for the query '%s'", len(documents), query) result_docs = documents[:top_k] return self.score_results(result_docs, len(answer_box) > 0) + + +class SerperDev(SearchEngine): + """ + Serper.dev is a search engine that provides a REST API to access search results from Google. See the [Serper.dev website](https://serper.dev/) for more details. + """ + + def __init__( + self, + api_key: str, + top_k: Optional[int] = 10, + engine: Optional[str] = "google", + search_engine_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + :param api_key: API key for Serper.dev API. + :param top_k: Number of results to return. 
+ :param engine: Search engine to use, only supports Google. + :param search_engine_kwargs: Additional parameters passed to the SerperDev API. For example, you can set 'hl' to 'en' + to set the search results language to English. + See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters. + """ + super().__init__() + self.params_dict: Dict[str, Union[str, int, float]] = {} + self.api_key = api_key + self.kwargs = search_engine_kwargs if search_engine_kwargs else {} + self.engine = engine + self.top_k = top_k + + def search(self, query: str, **kwargs) -> List[Document]: + """ + :param query: Query string. + :param kwargs: Additional parameters passed to the Serper.dev API. For example, you can set 'hl' to 'en' + to set the search results language to English. + See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters. + :return: List[Document] + """ + kwargs = {**self.kwargs, **kwargs} + top_k = kwargs.pop("top_k", self.top_k) + url = "https://google.serper.dev/search" + + params = {"q": query, **kwargs} + + headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=params, timeout=30) + + if response.status_code != 200: + raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}") + + json_result = json.loads(response.text) + organic = [ + Document.from_dict(d, field_map={"snippet": "content"}) for d in json_result["organic"] if "snippet" in d + ] + answer_box = [] + if "answerBox" in json_result: + answer_dict = json_result["answerBox"] + for key in ["answer", "snippetHighlighted", "snippet", "title"]: + if key in answer_dict: + answer_box_content = answer_dict[key] + if isinstance(answer_box_content, list): + answer_box_content = answer_box_content[0] + answer_box = [ + Document.from_dict( + { + "title": answer_dict.get("title", ""), + "content": answer_box_content, + 
"link": answer_dict.get("link", ""), + } + ) + ] + break + + people_also_search = [] + if "peopleAlsoSearchFor" in json_result: + for result in json_result["peopleAlsoSearchFor"]: + people_also_search.append( + Document.from_dict( + { + "title": result["title"], + "content": result["snippet"] if result.get("snippet") else result["title"], + "link": result["link"], + } + ) + ) + + related_searches = [] + if "relatedSearches" in json_result: + for result in json_result["relatedSearches"]: + related_searches.append(Document.from_dict({"content": result.get("query", "")})) + + related_questions = [] + if "peopleAlsoAsk" in json_result: + for result in json_result["peopleAlsoAsk"]: + related_questions.append( + Document.from_dict( + { + "title": result["title"], + "content": result["snippet"] if result.get("snippet") else result["title"], + "link": result["link"], + } + ) + ) + + documents = answer_box + organic + people_also_search + related_searches + related_questions + + logger.debug("Serper.dev API returned %s documents for the query '%s'", len(documents), query) + result_docs = documents[:top_k] + return self.score_results(result_docs, len(answer_box) > 0) diff --git a/pipelines/pipelines/nodes/search_engine/web.py b/pipelines/pipelines/nodes/search_engine/web.py index 573756f58527..9a7c16f643ac 100644 --- a/pipelines/pipelines/nodes/search_engine/web.py +++ b/pipelines/pipelines/nodes/search_engine/web.py @@ -39,7 +39,7 @@ def __init__( self, api_key: str, top_k: Optional[int] = 10, - search_engine_provider: Union[str, SearchEngine] = "SerpAPI", + search_engine_provider: Union[str, SearchEngine] = "SerperDev", engine: Optional[str] = "google", search_engine_kwargs: Optional[Dict[str, Any]] = None, ): From 8bb221f5079c7e7cd8bf47e13ea5252ceb97e544 Mon Sep 17 00:00:00 2001 From: Serper Date: Tue, 20 Jun 2023 01:24:42 +0100 Subject: [PATCH 2/2] Added Serper.dev search engine provider (Google Search API) --- pipelines/examples/agents/ReAct_example_cn.py | 2 +- 
pipelines/pipelines/nodes/retriever/web.py | 2 +- pipelines/pipelines/nodes/search_engine/web.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/examples/agents/ReAct_example_cn.py b/pipelines/examples/agents/ReAct_example_cn.py index 14bc9d3bde7d..967381e0e104 100644 --- a/pipelines/examples/agents/ReAct_example_cn.py +++ b/pipelines/examples/agents/ReAct_example_cn.py @@ -60,7 +60,7 @@ parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run dense_qa system, defaults to gpu.") parser.add_argument("--index_name", default='dureader_index', type=str, help="The ann index name of ANN.") parser.add_argument("--search_engine", choices=['faiss', 'milvus'], default="faiss", help="The type of ANN search engine.") -parser.add_argument("--retriever", choices=['dense', 'SerperDev'], default="dense", help="The type of Retriever.") +parser.add_argument("--retriever", choices=['dense', 'SerperDev', 'SerpAPI'], default="dense", help="The type of Retriever.") parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.") parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.") parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.") diff --git a/pipelines/pipelines/nodes/retriever/web.py b/pipelines/pipelines/nodes/retriever/web.py index 8d5a1034a55a..b5a66d90ec60 100644 --- a/pipelines/pipelines/nodes/retriever/web.py +++ b/pipelines/pipelines/nodes/retriever/web.py @@ -69,7 +69,7 @@ class WebRetriever(BaseRetriever): def __init__( self, api_key: str, - search_engine_provider: Union[str, SearchEngine] = "SerperDev", + search_engine_provider: Union[str, SearchEngine] = "SerpAPI", engine: Optional[str] = "google", top_search_results: Optional[int] = 10, 
search_engine_kwargs: Optional[Dict[str, Any]] = None, diff --git a/pipelines/pipelines/nodes/search_engine/web.py b/pipelines/pipelines/nodes/search_engine/web.py index 9a7c16f643ac..573756f58527 100644 --- a/pipelines/pipelines/nodes/search_engine/web.py +++ b/pipelines/pipelines/nodes/search_engine/web.py @@ -39,7 +39,7 @@ def __init__( self, api_key: str, top_k: Optional[int] = 10, - search_engine_provider: Union[str, SearchEngine] = "SerperDev", + search_engine_provider: Union[str, SearchEngine] = "SerpAPI", engine: Optional[str] = "google", search_engine_kwargs: Optional[Dict[str, Any]] = None, ):