community[patch]: Force opt-in for WebResearchRetriever (CVE-2024-3095)…

… (#24451) This PR addresses the issue raised by CVE-2024-3095 ( https://huntr.com/bounties/e62d4895-2901-405b-9559-38276b6a5273 ). Unfortunately, we didn't do a good job writing the initial report: it points at both the wrong package and the wrong code. The affected code is the WebResearchRetriever, not the AsyncHTMLLoader, and the WebResearchRetriever lives in langchain-community. The vulnerable code lives here: https://github.com/langchain-ai/langchain/blob/0bd3f4e1292c085f22bef1fff16059851e11d042/libs/community/langchain_community/retrievers/web_research.py#L233-L233 This PR adds a forced opt-in so that users are aware of the risk and can mitigate it by configuring a proxy: https://github.com/langchain-ai/langchain/blob/0bd3f4e1292c085f22bef1fff16059851e11d042/libs/community/langchain_community/retrievers/web_research.py#L84-L84
langchain-ai · Jul 19, 2024 · 604dfe2 · 604dfe2
1 parent f101c75
commit 604dfe2
Showing 1 changed file with 30 additions and 1 deletion.
diff --git a/libs/community/langchain_community/retrievers/web_research.py b/libs/community/langchain_community/retrievers/web_research.py
@@ -1,6 +1,6 @@
 import logging
 import re
-from typing import List, Optional
+from typing import Any, List, Optional
 
 from langchain.chains import LLMChain
 from langchain.chains.prompt_selector import ConditionalPromptSelector
@@ -81,6 +81,35 @@ class WebResearchRetriever(BaseRetriever):
         "check .netrc for proxy configuration",
     )
 
+    allow_dangerous_requests: bool = False
+    """A flag to force users to acknowledge the risks of SSRF attacks when using 
+    this retriever.
+    
+    Users should set this flag to `True` if they have taken the necessary precautions
+    to prevent SSRF attacks when using this retriever.
+    
+    For example, users can run the requests through a properly configured
+    proxy and prevent the crawler from accidentally crawling internal resources.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize the retriever, requiring an explicit SSRF-risk opt-in.
+
+        Args:
+            **kwargs: Field values forwarded unchanged to the parent initializer.
+                Must include a truthy ``allow_dangerous_requests``.
+
+        Raises:
+            ValueError: If ``allow_dangerous_requests`` is missing or falsy.
+        """
+        # Inspect the raw kwargs so the opt-in check runs before the parent
+        # initializer is called.
+        allow_dangerous_requests = kwargs.get("allow_dangerous_requests", False)
+        if not allow_dangerous_requests:
+            # Adjacent literals are concatenated verbatim, so every fragment
+            # that continues a sentence must end with an explicit space.
+            raise ValueError(
+                "WebResearchRetriever crawls URLs surfaced through "
+                "the provided search engine. It is possible that some of those URLs "
+                "will end up pointing to machines residing on an internal network, "
+                "leading to an SSRF (Server-Side Request Forgery) attack. "
+                "To protect yourself against that risk, you can run the requests "
+                "through a proxy and prevent the crawler from accidentally crawling "
+                "internal resources. "
+                "If you've taken the necessary precautions, you can set "
+                "`allow_dangerous_requests` to `True`."
+            )
+        super().__init__(**kwargs)
+
     @classmethod
     def from_llm(
         cls,