From 6c35977d70ecc47abe4d632b9ce53353daff41c7 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Tue, 14 Nov 2023 15:02:32 -0500 Subject: [PATCH 01/12] Added Sigma rule package updater --- sigmaiq/globals.py | 6 ++ sigmaiq/utils/sigma/__init__.py | 0 sigmaiq/utils/sigma/rule_updater.py | 156 ++++++++++++++++++++++++++++ 3 files changed, 162 insertions(+) create mode 100644 sigmaiq/globals.py create mode 100644 sigmaiq/utils/sigma/__init__.py create mode 100644 sigmaiq/utils/sigma/rule_updater.py diff --git a/sigmaiq/globals.py b/sigmaiq/globals.py new file mode 100644 index 0000000..29dd4b6 --- /dev/null +++ b/sigmaiq/globals.py @@ -0,0 +1,6 @@ +import os + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +SIGMA_RULE_DIR = os.path.join(ROOT_DIR, "llm/data/sigma") + +print(SIGMA_RULE_DIR) \ No newline at end of file diff --git a/sigmaiq/utils/sigma/__init__.py b/sigmaiq/utils/sigma/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sigmaiq/utils/sigma/rule_updater.py b/sigmaiq/utils/sigma/rule_updater.py new file mode 100644 index 0000000..bd98a28 --- /dev/null +++ b/sigmaiq/utils/sigma/rule_updater.py @@ -0,0 +1,156 @@ +import os +import shutil + +import requests +from sigmaiq.globals import SIGMA_RULE_DIR +from pathlib import Path +import zipfile +import io + + +class SigmaRuleUpdater: + """Download/update Sigma rules from the official SigmaHQ release packages.""" + + PACKAGE_NAME_URIS = { + "core": "sigma_core.zip", + "core+": "sigma_core+.zip", + "core++": "sigma_core++.zip", + "emerging_threats": "sigma_emerging_threats_addon.zip", + "all": "sigma_all_rules.zip", + } + + BASE_DOWNLOAD_URL = "https://github.com/SigmaHQ/sigma/releases/download" + + def __init__(self, rule_dir: str = None): + self.rule_dir = setup_rule_dir(rule_dir) + self.installed_tag = self._get_installed_release_tag() + self.latest_tag = get_latest_sigma_release_tag() + + def _get_installed_release_tag(self) -> str: + """Returns the currently installed Sigma release tag by checking the directory name of the + SIGMA_RULE_DIR directory. + + Returns: + str: The currently installed Sigma release tag, or "" if not found. + """ + version_file = Path(self.rule_dir) / "version.txt" + if not version_file.exists(): + return "" + with open(version_file, "r") as f: + current_version = "r"+f.readlines()[0].split(": ")[-1].strip() + return current_version + + def _needs_update(self) -> bool: + """Checks if the currently installed Sigma release tag is the same as the latest release tag. + + Returns: + bool: True if the latest_tag is different from the currently installed release tag, False otherwise. + """ + if not self.installed_tag: + return True + return self.latest_tag != self.installed_tag + + def update_sigma_rules( + self, + force: bool = False, + package_name: str = "core", + emerging_threats: bool = False, + ): + """Downloads the latest Sigma release package, if needed, and extracts it to the SIGMA_RULE_DIR directory. + + Args: + force (bool, optional): If True, will always download the latest Sigma release package. Defaults to False. + package_name (str, optional): The name of the Sigma release package to download. Defaults to "core". Valid + options are "core", "core+", "core++", "emerging_threats", and "all" + emerging_threats (bool, optional): If True, will download the emerging_threats Sigma release package in + addition to the package specified in args. Defaults to False. + + Raises: + ValueError: If the package_name is invalid. 
+ + + """ + if package_name not in self.PACKAGE_NAME_URIS.keys(): + raise ValueError(f"Invalid package name '{package_name}'. Valid options are: {self.PACKAGE_NAME_URIS.keys}") + + print(f"Installed Sigma release tag at {self.rule_dir}: {self.installed_tag}") + if not self._needs_update() and not force: + print("Sigma rules are up-to-date.") + return + print(f"Updating Sigma rules to {self.latest_tag}...") + self._download_sigma_release(package_name) + + if emerging_threats: + if package_name in ["emerging_threats", "all"]: + print("emerging-threats already contains in the selected package, skipping download.") + else: + print("Downloading emerging_threats Sigma rules...") + self._download_sigma_release("emerging_threats", overwrite=False) + print("Sigma rules up to date!") + + def _download_sigma_release(self, package_name: str, overwrite: bool = True): + """Downloads the latest Sigma release package and extracts it to the SIGMA_RULE_DIR directory. + + Args: + package_name (str): The name of the Sigma release package to download. Valid options are "core", "core+", + "core++", "emerging_threats", and "all" + overwrite (bool, optional): If True, will overwrite the currently installed Sigma release package. Defaults + to True. + + Raises: + ValueError: If the package_name is invalid. + """ + if overwrite: + print(f"Removing Sigma rules at {self.rule_dir} before new download...") + shutil.rmtree(os.path.join(self.rule_dir, "rules"), ignore_errors=True) + shutil.rmtree(os.path.join(self.rule_dir, "rules-emerging-threats"), ignore_errors=True) + + if package_name not in self.PACKAGE_NAME_URIS.keys(): + raise ValueError(f"Invalid package name '{package_name}'. Valid options are: {self.PACKAGE_NAME_URIS.keys}") + + url = f"{self.BASE_DOWNLOAD_URL}/{self.latest_tag}/{self.PACKAGE_NAME_URIS[package_name]}" + r = requests.get(url, allow_redirects=True) + if not r.ok: + raise Exception(f"Error downloading Sigma release package: {r.url} {r.status_code} - {r.reason}") + self._extract_sigma_release(r.content) + + def _extract_sigma_release(self, content: bytes): + """Extracts the Sigma release package to the SIGMA_RULE_DIR directory. + + Args: + content (bytes): The content of the Sigma release package. + """ + + zf = zipfile.ZipFile(io.BytesIO(content)) + zf.extractall(self.rule_dir) + print(f"Extracted Sigma release package to {self.rule_dir}") + + +def get_latest_sigma_release_tag(): + """Requests https://github.com/SigmaHQ/sigma/releases/latest and returns the URL of the response + as the latest release + """ + url = "https://github.com/SigmaHQ/sigma/releases/latest" + r = requests.get(url, allow_redirects=False) + if not r.ok: + raise Exception(f"Error getting latest Sigma release: {r.url} {r.status_code} - {r.reason}") + latest_tag = r.next.path_url.split("/")[-1] + if not latest_tag: + raise Exception(f"Error getting latest Sigma release: {r.url} {r.status_code} - {r.reason}") + return latest_tag + + +def setup_rule_dir(rule_dir: str) -> str: + """Creates the SIGMA_RULE_DIR directory if it doesn't exist, and returns the path to the directory. + + Args: + rule_dir (str): The path to a directory where Sigma rules should be installed. + + Returns: + str: The path to the SIGMA_RULE_DIR directory. 
+ """ + if not rule_dir: + rule_dir = SIGMA_RULE_DIR + if not os.path.exists(rule_dir): + os.makedirs(rule_dir) + return rule_dir From 440791a131578aad4906925c39e9acfbc14e74db Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:04:43 -0500 Subject: [PATCH 02/12] Updated rule updater and globals --- sigmaiq/globals.py | 7 ++++--- sigmaiq/utils/sigma/rule_updater.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sigmaiq/globals.py b/sigmaiq/globals.py index 29dd4b6..2f5a241 100644 --- a/sigmaiq/globals.py +++ b/sigmaiq/globals.py @@ -1,6 +1,7 @@ import os -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -SIGMA_RULE_DIR = os.path.join(ROOT_DIR, "llm/data/sigma") -print(SIGMA_RULE_DIR) \ No newline at end of file +class DEFAULT_DIRS: + ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + SIGMA_RULE_DIR = os.path.join(ROOT_DIR, "llm/data/sigma") + VECTOR_STORE_DIR = os.path.join(ROOT_DIR, "llm/data/vectordb") \ No newline at end of file diff --git a/sigmaiq/utils/sigma/rule_updater.py b/sigmaiq/utils/sigma/rule_updater.py index bd98a28..8a02dd0 100644 --- a/sigmaiq/utils/sigma/rule_updater.py +++ b/sigmaiq/utils/sigma/rule_updater.py @@ -2,7 +2,7 @@ import shutil import requests -from sigmaiq.globals import SIGMA_RULE_DIR +from sigmaiq.globals import DEFAULT_DIRS from pathlib import Path import zipfile import io @@ -150,7 +150,7 @@ def setup_rule_dir(rule_dir: str) -> str: str: The path to the SIGMA_RULE_DIR directory. """ if not rule_dir: - rule_dir = SIGMA_RULE_DIR + rule_dir = DEFAULT_DIRS.SIGMA_RULE_DIR if not os.path.exists(rule_dir): os.makedirs(rule_dir) return rule_dir From f88cdc61e66be6e9246e1c68b07fc844515a0dc8 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:12:02 -0500 Subject: [PATCH 03/12] Added Sigma toolkit --- sigmaiq/llm/toolkits/__init__.py | 0 sigmaiq/llm/toolkits/base.py | 134 ++++++++++++++++++++++++++ sigmaiq/llm/toolkits/prompts.py | 19 ++++ sigmaiq/llm/toolkits/sigma_toolkit.py | 31 ++++++ 4 files changed, 184 insertions(+) create mode 100644 sigmaiq/llm/toolkits/__init__.py create mode 100644 sigmaiq/llm/toolkits/base.py create mode 100644 sigmaiq/llm/toolkits/prompts.py create mode 100644 sigmaiq/llm/toolkits/sigma_toolkit.py diff --git a/sigmaiq/llm/toolkits/__init__.py b/sigmaiq/llm/toolkits/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sigmaiq/llm/toolkits/base.py b/sigmaiq/llm/toolkits/base.py new file mode 100644 index 0000000..365a9f5 --- /dev/null +++ b/sigmaiq/llm/toolkits/base.py @@ -0,0 +1,134 @@ +# stdlib +from typing import Optional, Dict, Any, Type, Union +import json +from json import JSONDecodeError + +# langchain +from langchain.agents.agent import AgentExecutor +from langchain.agents.format_scratchpad import format_to_openai_function_messages +from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser +from langchain.chat_models import ChatOpenAI +from langchain.prompts import ChatPromptTemplate +from langchain.tools.render import format_tool_to_openai_function + +# langchain typing +from langchain.schema import ( + AgentAction, + AgentFinish, + OutputParserException, +) +from langchain.schema.vectorstore import VectorStore +from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.agent import AgentActionMessageLog +from langchain.schema.messages import ( + AIMessage, + BaseMessage, +) + +# sigmaiq +from sigmaiq.llm.toolkits.sigma_toolkit import SigmaToolkit +from sigmaiq.llm.toolkits.prompts import 
SIGMA_AGENT_PROMPT + + +def create_sigma_agent( + agent_llm: BaseLanguageModel = ChatOpenAI(model="gpt-3.5-turbo"), + rule_creation_llm: BaseLanguageModel = ChatOpenAI(model="gpt-3.5-turbo"), + sigma_vectorstore: VectorStore = None, + toolkit: Type[SigmaToolkit] = SigmaToolkit, + prompt: Optional[ChatPromptTemplate] = SIGMA_AGENT_PROMPT, + verbose: bool = False, + return_intermediate_steps: bool = False, + agent_executor_kwargs: Optional[Dict[str, Any]] = None, +) -> AgentExecutor: + """Construct a Sigma agent from an LLM and tools. + + Args: + agent_llm (BaseLanguageModel, optional): The LLM to use for the agent. Defaults to ChatOpenAI(model="gpt-3.5-turbo"). + rule_creation_llm (BaseLanguageModel, optional): The LLM to use for the rule creation tool. Defaults to ChatOpenAI(model="gpt-3.5-turbo"). + sigma_vectorstore (VectorStore, optional): The vectorstore containing Sigma rules to use for the agent. Defaults to None. + toolkit (Type[SigmaToolkit], optional): The toolkit to use for the agent. Defaults to SigmaToolkit. + prompt (Optional[ChatPromptTemplate], optional): The prompt to use for the agent. Defaults to SIGMA_AGENT_PROMPT. + verbose (bool, optional): Whether to print verbose output. Defaults to False. + return_intermediate_steps (bool, optional): Whether to return intermediate steps. Defaults to False. + agent_executor_kwargs (Optional[Dict[str, Any]], optional): Additional kwargs to pass to the AgentExecutor. Defaults to None. + + Returns: + AgentExecutor: Returns a callable AgentExecutor object. Either you can call it or use run method with the query to get the response + """ # noqa: E501 + + # Get Sigma Tools from the SigmaToolkit. Init with sigma vectorstore and rule creation llm + tools = toolkit(sigmadb=sigma_vectorstore, rule_creation_llm=rule_creation_llm).get_tools() + + # Create OpenAI Function for each tool for the agent LLM, so we can create an OpenAI Function AgentExecutor + llm_with_tools = agent_llm.bind(functions=[format_tool_to_openai_function(t) for t in tools]) + + # Create the agent + agent = ( + { + "input": lambda x: x["input"], + "agent_scratchpad": lambda x: format_to_openai_function_messages(x["intermediate_steps"]), + } + | prompt + | llm_with_tools + | CustomOpenAIFunctionsAgentOutputParser() + ) + + # Create and return the AgentExecutor + agent_executor = AgentExecutor( + agent=agent, + tools=tools, + verbose=verbose, + return_intermediate_steps=return_intermediate_steps, + handle_parsing_errors=True, + **(agent_executor_kwargs or {})) + + return agent_executor + + +class CustomOpenAIFunctionsAgentOutputParser(OpenAIFunctionsAgentOutputParser): + """Custom OpenAIFunctionsAgentOutputParser to overcome the JSON parsing error on some agent + intermediate step inputs. This occurs because the `json.load()` method needs the arg `strict=False` to +parse the JSON. This is a hacky way to do this, but it works for now. + """ + # Override + @staticmethod + def _parse_ai_message(message: BaseMessage) -> Union[AgentAction, AgentFinish]: + """Parse an AI message.""" + if not isinstance(message, AIMessage): + raise TypeError(f"Expected an AI message got {type(message)}") + + function_call = message.additional_kwargs.get("function_call", {}) + + if function_call: + function_name = function_call["name"] + try: + _tool_input = json.loads(function_call["arguments"].strip(), strict=False) # HACK + except JSONDecodeError: + raise OutputParserException( + f"Could not parse tool input: {function_call} because " + f"the `arguments` is not valid JSON." 
+ ) + + # HACK HACK HACK: + # The code that encodes tool input into Open AI uses a special variable + # name called `__arg1` to handle old style tools that do not expose a + # schema and expect a single string argument as an input. + # We unpack the argument here if it exists. + # Open AI does not support passing in a JSON array as an argument. + if "__arg1" in _tool_input: + tool_input = _tool_input["__arg1"] + else: + tool_input = _tool_input + + content_msg = f"responded: {message.content}\n" if message.content else "\n" + log = f"\nInvoking: `{function_name}` with `{tool_input}`\n{content_msg}\n" + return AgentActionMessageLog( + tool=function_name, + tool_input=tool_input, + log=log, + message_log=[message], + ) + + return AgentFinish( + return_values={"output": message.content}, log=str(message.content) + ) diff --git a/sigmaiq/llm/toolkits/prompts.py b/sigmaiq/llm/toolkits/prompts.py new file mode 100644 index 0000000..2beffbe --- /dev/null +++ b/sigmaiq/llm/toolkits/prompts.py @@ -0,0 +1,19 @@ +from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder + +# Default prompts for Sigma agents +SIGMA_AGENT_PROMPT = prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a threat detection engineering assistant bot specializing in Sigma rules." + "You have two tools at your disposal: translate_sigma_rule and create_sigma_rule_vectorstore." + "translate_sigma_rule will convert or translate a Sigma Rule into a query for a specific backend." + "create_sigma_rule_vectorstore will take the users input, find similar Sigma Rules from the vectorstore," + "then create a brand new Sigma Rule based on the users input and the similar Sigma Rules returned from the vectorstore" + "to use as context. The output is a Sigma Rule in YAML format. 
Do not use 'translate_sigma_rule' unless " + "the user explicitly asks for a Sigma Rule to be converted or translated into a query for a specific backend.", + ), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] +) diff --git a/sigmaiq/llm/toolkits/sigma_toolkit.py b/sigmaiq/llm/toolkits/sigma_toolkit.py new file mode 100644 index 0000000..f6935ee --- /dev/null +++ b/sigmaiq/llm/toolkits/sigma_toolkit.py @@ -0,0 +1,31 @@ +# stdlib +from typing import List + +# langchain +from langchain.agents.agent_toolkits.base import BaseToolkit +from langchain.tools import BaseTool + +# langchain typing +from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.vectorstore import VectorStore + +# sigmaiq tools +from sigmaiq.llm.tools.create_sigma_rule import CreateSigmaRuleVectorStoreTool +from sigmaiq.llm.tools.translate_sigma_rule import TranslateSigmaRuleTool + + +class SigmaToolkit(BaseToolkit): + """Sigma Toolkit.""" + + sigmadb: VectorStore + rule_creation_llm: BaseLanguageModel + + class Config: + arbitrary_types_allowed = True + + def get_tools(self) -> List[BaseTool]: + """Get the tools in the toolkit.""" + return [ + TranslateSigmaRuleTool(), + CreateSigmaRuleVectorStoreTool(sigmadb=self.sigmadb, llm=self.rule_creation_llm), + ] From fa0c273a3bd912340555dd0449d5d660546feee8 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:15:43 -0500 Subject: [PATCH 04/12] Added Sigma tools --- sigmaiq/llm/tools/__init__.py | 0 sigmaiq/llm/tools/create_sigma_rule.py | 111 ++++++++++++++++++++++ sigmaiq/llm/tools/translate_sigma_rule.py | 95 ++++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 sigmaiq/llm/tools/__init__.py create mode 100644 sigmaiq/llm/tools/create_sigma_rule.py create mode 100644 sigmaiq/llm/tools/translate_sigma_rule.py diff --git a/sigmaiq/llm/tools/__init__.py b/sigmaiq/llm/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sigmaiq/llm/tools/create_sigma_rule.py b/sigmaiq/llm/tools/create_sigma_rule.py new file mode 100644 index 0000000..28b5908 --- /dev/null +++ b/sigmaiq/llm/tools/create_sigma_rule.py @@ -0,0 +1,111 @@ +# stdlib +from typing import Type, Optional + +# langchain +from langchain.callbacks.manager import CallbackManagerForToolRun +from langchain.prompts import ChatPromptTemplate +from langchain.tools import BaseTool +from langchain.pydantic_v1 import BaseModel, Field + +# langchain typing +from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.runnable import RunnablePassthrough +from langchain.schema.output_parser import StrOutputParser +from langchain.schema.vectorstore import VectorStore + + +class CreateSigmaRuleInput(BaseModel): + """Input for TranslateSigmaRule tool, which uses SigmAIQ backend factory to convert a Sigma Rule into + a query for a specific backend.""" + + query: str = Field( + description="The users question, used to search through the Sigma VectorStore and create a Sigma Rule." + ) + + class Config(BaseTool.Config): + pass + + +class CreateSigmaRuleVectorStoreTool(BaseTool): + """Class for translating Sigma rules via SigmAIQ Backend Factory""" + + name: str = "create_sigma_rule_vectorstore" + args_schema: Type[BaseModel] = CreateSigmaRuleInput + description: str = """Use this tool to take the users input, find similar Sigma Rules from the vectorstore, + then create a brand new Sigma Rule based on the users input and the similar Sigma Rules returned from the vectorstore + to use as context. 
The output is a Sigma Rule in YAML format. + """ + sigmadb: VectorStore + llm: BaseLanguageModel + k: int = 3 + verbose = True + + def _run( + self, + query: str, + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> str: + """Run the tool""" + + template = """You are a cybersecurity detection engineering assistant bot specializing in Sigma Rule creation. + You are assisting a user in creating a new Sigma Rule based on the users question. + The users question is first used to find similar Sigma Rules from the a vectorstore containing official + Sigma Rules. The official Sigma Rules should be used as context as needed in conjunction with the detection specified + in the users question to create a new Sigma Rule. Set the 'author' to 'SigmAIQ (AttackIQ)', + the date to today's date, and the reference to 'https://github.com/AttackIQ/SigmAIQ'. + The created Sigma Rule should be in YAML format and use the official Sigma schema. The detection field + can contain multiple 'selection' identifiers and multiple 'filter' identifiers as needed, + which can be used in the condition field to select criteria and filter out criteria respectively. + + Sigma Rule Schema: + + title + id [optional] + related [optional] + - id {{rule-id}} + type {{type-identifier}} + status [optional] + description [optional] + references [optional] + author [optional] + date [optional] + modified [optional] + tags [optional] + logsource + category [optional] + product [optional] + service [optional] + definition [optional] + ... + detection + {{search-identifier}} [optional] + {{string-list}} [optional] + {{map-list}} [optional] + {{field: valu}}> [optional] + ... # Multiple search identifiers can be specified as needed and used in the condition + condition + fields [optional] + falsepositives [optional] + level [optional]: + ------- + Vectorstore Search Results: + + {context} + ------ + User's Question: + {question} + """ + + prompt = ChatPromptTemplate.from_template(template) + retriever = self.sigmadb.as_retriever(search_kwargs={"k": self.k}) + chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | self.llm | StrOutputParser() + return chain.invoke(query) + + async def _arun( + self, + query: str, + k: int, + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> str: + """Async run the tool""" + raise NotImplementedError diff --git a/sigmaiq/llm/tools/translate_sigma_rule.py b/sigmaiq/llm/tools/translate_sigma_rule.py new file mode 100644 index 0000000..a29ca87 --- /dev/null +++ b/sigmaiq/llm/tools/translate_sigma_rule.py @@ -0,0 +1,95 @@ +from langchain.callbacks.manager import CallbackManagerForToolRun +from langchain.tools import BaseTool +from pydantic import BaseModel, Field, Extra +from typing import Union, Type, Optional +from sigmaiq.sigmaiq_backend_factory import AVAILABLE_BACKENDS, SigmAIQBackend +from sigmaiq.sigmaiq_pipeline_factory import AVAILABLE_PIPELINES +import json + + +class TranslateSigmaRuleInput(BaseModel): + """Input for TranslateSigmaRule tool, which uses SigmAIQ backend factory to convert a Sigma Rule into + a query for a specific backend.""" + + sigma_rule: Union[str, dict] = Field( + default=None, + description="""The Sigma Rule to translate. This can be one of two formats: + + 1. A YAML string of the Sigma rule, with at least the title, logsource, and detection fields. + 2. 
A dict object of the Sigma rule, which is the same as the YAML string and must contain the + "title, logsource, and detection fields""", + ) + backend: str = Field( + default=None, + description="""The backend or product to translate the Sigma rule to. Backend options their descriptions are as + follows:\n""" + + f"{json.dumps(AVAILABLE_BACKENDS, indent=2)}", + ) + processing_pipeline: str = Field( + default=None, + description="""The processing pipeline to use for the Sigma rule. This should only be set if explicitly provided + by the user, as certain pipelines are only compatible with certain backends. Otherwise, set this to None. + Pipeline options and their + descriptions are as follows:\n""" + + f"{json.dumps({k: v['description'] for k, v in AVAILABLE_PIPELINES.items()}, indent=2)}", + ) + output_format: str = Field( + default="default", + description="""The output format for the translated rule. Unless specified, 'default' should be used, as this is + the option available in all backends. Each backend option and valid backends with their + descriptions are as follows:\n""" + + f"{json.dumps({k: v['output_formats'] for k, v in SigmAIQBackend.display_backends_and_outputs().items()}, indent=2)}", + ) + + +class TranslateSigmaRuleTool(BaseTool): + """Class for translating Sigma rules via SigmAIQ Backend Factory""" + + name: str = "translate_sigma_rule" + args_schema: Type[BaseModel] = TranslateSigmaRuleInput + description: str = """ + Use this tool to translate or convert a Sigma rule into a query for a specific backend. + The input must be a Sigma Rule, which can be provided as a YAML string or dict object. + Additionally, the backend (product) must be specified, and the processing pipeline and output format can be + optionally specified. + The output is json of the translated rule to a query for the backend, or an error message if the + translation fails. + """ + # return_direct = True # We don't need an agent LLM to think about the output, it is what it is. 
+ verbose = True + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + def _run( + self, + sigma_rule: Union[str, dict] = None, + backend: str = None, + processing_pipeline: str = None, + output_format: str = "default", + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> str: + # Get backend object + backend_obj = SigmAIQBackend( + backend=backend, processing_pipeline=processing_pipeline, output_format=output_format + ).create_backend() + + try: + output = backend_obj.translate(sigma_rule)[0] + except Exception as e: + output = f"ERROR: {e}" + # Return translated rule + return output + + async def _arun( + self, + sigma_rule: Union[str, dict] = None, + backend: str = None, + processing_pipeline: str = None, + output_format: str = "default", + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> str: + """Async run the tool""" + raise NotImplementedError From 729ef1f1a99b6ebf6c307788630ea3afbcf08910 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:16:13 -0500 Subject: [PATCH 05/12] Added base LLM class --- sigmaiq/llm/__init__.py | 0 sigmaiq/llm/base.py | 176 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 sigmaiq/llm/__init__.py create mode 100644 sigmaiq/llm/base.py diff --git a/sigmaiq/llm/__init__.py b/sigmaiq/llm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sigmaiq/llm/base.py b/sigmaiq/llm/base.py new file mode 100644 index 0000000..2e083f7 --- /dev/null +++ b/sigmaiq/llm/base.py @@ -0,0 +1,176 @@ +# stdlib +import os +from typing import Type, List + +# sigmaiq +from sigmaiq.utils.sigma.rule_updater import SigmaRuleUpdater +from sigmaiq.globals import DEFAULT_DIRS + +# langchain +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import FAISS +from langchain.document_loaders import DirectoryLoader, TextLoader +from langchain.text_splitter import CharacterTextSplitter + +# langchain typing +from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore +from langchain.document_loaders.base import BaseLoader, Document +from langchain.schema.document import BaseDocumentTransformer + + +class SigmaLLM(SigmaRuleUpdater): + """Base class for Sigma rules with LLMs. + Provides methods for ensuring the latest Sigma rule package is installed, creating embeddings from Sigma rules, + and storing them in a vector store. Also provides basic search functionality based on the vector store embeddings. + + All agents, tools, and toolkits are in separate classes. + + To use custom embeddings, text splitters, loaders, etc, provide them as args override the methods in this class as + needed by your custom passed classes. + """ + + def __init__( + self, + rule_dir: str = None, + vector_store_dir: str = None, + embedding_function: Type[Embeddings] = OpenAIEmbeddings, + vector_store: Type[VectorStore] = FAISS, + rule_loader: Type[BaseLoader] = DirectoryLoader, + rule_splitter: Type[BaseDocumentTransformer] = CharacterTextSplitter, + ): + """Initializes the SigmaLLM object. + If passing custom embeddings, vector stores, loaders, or splitters, pass the class itself rather than + an instance of the class. For example, pass `OpenAIEmbeddings` instead of `OpenAIEmbeddings()`. Then, override + the methods in this class to load the custom classes if needed. + + Requires environmental variable `OPENAI_API_KEY` to be set to your OpenAI API key if using any OpenAI models + or embeddings. 
+ + Args: + rule_dir (str, optional): The directory to store the Sigma rules. Defaults to None. + vector_store_dir (str, optional): The directory to store the vector store. Defaults to None. + embedding_function (Type[Embeddings], optional): The Embeddings class to use for Sigma rule embeddings. Defaults to OpenAIEmbeddings. + vector_store (Type[VectorStore], optional): The VectorStore class to use for Sigma rule embeddings. Defaults to FAISS. + rule_loader (Type[BaseLoader], optional): The DocumentLoader class to use for loading Sigma rules. Defaults to DirectoryLoader. + rule_splitter (Type[BaseDocumentTransformer], optional): The DocumentTransformer class to use for splitting Sigma rules. Defaults to CharacterTextSplitter. + """ + # Download/update sigma rules from parent class + super().__init__(rule_dir=rule_dir) + + # Setup rest of class + self.vector_store_dir = self._setup_vector_store_dir(vector_store_dir) + self.embedding_function = embedding_function() + self.vector_store = vector_store + self.sigmadb = None + self.rule_loader = rule_loader + self.rule_splitter = rule_splitter + + def load_sigma_vectordb(self): + """Loads the Sigma rule vector store. + Override `load_local()` below with how your vector store class loads local Vector DBs""" + if not os.path.exists(self.vector_store_dir): + raise FileNotFoundError(f"VectorStore not found at {self.vector_store_dir}.") + try: + self.sigmadb = self.vector_store.load_local( # CHANGE ME IF NEEDED + folder_path=self.vector_store_dir, + embeddings=self.embedding_function, + ) + except Exception as e: + raise e + + def create_sigma_vectordb(self, save: bool = True): + """Creates Sigma rule vector store by performing the following actions: + 1. Load each Sigma rule from the local SigmaHQ Sigma rules repository as Documents + 2. Split each Sigma rule Document + 3. Embed each Sigma rule Document and store in VectorStore + 4. Save the vectordb (if arg set) to disk + + Each of these steps has its own associated method; override them if you would like to change its behavior, for + example, by using a different TextSplitter or VectorStore. + + Args: + save (bool, optional): If True, will save the VectorStore to disk. Defaults to True. + + """ + if not self.installed_tag: + self.update_sigma_rules() + + # Load Sigma docs + sigma_docs = self.create_sigma_rule_docs() + print(f"Loaded {len(sigma_docs)} Sigma rules") + # Split Sigma docs + sigma_docs = self.split_sigma_docs(sigma_docs) + # Create VectorStore + self.create_vectordb(sigma_docs) + print(f"Created Sigma vectordb at {self.vector_store_dir}") + # Save VectorStore + if save: + self.save_vectordb() + + def create_sigma_rule_docs(self) -> List[Document]: + """Generator to loads Sigma rules from the local SigmaHQ Sigma rules repository.""" + sigma_rule_docs = [] + sigma_rule_docs += self.rule_loader(self.rule_dir, glob="**/*.yml", loader_cls=TextLoader).load() + + return sigma_rule_docs + + def split_sigma_docs(self, sigma_docs) -> List[Document]: + """Splits Sigma rule Documents into chunks based on the DirectoryLoader provided on initialization. + By default, we don't want to split up rules much, as we want the whole rule embedded. If you want to split + rules into smaller chunks, override this method and return the chunks, or use your own text splitter in initialization. + + Args: + sigma_docs (List[Document]): The list of Sigma rule Documents to split. + + Returns: + List[Document]: The list of Sigma rule Documents, split into chunks. 
+ """ + # Override if needed + self.rule_splitter = self.rule_splitter(chunk_size=99999) # only chunk if rule is larger than 99999 characters + return self.rule_splitter.split_documents(sigma_docs) + + def create_vectordb(self, sigma_docs: List[Document]): + """Creates the VectorStore from the Sigma rule Documents. + Override `from_documents()` below with how your vector store class adds documents to the Vector DB""" + self.sigmadb = self.vector_store.from_documents(sigma_docs, self.embedding_function) # CHANGE ME IF NEEDED + + def save_vectordb(self, vectordb_path: str = None): + """Saves the VectorStore to disk. If no path is provided, will save to the path provided on initialization. + Override `save_local()` below with how your vector store class saves local Vector DBs + + Args: + vectordb_path (str, optional): The path to save the VectorStore. Defaults to None and will use the path provided on initialization. + """ + + self.sigmadb.save_local(self.vector_store_dir) + + @staticmethod + def _setup_vector_store_dir(vector_store_dir: str = None) -> str: + """Checks if the vector store directory exists. If not, creates it. + + Args: + vector_store_dir (str, optional): The directory to store the vector store. Defaults to None. + + Returns: + str: The vector store directory path. + """ + if not vector_store_dir: + vector_store_dir = DEFAULT_DIRS.VECTOR_STORE_DIR + if not os.path.exists(vector_store_dir): + os.makedirs(vector_store_dir) + return vector_store_dir + + def simple_search(self, query: str, k: int = 3) -> List[Document]: + """Searches the Sigma rule vector store for the query text using similarity search. + + Args: + query (str): The query text to search for. + k (int, optional): The number of results to return. Defaults to 3. + + Returns: + List[Document]: The top 'k' matching Sigma Rules from the search. + """ + if not self.sigmadb: + self.load_sigma_vectordb() + return self.sigmadb.similarity_search(query, k) From 7a786e2675646847bbdd492c680d78dbf40f924a Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:16:30 -0500 Subject: [PATCH 06/12] Added LLM README --- sigmaiq/llm/README.md | 132 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 sigmaiq/llm/README.md diff --git a/sigmaiq/llm/README.md b/sigmaiq/llm/README.md new file mode 100644 index 0000000..db9df8b --- /dev/null +++ b/sigmaiq/llm/README.md @@ -0,0 +1,132 @@ + +
+# SigmAIQ: LLM
+ + +NOTE: This is an experimental feature that is under active development. It is not recommended for production use. +By default, OpenAI embeddings and LLM models (gpt-3.5-turbo) are used, which require an OpenAI API key set in the environmental +variable `OPENAI_API_KEY`. + +## Overview +The goal of this SigmAIQ feature is to utilize the power of LLMs and Vector Databases with Sigma Rules. +This feature uses [langchain](https://github.com/langchain-ai/langchain) and [pySigma](https://github.com/SigmaHQ/pySigma) +to utilize LLMs and Agents for Sigma Rule translation and creation. +Currently, the use cases of this feature include: +- Embedding creation and storage of Sigma Rules +- Sigma Rule similarity searching +- Agent/Bot for Sigma Rule translation and creation + +Please see the `examples` folder for use case examples. + +### Embedding Creation and Storage +The `sigmaiq.llm.base.SigmaLLM` class is used to automatically download the latest Sigma Rules from the [SigmaHQ](https://github.com/SigmaHQ/sigma/releases/latest) repo. +By default, this downloads the `sigma_core` ruleset into this projects `data` directory. Embeddings are then created for each rule and stored in a Vector Database. +By default, `OpenAIEmbeddings` and `FAISS` are used, respectively. The `sigmaiq.llm.base.SigmaLLM` class can be extended to use different embedding and vector database implementations. + +### Sigma Rule Similarity Searching +The `sigmaiq.llm.base.SigmaLLM` class is also used to search for similar Sigma Rules using a similarity search. This does not require LLM models to be trained, as the embeddings are already created and stored in the Vector Database. +This can be a cheaper, yet less accurate option, for searching through Sigma Rules. By default, the top 3 matching rules +are returned based on the query sent to the similarity search. Other `langchain` `VectorStore` searching functionality can be used on the VectorStore as well. + +### Agent/Bot for Sigma Rule Translation and Creation +A `langchain` `Agent` can be created with the `create_sigma_agent()` function in `sigmaiq.llm.toolkits.base`. +This agent uses the tools contained in the `SigmaToolkit` class (in `sigmaiq/llm/toolkits/sigma_toolkit`) for various tasks. +The Agent will automatically determine what tools to use based on the query sent to it, and can run different tools in succession to complete a task. + +For rule translation, the Agent will automatically parse the contents of the user's query to determine what backend, pipeline, and output format +to use for the translation. The Agent will then create a `SigmAIQBackend` and translate the rule provided in the query. + +For rule creation, the Agent will first look for similar Sigma Rules in the local Sigma VectorStore (from `SigmaLLM`) and return +the top 3 best matching rules. The Agent will then use these matching rules as context, in addition to the context/IOCs in the user's question, +to create a brand new Sigma Rule! The Agent will then return the newly created Sigma Rule to the user. + + +#### Example Q&A +This example demonstrates how the agent can use multiple tools in succession; in this case, a Sigma Rule is first created +based on the user's question with the rule creation tool, then the rule is translated to a Microsoft 365 Defender query with the rule translation tool. +The Sigma Rule YAML can be found retrieved in the `intermediate_steps` of the output. 
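A minimal invocation sketch of that flow (assumes `OPENAI_API_KEY` is set and the Sigma rule vector store has already been created; see `examples/llm_rule_translation_and_creation.py` for the complete version):

```python
from sigmaiq.llm.base import SigmaLLM
from sigmaiq.llm.toolkits.base import create_sigma_agent

# Load the previously built Sigma rule vector store
sigma_llm = SigmaLLM()
sigma_llm.load_sigma_vectordb()

# return_intermediate_steps=True exposes the individual tool outputs,
# including the Sigma Rule YAML produced by the rule creation tool
sigma_agent = create_sigma_agent(sigma_vectorstore=sigma_llm.sigmadb, return_intermediate_steps=True)

answer = sigma_agent.invoke(
    {"input": "Create a Sigma rule for certutil downloading a file, then translate it to a Microsoft 365 Defender query."}
)
print(answer["output"])              # final answer from the agent
print(answer["intermediate_steps"])  # per-tool steps, including the created rule YAML
```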
+ +QUESTION: "Create a Windows process creation Sigma Rule for certutil downloading a file from definitely-not-malware.com, then translate it to a Microsoft 365 Defender query." + +ANSWER: + +Intermediate Step (Rule Creation): +```yaml +title: Windows Process Creation Event with certutil.exe Downloading from definitely-not-malware.com +description: Detects a Windows process creation event where certutil.exe downloads a file from definitely-not-malware.com +references: + - https://github.com/AttackIQ/SigmAIQ +author: SigmAIQ (AttackIQ) +date: 2022/12/06 +logsource: + category: process_creation + product: windows +detection: + selection: + Image|endswith: '\certutil.exe' + CommandLine|contains: 'definitely-not-malware.com' + condition: selection +falsepositives: + - Unknown +level: high +``` + +Final Output: + +Here is the translated Microsoft 365 Defender query: + +``` +DeviceProcessEvents +| where FolderPath endswith "\\certutil.exe" and ProcessCommandLine contains "definitely-not-malware.com" +``` + + +## Installation +Clone this repo, then install SigmAIQ dependencies along with the `llm` group dependencies +with your favorite Python package manager, such as pip or poetry. + +### pip +```bash +pip install -r requirements/common.txt -r requirements/llm.txt +``` + +### poetry +```bash +poetry install --with llm +``` + +## Usage +For usage examples, please see the `examples` directory. By default, OpenAI embeddings and LLM models are used, which +require an OpenAI API key set in the environmental variable `OPENAI_API_KEY`. + + +## Known Issues +- Agent parsing issues sometimes occur when invalid JSON is passed between agent steps. + + +## TODO +- Add example for using custom (and free) embeddings and LLM models +- Add example for using custom Vector Databases +- Add ability to easily customize prompts for tools/agents +- Sigma Rule Creation Tool without Vector Databases +- Adding metadata to Vector Database entries for advanced filtering on Sigma Rule fields + - I.E. category, product, level, status, etc + + +## License +This project is licensed under the terms of the GNU LGPL, version 2.1. Please see the `LICENSE` file for full details. + + +## Contributing +Contributions and use cases are welcome! Please submit a PR or issue if you would like to contribute or have any questions. + + +## Acknowledgements +First and foremost, we'd like to acknowledge the creators, maintainers, contributors, and everyone else involved with the +[Sigma](https://github.com/SigmaHQ/sigma/) and [pySigma](https://github.com/SigmaHQ/pySigma) projects for obvious reasons. + +We'd also like to acknowledge the [langchain](https://github.com/langchain-ai/langchain) project the work with making +LLMs more accessible and easier to use. 
\ No newline at end of file From adeb9ba854876754c216c2d6b8c17f3b4d1d6baf Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:16:46 -0500 Subject: [PATCH 07/12] Added examples for base wrapper and LLMs --- examples/__init__.py | 0 examples/custom_field_mappings.py | 49 ++++++++++++ examples/llm_basic_usage.py | 36 +++++++++ examples/llm_rule_translation_and_creation.py | 58 ++++++++++++++ examples/translate_sigma_rules.py | 77 +++++++++++++++++++ 5 files changed, 220 insertions(+) create mode 100644 examples/__init__.py create mode 100644 examples/custom_field_mappings.py create mode 100644 examples/llm_basic_usage.py create mode 100644 examples/llm_rule_translation_and_creation.py create mode 100644 examples/translate_sigma_rules.py diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/custom_field_mappings.py b/examples/custom_field_mappings.py new file mode 100644 index 0000000..61e49ed --- /dev/null +++ b/examples/custom_field_mappings.py @@ -0,0 +1,49 @@ +# %% This example shows how to use the SigmAIQ pySigma wrapper to provide custom field mappings for a backend +# %% This will allow you to translate specific field names to custom field names during rule translation + +# %% Import SigmAIQ +from sigmaiq import SigmAIQBackend, SigmAIQPipeline + +# %% Import pprint for pretty printing, and copy for copying rules +from pprint import pprint +from copy import copy + +# %% A basic Sigma Rule in YAML str to convert to a query. +# %% SigmAIQ also accepts a rule in JSON/Dict format, SigmaRule objects, and SigmaCollection objects + +sigma_rule = """ +title: whoami Command +description: Detects a basic whoami commandline execution +logsource: + product: windows + category: process_creation +detection: + selection1: + - CommandLine|contains: 'whoami.exe' + condition: selection1 +""" + +# %% Create SigmAIQ backend translate the rule to a Microsoft 365 Defender query +sigmaiq_backend = SigmAIQBackend(backend="splunk").create_backend() +query = sigmaiq_backend.translate(copy(sigma_rule)) # Returns List of queries + +print("\nM365Defender Query: ", end="\n\n") +pprint(query[0]) +print("\n-------------------") + +# %% Create custom field mappings +# %% This will map the CommandLine field to a custom field named "CustomCommandLine" +custom_field_mappings = {"CommandLine": "CustomCommandLine"} +my_custom_pipeline = SigmAIQPipeline.from_fieldmap(custom_field_mappings, priority=0).create_pipeline() + +# %% Create SigmAIQ backend translate the rule to a Microsoft 365 Defender query with our custom field mappings +sigmaiq_backend = SigmAIQBackend( + backend="splunk", + processing_pipeline=my_custom_pipeline).create_backend() + +query = sigmaiq_backend.translate(copy(sigma_rule)) # Returns List of queries + +print("\nM365Defender Query with Custom Fieldmappings: ", end="\n\n") +pprint(query[0]) +print("\n-------------------") + diff --git a/examples/llm_basic_usage.py b/examples/llm_basic_usage.py new file mode 100644 index 0000000..23202dd --- /dev/null +++ b/examples/llm_basic_usage.py @@ -0,0 +1,36 @@ +# %% This example will demonstrate how to use SigmAIQ to perform the following: +# %% 1. Download the latest Sigma Rule package release +# %% 2. Create embeddings of the Sigma Rules in the package +# %% 3. Create and save a VectorDB of the Sigma Rule embeddings +# %% 4. 
Use a similarity search on the VectorDB to find Sigma Rules similar to a provided query +from pprint import pprint + +# %% NOTE, this example uses OpenAI for embeddings. Ensure you have an OpenAI API key set in your environment variable +# %% OPENAI_API_KEY + +# %% Also ensure you have installed the correct requirements with: +# `pip install -r requirements/common.txt -r requirements/llm.txt` + + +# %% Import SigmAIQ LLM and OpenAIEmbeddings +from sigmaiq.llm.base import SigmaLLM + +# %% Create a SigmaLLM object with default settings. See the class docstring for more information +sigma_llm = SigmaLLM() + +# %% The `create_sigma_vectordb()` method will automatically do all the work for you :) (only run this once) +sigma_llm.create_sigma_vectordb(save=True) # Save locally to disk + +# %% Run a similarity search on the vectordb for encoded powershell commands and print top 3 results +query = "Encoded powershell commands" +matching_rules = sigma_llm.simple_search(query, k=3) +for matching_rule in matching_rules: + print(matching_rule.page_content, end="\n\n-------------------\n\n") + +# %% You can also load an existing vector store from disk (recommended) +sigma_llm.load_sigma_vectordb() + +query = "certutil" +matching_rules = sigma_llm.simple_search(query, k=3) +for matching_rule in matching_rules: + print(matching_rule.page_content, end="\n\n-------------------\n\n") diff --git a/examples/llm_rule_translation_and_creation.py b/examples/llm_rule_translation_and_creation.py new file mode 100644 index 0000000..98481b8 --- /dev/null +++ b/examples/llm_rule_translation_and_creation.py @@ -0,0 +1,58 @@ +# %% This example will demonstrate how to create a Sigma langchain agent chatbot, which can perform various tasks like +# %% automatically translate a rule for you, and create new rules from a users input. + +# %% Import required SigmAIQ classes and functions +from sigmaiq.llm.toolkits.base import create_sigma_agent +from sigmaiq.llm.base import SigmaLLM + +# %% Ensure we have our Sigma vector store setup with our base LLM class +sigma_llm = SigmaLLM() + +try: + sigma_llm.load_sigma_vectordb() +except Exception as e: + print(e) + print("Creating new Sigma VectorDB") + sigma_llm.create_sigma_vectordb(save=True) + +# %% Create a Sigma Agent Executor, and pass it our Sigma VectorDB +sigma_agent_executor = create_sigma_agent(sigma_vectorstore=sigma_llm.sigmadb) + +# %% RULE TRANSLATION +# %% Have the agent automatically translate a Sigma rule to a Splunk query with the splunk_cim_dm pipeline + +sigma_rule = r""" +title: whoami Command +description: Detects a basic whoami commandline execution +logsource: + product: windows + category: process_creation +detection: + selection1: + - CommandLine|contains: 'whoami.exe' + condition: selection1 +""" + +user_input = ("Translate the following Sigma rule to a Splunk query using the 'splunk_cim_dm' pipeline: \n\n" + + sigma_rule) + +# answer = sigma_agent_executor.invoke({"input": user_input}) +# print("\nRULE TRANSLATION:", end="\n\n") +#print(f"Question:\n {user_input}", end="\n\n") +#print(f"Answer: \n") +#print(answer.get('output'), end="\n\n") + +# %% RULE CREATION +# %% The agent will take the user input, look up similar Sigma Rules in the Sigma vector store, then create a brand +# %% new rule based on the context of the users input and the similar Sigma Rules. 
+ +user_input = ("Create a Windows process creation Sigma Rule for certutil downloading a file " + "from definitely-not-malware.com, then translate it to a Microsoft 365 Defender query.") + +answer = sigma_agent_executor.invoke({"input": user_input}) +print("\nRULE CREATION:", end="\n\n") +print(f"Question:\n {user_input}", end="\n\n") +print(f"Answer: \n") +print(answer.get('output'), end="\n\n") + + diff --git a/examples/translate_sigma_rules.py b/examples/translate_sigma_rules.py new file mode 100644 index 0000000..3cb1ef8 --- /dev/null +++ b/examples/translate_sigma_rules.py @@ -0,0 +1,77 @@ +# %% This example shows how to use the SigmAIQ pySigma wrapper to easily translate Sigma rules to queries +# %% easily, without having to worry about installing and configuring the correct backends, pipelines and other details. + + +# %% Import SigmAIQ +from sigmaiq import SigmAIQBackend + +# %% Import pprint for pretty printing, and copy for copying rules +from pprint import pprint +from copy import copy + +# %% A basic Sigma Rule in YAML str to convert to a query. +# %% SigmAIQ also accepts a rule in JSON/Dict format, SigmaRule objects, and SigmaCollection objects + +sigma_rule = """ +title: whoami Command +description: Detects a basic whoami commandline execution +logsource: + product: windows + category: process_creation +detection: + selection1: + - CommandLine|contains: 'whoami.exe' + condition: selection1 +""" + +# %% BACKENDS +# %% Show the available supported backends +print("Supported Backends:", end="\n\n") +pprint(SigmAIQBackend.display_available_backends()) +print("\n-------------------") + +# %% Create SigmAIQ backend translate the rule to a Microsoft 365 Defender query +# %% SigmAIQ will automatically select the best pipeline for the backend +sigmaiq_backend = SigmAIQBackend(backend="microsoft365defender").create_backend() +query = sigmaiq_backend.translate(copy(sigma_rule)) # Returns List of queries + +print("\nM365Defender Query: ", end="\n\n") +pprint(query[0]) +print("\n-------------------") + +# %% PIPELINES +# %% Show the available pipelines with each backend +print("Available Pipelines:", end="\n\n") +pprint(SigmAIQBackend.display_all_associated_pipelines()) +print("\n-------------------") + +# %% Create SigmAIQ backend translate the rule to a Splunk search with the CIM pipeline +sigmaiq_backend = SigmAIQBackend(backend="splunk", processing_pipeline="splunk_cim_dm").create_backend() +query = sigmaiq_backend.translate(copy(sigma_rule)) + +print("\nSplunk CIM Query: ", end="\n\n") +pprint(query[0]) +print("\n-------------------") + +# %% OUTPUT FORMATS +# %% Show the available output formats with each backend +print("\nAvailable Output Formats:", end="\n\n") +pprint(SigmAIQBackend.display_backends_and_outputs()) +print("\n-------------------") + +# %% Change the output_format to an Enterprise Security Correlation Search stanza +sigmaiq_backend.set_output_format("stanza") +query = sigmaiq_backend.translate(copy(sigma_rule)) + +print("\nSplunk CIM Query, Stanza Output: ", end="\n\n") +pprint(query[0]) +print("\n-------------------") + + +# %% You can also translate a Sigma rule to all supported backend, pipeline, and output format combinations at once. 
+# %% Any combination that is not supported will not be included in the results +# %% This is useful for testing and comparing the output of different backends and pipelines +queries = SigmAIQBackend.create_all_and_translate(copy(sigma_rule)) + +print("\n All Translations: ", end="\n\n") +pprint(queries) \ No newline at end of file From bc608611bfda1c3920690d1a81812728b98bdb1c Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:17:15 -0500 Subject: [PATCH 08/12] Added required libs to pyproject and requirements. Separated requirements for llm --- pyproject.toml | 12 ++++++--- requirements.txt | 26 +----------------- requirements/common.txt | 25 +++++++++++++++++ requirements/llm.txt | 60 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 29 deletions(-) create mode 100644 requirements/common.txt create mode 100644 requirements/llm.txt diff --git a/pyproject.toml b/pyproject.toml index 3e260cd..ddb407d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.8" +python = ">=3.8.1, <=3.11.6" pysigma = "0.9.11" certifi = "^2023.07.22" pysigma-backend-carbonblack = "0.1.4" @@ -39,16 +39,20 @@ pysigma-pipeline-sysmon = "1.0.2" pysigma-pipeline-windows = "1.1.0" importlib-resources = "^5.13.0" - - - [tool.poetry.dev-dependencies] pytest = "^7.4.0" pytest-cov = "^4.1.0" black = "^23.7.0" ruff = "^0.0.286" +[tool.poetry.group.llm] +optional = true +[tool.poetry.group.llm.dependencies] +langchain = "^0.0.335" +openai = "^1.2.4" +tiktoken = "^0.5.1" +faiss-cpu = "^1.7.4" [build-system] requires = ["poetry-core"] diff --git a/requirements.txt b/requirements.txt index 8680540..5a3f151 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,25 +1 @@ -certifi==2023.7.22 ; python_version >= "3.8" and python_version < "4.0" -charset-normalizer==3.2.0 ; python_version >= "3.8" and python_version < "4.0" -idna==3.4 ; python_version >= "3.8" and python_version < "4.0" -importlib-resources==5.13.0 ; python_version >= "3.8" and python_version < "4.0" -packaging==22.0 ; python_version >= "3.8" and python_version < "4.0" -pyparsing==3.1.1 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-carbonblack==0.1.4 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-cortexxdr==0.1.1 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-elasticsearch==1.0.5 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-insightidr==0.2.1 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-loki==0.9.1 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-microsoft365defender==0.2.1 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-opensearch==1.0.0 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-qradar-aql==0.2.3 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-sentinelone==0.1.2 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-splunk==1.0.2 ; python_version >= "3.8" and python_version < "4.0" -pysigma-backend-stix==0.1.8 ; python_version >= "3.8" and python_version < "4.0" -pysigma-pipeline-crowdstrike==1.0.0 ; python_version >= "3.8" and python_version < "4.0" -pysigma-pipeline-sysmon==1.0.2 ; python_version >= "3.8" and python_version < "4.0" -pysigma-pipeline-windows==1.1.0 ; python_version >= "3.8" and python_version < "4.0" -pysigma==0.9.11 ; python_version >= "3.8" and python_version < "4.0" -pyyaml==6.0.1 ; python_version >= 
"3.8" and python_version < "4.0" -requests==2.31.0 ; python_version >= "3.8" and python_version < "4.0" -urllib3==2.0.4 ; python_version >= "3.8" and python_version < "4.0" -zipp==3.16.2 ; python_version >= "3.8" and python_version < "3.10" +-r requirements/common.txt \ No newline at end of file diff --git a/requirements/common.txt b/requirements/common.txt new file mode 100644 index 0000000..a84902c --- /dev/null +++ b/requirements/common.txt @@ -0,0 +1,25 @@ +certifi==2023.7.22 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +charset-normalizer==3.3.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +idna==3.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +importlib-resources==5.13.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +packaging==22.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pyparsing==3.1.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-carbonblack==0.1.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-cortexxdr==0.1.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-elasticsearch==1.0.5 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-insightidr==0.2.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-loki==0.9.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-microsoft365defender==0.2.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-opensearch==1.0.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-qradar-aql==0.2.3 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-sentinelone==0.1.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-splunk==1.0.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-backend-stix==0.1.8 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-pipeline-crowdstrike==1.0.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-pipeline-sysmon==1.0.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma-pipeline-windows==1.1.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pysigma==0.9.11 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pyyaml==6.0.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +requests==2.31.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +urllib3==2.1.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +zipp==3.17.0 ; python_full_version >= "3.8.1" and python_version < "3.10" diff --git a/requirements/llm.txt b/requirements/llm.txt new file mode 100644 index 0000000..565b617 --- /dev/null +++ b/requirements/llm.txt @@ -0,0 +1,60 @@ +aiohttp==3.8.6 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +aiosignal==1.3.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +annotated-types==0.6.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +anyio==3.7.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +async-timeout==4.0.3 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +attrs==23.1.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" 
+backoff==2.2.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +beautifulsoup4==4.12.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +certifi==2023.7.22 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +chardet==5.2.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +charset-normalizer==3.3.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +click==8.1.7 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +colorama==0.4.6 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" and platform_system == "Windows" +dataclasses-json==0.6.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +distro==1.8.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +emoji==2.8.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +exceptiongroup==1.1.3 ; python_full_version >= "3.8.1" and python_version < "3.11" +faiss-cpu==1.7.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +filetype==1.2.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +frozenlist==1.4.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +greenlet==3.0.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" and (platform_machine == "win32" or platform_machine == "WIN32" or platform_machine == "AMD64" or platform_machine == "amd64" or platform_machine == "x86_64" or platform_machine == "ppc64le" or platform_machine == "aarch64") +h11==0.14.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +httpcore==1.0.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +httpx==0.25.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +idna==3.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +joblib==1.3.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +jsonpatch==1.33 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +jsonpointer==2.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +langchain==0.0.335 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +langdetect==1.0.9 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +langsmith==0.0.64 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +lxml==4.9.3 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +marshmallow==3.20.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +multidict==6.0.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +mypy-extensions==1.0.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +nltk==3.8.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +numpy==1.24.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +openai==1.2.4 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +packaging==22.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pydantic-core==2.14.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pydantic==2.5.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +python-iso639==2023.6.15 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +python-magic==0.4.27 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +pyyaml==6.0.1 ; python_full_version >= "3.8.1" and python_full_version <= 
"3.11.6" +rapidfuzz==3.5.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +regex==2023.10.3 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +requests==2.31.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +six==1.16.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +sniffio==1.3.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +soupsieve==2.5 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +sqlalchemy==2.0.23 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +tabulate==0.9.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +tenacity==8.2.3 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +tiktoken==0.5.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +tqdm==4.66.1 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +typing-extensions==4.8.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +typing-inspect==0.9.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +unstructured==0.10.30 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +urllib3==2.1.0 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" +yarl==1.9.2 ; python_full_version >= "3.8.1" and python_full_version <= "3.11.6" From 5267e2c62450a8ba09a4eebcdcece29cbd7077c2 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:17:32 -0500 Subject: [PATCH 09/12] Updated gitignore for data dirs --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1e627b6..ffe0686 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ +# Custom data folder for vectordb and sigma rules +sigmaiq/llm/data/* \ No newline at end of file From c72804b0173665c7376b011a972c34444adf6407 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 14:17:48 -0500 Subject: [PATCH 10/12] Updated README for LLM --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 15650df..48ad708 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,9 @@ encountered. Please report any issues [here](https://github.com/AttackIQ/SigmAIQ Feature requests are also always welcome! pySigma tools/utils are currently not in the pre-release version, and will be added in future releases. +# LLM Support +For LLM usage, see the [LLM README](sigmaiq/llm/README.md) + # Installation & Usage ## Installation From e917b54f743472a288cbc87af7fbac2f44f130f6 Mon Sep 17 00:00:00 2001 From: Stephen Lincoln Date: Fri, 17 Nov 2023 16:15:45 -0500 Subject: [PATCH 11/12] Edited README --- sigmaiq/llm/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sigmaiq/llm/README.md b/sigmaiq/llm/README.md index db9df8b..a4fbe65 100644 --- a/sigmaiq/llm/README.md +++ b/sigmaiq/llm/README.md @@ -90,7 +90,8 @@ with your favorite Python package manager, such as pip or poetry. ### pip ```bash -pip install -r requirements/common.txt -r requirements/llm.txt +pip install -e . +pip install -r requirements/llm.txt ``` ### poetry @@ -98,6 +99,7 @@ pip install -r requirements/common.txt -r requirements/llm.txt poetry install --with llm ``` + ## Usage For usage examples, please see the `examples` directory. 
By default, OpenAI embeddings and LLM models are used, which require an OpenAI API key set in the environment variable `OPENAI_API_KEY`.

From cd0e6532f861ffc403de3667cbba2cb6a0618f5f Mon Sep 17 00:00:00 2001
From: Stephen Lincoln
Date: Fri, 17 Nov 2023 16:19:24 -0500
Subject: [PATCH 12/12] Increased minor version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ddb407d..ee52874 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "SigmAIQ"
-version = "0.2.4"
+version = "0.3.0"
 description = "Wrapper and tools for pySigma and Sigma rules"
 authors = ["Stephen Lincoln ", "AttackIQ "]
 readme = "README.md"
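To make the `OPENAI_API_KEY` requirement from the LLM README concrete, here is a minimal pre-flight sketch. It assumes nothing beyond the Python standard library and the environment-variable convention stated above; the error message and the use of `SystemExit` are illustrative choices, not part of the package.

```python
import os

# Minimal sketch: confirm the OpenAI key is available before running any of the
# LLM examples installed via requirements/llm.txt or `poetry install --with llm`.
# The variable name OPENAI_API_KEY follows the LLM README; everything else here
# is an illustrative assumption.
if not os.environ.get("OPENAI_API_KEY"):
    raise SystemExit(
        "OPENAI_API_KEY is not set; export it before using the SigmAIQ LLM features."
    )

print("OPENAI_API_KEY found - the scripts in the `examples` directory should be able to authenticate.")
```

A check like this can sit at the top of an example script so a missing key fails fast with a clear message instead of surfacing later as an authentication error from the OpenAI client.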