From 6bb11d6012d17987d25ec5767758480bbc82686e Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Fri, 7 Jun 2024 13:10:12 +0200 Subject: [PATCH 01/17] feat: auto-generate ruleset cache on source change --- capa/rules/cache.py | 89 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 6f87570ef..27d7ecf54 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -9,8 +9,10 @@ import sys import zlib import pickle +import shutil import hashlib import logging +import subprocess from typing import List, Optional from pathlib import Path from dataclasses import dataclass @@ -26,6 +28,58 @@ CacheIdentifier = str +def is_dev_environment() -> bool: + if getattr(sys, "frozen", False): + # running as a PyInstaller executable + return False + + if "site-packages" in __file__: + # running from a site-packages installation + return False + + if not shutil.which("git"): + # git is found, but might not be always be in PATH + # we should handle this case + return False + + return True + + +def get_modified_files() -> List[str]: + try: + # use git status to retrieve tracked modified files + result = subprocess.run( + ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], + capture_output=True, + text=True, + check=True, + ) + + # retrieve .py source files + # ' M': the file has staged modifications + # 'M ': the file has unstaged modifications + # 'MM': the file has both staged and unstaged modifications + files = [] + for line in result.stdout.splitlines(): + if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): + file_path = line[3:] + files.append(file_path) + + return files + except (subprocess.CalledProcessError, FileNotFoundError): + return [] + + +def get_git_commit_hash() -> Optional[str]: + try: + result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) + commit_hash = result.stdout.strip() + logger.debug("git commit hash %s", commit_hash) + return commit_hash + except (subprocess.CalledProcessError, FileNotFoundError): + return None + + def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: hash = hashlib.sha256() @@ -107,6 +161,41 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) + + if is_dev_environment(): + modified_files = get_modified_files() + commit_hash = get_git_commit_hash() + + if modified_files or commit_hash: + hash = hashlib.sha256() + hash.update(capa.version.__version__.encode("utf-8")) + hash.update(b"\x00") + + for file in modified_files: + try: + with Path(file).open("rb") as f: + file_content = f.read() + logger.debug("found modified source py %s", file) + hash.update(file_content) + hash.update(b"\x00") + except FileNotFoundError as e: + logger.error("modified file not found: %s", file) + logger.error("%s", e) + + if commit_hash: + hash.update(commit_hash.encode("ascii")) + hash.update(b"\x00") + + # include the hash of the rule contents + rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") + + logger.debug( + "developer environment detected, ruleset cache will be auto-generated upon each source modification" + ) + return hash.hexdigest() return compute_cache_identifier(rule_contents) From 
11aa2d9124ba95c84b1e5f7d8e962d9b15acbd36 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Fri, 7 Jun 2024 16:27:25 +0200 Subject: [PATCH 02/17] check if git dir exists, and return sorted modified file paths --- CHANGELOG.md | 4 ++++ capa/rules/cache.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 531aaa758..5e5d30ffc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,9 +48,13 @@ Special thanks to our repeat and new contributors: - render maec/* fields #843 @s-ff - replace Halo spinner with Rich #2086 @s-ff - optimize rule matching #2080 @williballenthin +<<<<<<< HEAD - add aarch64 as a valid architecture #2144 mehunhoff@google.com @williballenthin - relax dependency version requirements for the capa library #2053 @williballenthin - add scripts dependency group and update documentation #2145 @mr-tz +======= +- regenerate ruleset cache automatically on source change (only in dev mode) #2133 @s-ff +>>>>>>> 699f49d2 (check if git dir exists, and return sorted modified file paths) ### New Rules (25) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 27d7ecf54..763d336ff 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -28,24 +28,31 @@ CacheIdentifier = str -def is_dev_environment() -> bool: +def is_dev_environment() -> Optional[Path]: if getattr(sys, "frozen", False): # running as a PyInstaller executable - return False + return None if "site-packages" in __file__: # running from a site-packages installation - return False + return None - if not shutil.which("git"): - # git is found, but might not be always be in PATH - # we should handle this case - return False + capa_root = Path(__file__).resolve().parent.parent + git_dir = capa_root / ".git" - return True + if not git_dir.is_dir(): + # .git directory doesn't exist + return None + + git_exe = shutil.which("git") + if not git_exe: + # git is not found in PATH + return None + return Path(git_exe) -def get_modified_files() -> List[str]: + +def get_modified_files() -> List[Path]: try: # use git status to retrieve tracked modified files result = subprocess.run( @@ -59,13 +66,13 @@ def get_modified_files() -> List[str]: # ' M': the file has staged modifications # 'M ': the file has unstaged modifications # 'MM': the file has both staged and unstaged modifications - files = [] + files: List[Path] = [] for line in result.stdout.splitlines(): if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = line[3:] + file_path = Path(line[3:]) files.append(file_path) - return files + return sorted(files) except (subprocess.CalledProcessError, FileNotFoundError): return [] @@ -173,11 +180,10 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti for file in modified_files: try: - with Path(file).open("rb") as f: - file_content = f.read() - logger.debug("found modified source py %s", file) - hash.update(file_content) - hash.update(b"\x00") + file_content = file.read_bytes() + logger.debug("found modified source file %s", file) + hash.update(file_content) + hash.update(b"\x00") except FileNotFoundError as e: logger.error("modified file not found: %s", file) logger.error("%s", e) @@ -196,6 +202,7 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti "developer environment detected, ruleset cache will be auto-generated upon each source modification" ) return hash.hexdigest() + return compute_cache_identifier(rule_contents) @@ -262,8 +269,7 @@ def 
generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: logger.error("%s", str(e)) return False - content = capa.rules.cache.get_ruleset_content(rules) - id = capa.rules.cache.compute_cache_identifier(content) + id = capa.rules.cache.compute_ruleset_cache_identifier(rules) path = capa.rules.cache.get_cache_path(cache_dir, id) assert path.exists() From b0fd6451f2fc1337a48dfadbd49ebdeed89a9c87 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Mon, 10 Jun 2024 03:42:20 +0200 Subject: [PATCH 03/17] move dev/git logic to capa.rules.utils --- capa/rules/cache.py | 122 ++++++++++++-------------------------------- capa/rules/utils.py | 74 +++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 89 deletions(-) create mode 100644 capa/rules/utils.py diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 763d336ff..312a756d0 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -9,10 +9,8 @@ import sys import zlib import pickle -import shutil import hashlib import logging -import subprocess from typing import List, Optional from pathlib import Path from dataclasses import dataclass @@ -20,6 +18,7 @@ import capa.rules import capa.helpers import capa.version +import capa.rules.utils logger = logging.getLogger(__name__) @@ -28,65 +27,6 @@ CacheIdentifier = str -def is_dev_environment() -> Optional[Path]: - if getattr(sys, "frozen", False): - # running as a PyInstaller executable - return None - - if "site-packages" in __file__: - # running from a site-packages installation - return None - - capa_root = Path(__file__).resolve().parent.parent - git_dir = capa_root / ".git" - - if not git_dir.is_dir(): - # .git directory doesn't exist - return None - - git_exe = shutil.which("git") - if not git_exe: - # git is not found in PATH - return None - - return Path(git_exe) - - -def get_modified_files() -> List[Path]: - try: - # use git status to retrieve tracked modified files - result = subprocess.run( - ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], - capture_output=True, - text=True, - check=True, - ) - - # retrieve .py source files - # ' M': the file has staged modifications - # 'M ': the file has unstaged modifications - # 'MM': the file has both staged and unstaged modifications - files: List[Path] = [] - for line in result.stdout.splitlines(): - if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = Path(line[3:]) - files.append(file_path) - - return sorted(files) - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - -def get_git_commit_hash() -> Optional[str]: - try: - result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) - commit_hash = result.stdout.strip() - logger.debug("git commit hash %s", commit_hash) - return commit_hash - except (subprocess.CalledProcessError, FileNotFoundError): - return None - - def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: hash = hashlib.sha256() @@ -169,39 +109,43 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - if is_dev_environment(): - modified_files = get_modified_files() - commit_hash = get_git_commit_hash() + try: + if capa.rules.utils.is_dev_environment(): + modified_files = capa.rules.utils.get_modified_files() + commit_hash = capa.rules.utils.get_git_commit_hash() - if modified_files or commit_hash: - hash = hashlib.sha256() - 
hash.update(capa.version.__version__.encode("utf-8")) - hash.update(b"\x00") + if modified_files or commit_hash: + hash = hashlib.sha256() + hash.update(capa.version.__version__.encode("utf-8")) + hash.update(b"\x00") - for file in modified_files: - try: - file_content = file.read_bytes() - logger.debug("found modified source file %s", file) - hash.update(file_content) + for file in modified_files: + try: + file_content = file.read_bytes() + logger.debug("found modified source file %s", file) + hash.update(file_content) + hash.update(b"\x00") + except FileNotFoundError as e: + logger.error("modified file not found: %s", file) + logger.error("%s", e) + + if commit_hash: + hash.update(commit_hash.encode("ascii")) hash.update(b"\x00") - except FileNotFoundError as e: - logger.error("modified file not found: %s", file) - logger.error("%s", e) - - if commit_hash: - hash.update(commit_hash.encode("ascii")) - hash.update(b"\x00") - # include the hash of the rule contents - rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) - for rule_hash in rule_hashes: - hash.update(rule_hash.encode("ascii")) - hash.update(b"\x00") + # include the hash of the rule contents + rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") - logger.debug( - "developer environment detected, ruleset cache will be auto-generated upon each source modification" - ) - return hash.hexdigest() + logger.debug( + "developer environment detected, ruleset cache will be auto-generated upon each source modification" + ) + return hash.hexdigest() + except Exception as e: + logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e)) + logger.warning("falling back to default cache identifier based on rules contents") return compute_cache_identifier(rule_contents) diff --git a/capa/rules/utils.py b/capa/rules/utils.py new file mode 100644 index 000000000..2b353b0d8 --- /dev/null +++ b/capa/rules/utils.py @@ -0,0 +1,74 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import sys
+import shutil
+import logging
+import subprocess
+from typing import List, Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def is_dev_environment() -> bool:
+    if getattr(sys, "frozen", False):
+        # running as a PyInstaller executable
+        return False
+
+    if "site-packages" in __file__:
+        # running from a site-packages installation
+        return False
+
+    capa_root = Path(__file__).resolve().parent.parent.parent
+    git_dir = capa_root / ".git"
+
+    if not git_dir.is_dir():
+        # .git directory doesn't exist
+        return False
+
+    git_exe = shutil.which("git")
+    if not git_exe:
+        # git is not found in PATH
+        return False
+
+    return True
+
+
+def get_modified_files() -> List[Path]:
+    try:
+        # use git status to retrieve tracked modified files
+        result = subprocess.run(
+            ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+
+        # retrieve .py source files
+        # ' M': the file has staged modifications
+        # 'M ': the file has unstaged modifications
+        # 'MM': the file has both staged and unstaged modifications
+        files: List[Path] = []
+        for line in result.stdout.splitlines():
+            if line.startswith(("M ", "MM", " M")) and line.endswith(".py"):
+                file_path = Path(line[3:])
+                files.append(file_path)
+
+        return sorted(files)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return []
+
+
+def get_git_commit_hash() -> Optional[str]:
+    try:
+        result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
+        commit_hash = result.stdout.strip()
+        logger.debug("git commit hash %s", commit_hash)
+        return commit_hash
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None

From b453f364a08e006b9bcd152092556004112578af Mon Sep 17 00:00:00 2001
From: Soufiane Fariss
Date: Mon, 10 Jun 2024 13:41:41 +0200
Subject: [PATCH 04/17] refactor: group compute cache id logic into 1 function

This commit groups the cache identifier computation logic into a single
function: compute_cache_identifier. Initially, the cache id computation
was split into two functions, compute_ruleset_cache_identifier and
compute_cache_identifier.
---
 capa/rules/cache.py | 84 ++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/capa/rules/cache.py b/capa/rules/cache.py
index 312a756d0..db484a941 100644
--- a/capa/rules/cache.py
+++ b/capa/rules/cache.py
@@ -28,6 +28,49 @@ def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier:
+    # is this a development environment?
+ # if yes, include the modified files contents and/or commit hash + # in computing the cache identifier + try: + if capa.rules.utils.is_dev_environment(): + modified_files = capa.rules.utils.get_modified_files() + commit_hash = capa.rules.utils.get_git_commit_hash() + + if modified_files or commit_hash: + hash = hashlib.sha256() + hash.update(capa.version.__version__.encode("utf-8")) + hash.update(b"\x00") + + for file in modified_files: + try: + file_content = file.read_bytes() + logger.debug("found modified source file %s", file) + hash.update(file_content) + hash.update(b"\x00") + except FileNotFoundError as e: + logger.error("modified file not found: %s", file) + logger.error("%s", e) + + if commit_hash: + hash.update(commit_hash.encode("ascii")) + hash.update(b"\x00") + + # include the hash of the rule contents + rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_content]) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") + + logger.debug( + "developer environment detected, ruleset cache will be auto-generated upon each source modification" + ) + return hash.hexdigest() + except Exception as e: + logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e)) + logger.warning("falling back to default cache identifier based on rules contents") + + # this is not a development environment, only use rule contents in + # computing the cache identifier hash = hashlib.sha256() # note that this changes with each release, @@ -109,44 +152,6 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - try: - if capa.rules.utils.is_dev_environment(): - modified_files = capa.rules.utils.get_modified_files() - commit_hash = capa.rules.utils.get_git_commit_hash() - - if modified_files or commit_hash: - hash = hashlib.sha256() - hash.update(capa.version.__version__.encode("utf-8")) - hash.update(b"\x00") - - for file in modified_files: - try: - file_content = file.read_bytes() - logger.debug("found modified source file %s", file) - hash.update(file_content) - hash.update(b"\x00") - except FileNotFoundError as e: - logger.error("modified file not found: %s", file) - logger.error("%s", e) - - if commit_hash: - hash.update(commit_hash.encode("ascii")) - hash.update(b"\x00") - - # include the hash of the rule contents - rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) - for rule_hash in rule_hashes: - hash.update(rule_hash.encode("ascii")) - hash.update(b"\x00") - - logger.debug( - "developer environment detected, ruleset cache will be auto-generated upon each source modification" - ) - return hash.hexdigest() - except Exception as e: - logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e)) - logger.warning("falling back to default cache identifier based on rules contents") - return compute_cache_identifier(rule_contents) @@ -213,7 +218,8 @@ def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: logger.error("%s", str(e)) return False - id = capa.rules.cache.compute_ruleset_cache_identifier(rules) + content = capa.rules.cache.get_ruleset_content(rules) + id = capa.rules.cache.compute_cache_identifier(content) path = capa.rules.cache.get_cache_path(cache_dir, id) assert path.exists() From d78db767bff249425993e600e3c089c6f03ed998 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 3 Jul 2024 
13:03:51 +0000 Subject: [PATCH 05/17] simplify cache/dev check --- capa/helpers.py | 19 +++++++++++ capa/main.py | 13 +++++++- capa/rules/__init__.py | 9 +++-- capa/rules/cache.py | 43 ------------------------ capa/rules/utils.py | 74 ------------------------------------------ 5 files changed, 37 insertions(+), 121 deletions(-) delete mode 100644 capa/rules/utils.py diff --git a/capa/helpers.py b/capa/helpers.py index 77380c7ed..9f487d190 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -238,3 +238,22 @@ def is_running_standalone() -> bool: # so we keep this in a common area. # generally, other library code should not use this function. return hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS") + + +def is_dev_environment() -> bool: + if is_running_standalone(): + return False + + if "site-packages" in __file__: + # running from a site-packages installation + # we may need to double check this + return False + + capa_root = Path(__file__).resolve().parent.parent + git_dir = capa_root / ".git" + + if not git_dir.is_dir(): + # .git directory doesn't exist + return False + + return True diff --git a/capa/main.py b/capa/main.py index eb43769d2..1c6ff447b 100644 --- a/capa/main.py +++ b/capa/main.py @@ -16,6 +16,7 @@ import argparse import textwrap import contextlib +from glob import glob from types import TracebackType from typing import Any, Dict, List, Optional from pathlib import Path @@ -565,13 +566,23 @@ def get_rules_from_cli(args) -> RuleSet: raises: ShouldExitError: if the program is invoked incorrectly and should exit. """ + enable_cache: bool = True try: if capa.helpers.is_running_standalone() and args.is_default_rules: cache_dir = get_default_root() / "cache" else: cache_dir = capa.rules.cache.get_default_cache_directory() - rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir) + if capa.helpers.is_dev_environment(): + # get newest cache + newest_cache_ts = max([os.path.getmtime(f) for f in glob(f"{cache_dir}/*.cache")]) + for f in glob("rules/*.py"): + print(f) + if newest_cache_ts > os.path.getmtime(f): + logger.warning("found a modified source file {f} that's newer than the most recent cache") + enable_cache: bool = False + + rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir, enable_cache=enable_cache) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) logger.error( diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 3deff533f..d8d44b68a 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -2123,12 +2123,14 @@ def get_rules( rule_paths: List[RulePath], cache_dir=None, on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default, + enable_cache: bool = True, ) -> RuleSet: """ args: rule_paths: list of paths to rules files or directories containing rules files cache_dir: directory to use for caching rules, or will use the default detected cache directory if None on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation + enable_cache: enable loading of a cached ruleset """ if cache_dir is None: cache_dir = capa.rules.cache.get_default_cache_directory() @@ -2140,9 +2142,10 @@ def get_rules( # rule_file_paths[i] corresponds to rule_contents[i]. 
rule_contents = [file_path.read_bytes() for file_path in rule_file_paths] - ruleset = capa.rules.cache.load_cached_ruleset(cache_dir, rule_contents) - if ruleset is not None: - return ruleset + if enable_cache: + ruleset = capa.rules.cache.load_cached_ruleset(cache_dir, rule_contents) + if ruleset is not None: + return ruleset rules: List[Rule] = [] diff --git a/capa/rules/cache.py b/capa/rules/cache.py index db484a941..16761e9da 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -18,7 +18,6 @@ import capa.rules import capa.helpers import capa.version -import capa.rules.utils logger = logging.getLogger(__name__) @@ -28,47 +27,6 @@ def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: - # is this a development environment? - # if yes, include the modified files contents and/or commit hash - # in computing the cache identifier - try: - if capa.rules.utils.is_dev_environment(): - modified_files = capa.rules.utils.get_modified_files() - commit_hash = capa.rules.utils.get_git_commit_hash() - - if modified_files or commit_hash: - hash = hashlib.sha256() - hash.update(capa.version.__version__.encode("utf-8")) - hash.update(b"\x00") - - for file in modified_files: - try: - file_content = file.read_bytes() - logger.debug("found modified source file %s", file) - hash.update(file_content) - hash.update(b"\x00") - except FileNotFoundError as e: - logger.error("modified file not found: %s", file) - logger.error("%s", e) - - if commit_hash: - hash.update(commit_hash.encode("ascii")) - hash.update(b"\x00") - - # include the hash of the rule contents - rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_content]) - for rule_hash in rule_hashes: - hash.update(rule_hash.encode("ascii")) - hash.update(b"\x00") - - logger.debug( - "developer environment detected, ruleset cache will be auto-generated upon each source modification" - ) - return hash.hexdigest() - except Exception as e: - logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e)) - logger.warning("falling back to default cache identifier based on rules contents") - # this is not a development environment, only use rule contents in # computing the cache identifier hash = hashlib.sha256() @@ -151,7 +109,6 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - return compute_cache_identifier(rule_contents) diff --git a/capa/rules/utils.py b/capa/rules/utils.py deleted file mode 100644 index 2b353b0d8..000000000 --- a/capa/rules/utils.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. 
-import sys -import shutil -import logging -import subprocess -from typing import List, Optional -from pathlib import Path - -logger = logging.getLogger(__name__) - - -def is_dev_environment() -> bool: - if getattr(sys, "frozen", False): - # running as a PyInstaller executable - return False - - if "site-packages" in __file__: - # running from a site-packages installation - return False - - capa_root = Path(__file__).resolve().parent.parent.parent - git_dir = capa_root / ".git" - - if not git_dir.is_dir(): - # .git directory doesn't exist - return False - - git_exe = shutil.which("git") - if not git_exe: - # git is not found in PATH - return False - - return True - - -def get_modified_files() -> List[Path]: - try: - # use git status to retrieve tracked modified files - result = subprocess.run( - ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], - capture_output=True, - text=True, - check=True, - ) - - # retrieve .py source files - # ' M': the file has staged modifications - # 'M ': the file has unstaged modifications - # 'MM': the file has both staged and unstaged modifications - files: List[Path] = [] - for line in result.stdout.splitlines(): - if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = Path(line[3:]) - files.append(file_path) - - return sorted(files) - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - -def get_git_commit_hash() -> Optional[str]: - try: - result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) - commit_hash = result.stdout.strip() - logger.debug("git commit hash %s", commit_hash) - return commit_hash - except (subprocess.CalledProcessError, FileNotFoundError): - return None From 0257948b87e398cfec68a3fc89c5444db31a67e0 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Fri, 7 Jun 2024 13:10:12 +0200 Subject: [PATCH 06/17] feat: auto-generate ruleset cache on source change --- capa/rules/cache.py | 89 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 16761e9da..af62e1202 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -9,8 +9,10 @@ import sys import zlib import pickle +import shutil import hashlib import logging +import subprocess from typing import List, Optional from pathlib import Path from dataclasses import dataclass @@ -26,6 +28,58 @@ CacheIdentifier = str +def is_dev_environment() -> bool: + if getattr(sys, "frozen", False): + # running as a PyInstaller executable + return False + + if "site-packages" in __file__: + # running from a site-packages installation + return False + + if not shutil.which("git"): + # git is found, but might not be always be in PATH + # we should handle this case + return False + + return True + + +def get_modified_files() -> List[str]: + try: + # use git status to retrieve tracked modified files + result = subprocess.run( + ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], + capture_output=True, + text=True, + check=True, + ) + + # retrieve .py source files + # ' M': the file has staged modifications + # 'M ': the file has unstaged modifications + # 'MM': the file has both staged and unstaged modifications + files = [] + for line in result.stdout.splitlines(): + if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): + file_path = line[3:] + files.append(file_path) + + return files + except (subprocess.CalledProcessError, FileNotFoundError): + return [] + + +def get_git_commit_hash() 
-> Optional[str]: + try: + result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) + commit_hash = result.stdout.strip() + logger.debug("git commit hash %s", commit_hash) + return commit_hash + except (subprocess.CalledProcessError, FileNotFoundError): + return None + + def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: # this is not a development environment, only use rule contents in # computing the cache identifier @@ -109,6 +163,41 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) + + if is_dev_environment(): + modified_files = get_modified_files() + commit_hash = get_git_commit_hash() + + if modified_files or commit_hash: + hash = hashlib.sha256() + hash.update(capa.version.__version__.encode("utf-8")) + hash.update(b"\x00") + + for file in modified_files: + try: + with Path(file).open("rb") as f: + file_content = f.read() + logger.debug("found modified source py %s", file) + hash.update(file_content) + hash.update(b"\x00") + except FileNotFoundError as e: + logger.error("modified file not found: %s", file) + logger.error("%s", e) + + if commit_hash: + hash.update(commit_hash.encode("ascii")) + hash.update(b"\x00") + + # include the hash of the rule contents + rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") + + logger.debug( + "developer environment detected, ruleset cache will be auto-generated upon each source modification" + ) + return hash.hexdigest() return compute_cache_identifier(rule_contents) From 733b1dea009569e4b8260e49fba425a810c0e4c9 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Fri, 7 Jun 2024 16:27:25 +0200 Subject: [PATCH 07/17] check if git dir exists, and return sorted modified file paths --- CHANGELOG.md | 5 +---- capa/rules/cache.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e5d30ffc..e83efb5ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master (unreleased) ### New Features +- regenerate ruleset cache automatically on source change (only in dev mode) #2133 @s-ff ### Breaking Changes @@ -48,13 +49,9 @@ Special thanks to our repeat and new contributors: - render maec/* fields #843 @s-ff - replace Halo spinner with Rich #2086 @s-ff - optimize rule matching #2080 @williballenthin -<<<<<<< HEAD - add aarch64 as a valid architecture #2144 mehunhoff@google.com @williballenthin - relax dependency version requirements for the capa library #2053 @williballenthin - add scripts dependency group and update documentation #2145 @mr-tz -======= -- regenerate ruleset cache automatically on source change (only in dev mode) #2133 @s-ff ->>>>>>> 699f49d2 (check if git dir exists, and return sorted modified file paths) ### New Rules (25) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index af62e1202..8c8d9866e 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -28,24 +28,31 @@ CacheIdentifier = str -def is_dev_environment() -> bool: +def is_dev_environment() -> Optional[Path]: if getattr(sys, "frozen", False): # running as a PyInstaller executable - return False + return None if "site-packages" in __file__: # running from a site-packages installation - return False + return None - if not 
shutil.which("git"): - # git is found, but might not be always be in PATH - # we should handle this case - return False + capa_root = Path(__file__).resolve().parent.parent + git_dir = capa_root / ".git" - return True + if not git_dir.is_dir(): + # .git directory doesn't exist + return None + + git_exe = shutil.which("git") + if not git_exe: + # git is not found in PATH + return None + return Path(git_exe) -def get_modified_files() -> List[str]: + +def get_modified_files() -> List[Path]: try: # use git status to retrieve tracked modified files result = subprocess.run( @@ -59,13 +66,13 @@ def get_modified_files() -> List[str]: # ' M': the file has staged modifications # 'M ': the file has unstaged modifications # 'MM': the file has both staged and unstaged modifications - files = [] + files: List[Path] = [] for line in result.stdout.splitlines(): if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = line[3:] + file_path = Path(line[3:]) files.append(file_path) - return files + return sorted(files) except (subprocess.CalledProcessError, FileNotFoundError): return [] @@ -175,11 +182,10 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti for file in modified_files: try: - with Path(file).open("rb") as f: - file_content = f.read() - logger.debug("found modified source py %s", file) - hash.update(file_content) - hash.update(b"\x00") + file_content = file.read_bytes() + logger.debug("found modified source file %s", file) + hash.update(file_content) + hash.update(b"\x00") except FileNotFoundError as e: logger.error("modified file not found: %s", file) logger.error("%s", e) @@ -198,6 +204,7 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti "developer environment detected, ruleset cache will be auto-generated upon each source modification" ) return hash.hexdigest() + return compute_cache_identifier(rule_contents) @@ -264,8 +271,7 @@ def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: logger.error("%s", str(e)) return False - content = capa.rules.cache.get_ruleset_content(rules) - id = capa.rules.cache.compute_cache_identifier(content) + id = capa.rules.cache.compute_ruleset_cache_identifier(rules) path = capa.rules.cache.get_cache_path(cache_dir, id) assert path.exists() From d98515059a1cc3bd3a7b6012c8c9ed2e79130fe1 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Mon, 10 Jun 2024 03:42:20 +0200 Subject: [PATCH 08/17] move dev/git logic to capa.rules.utils --- capa/rules/cache.py | 122 ++++++++++++-------------------------------- capa/rules/utils.py | 74 +++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 89 deletions(-) create mode 100644 capa/rules/utils.py diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 8c8d9866e..4d9bbe145 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -9,10 +9,8 @@ import sys import zlib import pickle -import shutil import hashlib import logging -import subprocess from typing import List, Optional from pathlib import Path from dataclasses import dataclass @@ -20,6 +18,7 @@ import capa.rules import capa.helpers import capa.version +import capa.rules.utils logger = logging.getLogger(__name__) @@ -28,65 +27,6 @@ CacheIdentifier = str -def is_dev_environment() -> Optional[Path]: - if getattr(sys, "frozen", False): - # running as a PyInstaller executable - return None - - if "site-packages" in __file__: - # running from a site-packages installation - return None - - capa_root = Path(__file__).resolve().parent.parent - git_dir = 
capa_root / ".git" - - if not git_dir.is_dir(): - # .git directory doesn't exist - return None - - git_exe = shutil.which("git") - if not git_exe: - # git is not found in PATH - return None - - return Path(git_exe) - - -def get_modified_files() -> List[Path]: - try: - # use git status to retrieve tracked modified files - result = subprocess.run( - ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], - capture_output=True, - text=True, - check=True, - ) - - # retrieve .py source files - # ' M': the file has staged modifications - # 'M ': the file has unstaged modifications - # 'MM': the file has both staged and unstaged modifications - files: List[Path] = [] - for line in result.stdout.splitlines(): - if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = Path(line[3:]) - files.append(file_path) - - return sorted(files) - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - -def get_git_commit_hash() -> Optional[str]: - try: - result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) - commit_hash = result.stdout.strip() - logger.debug("git commit hash %s", commit_hash) - return commit_hash - except (subprocess.CalledProcessError, FileNotFoundError): - return None - - def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: # this is not a development environment, only use rule contents in # computing the cache identifier @@ -171,39 +111,43 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - if is_dev_environment(): - modified_files = get_modified_files() - commit_hash = get_git_commit_hash() + try: + if capa.rules.utils.is_dev_environment(): + modified_files = capa.rules.utils.get_modified_files() + commit_hash = capa.rules.utils.get_git_commit_hash() - if modified_files or commit_hash: - hash = hashlib.sha256() - hash.update(capa.version.__version__.encode("utf-8")) - hash.update(b"\x00") + if modified_files or commit_hash: + hash = hashlib.sha256() + hash.update(capa.version.__version__.encode("utf-8")) + hash.update(b"\x00") - for file in modified_files: - try: - file_content = file.read_bytes() - logger.debug("found modified source file %s", file) - hash.update(file_content) + for file in modified_files: + try: + file_content = file.read_bytes() + logger.debug("found modified source file %s", file) + hash.update(file_content) + hash.update(b"\x00") + except FileNotFoundError as e: + logger.error("modified file not found: %s", file) + logger.error("%s", e) + + if commit_hash: + hash.update(commit_hash.encode("ascii")) hash.update(b"\x00") - except FileNotFoundError as e: - logger.error("modified file not found: %s", file) - logger.error("%s", e) - - if commit_hash: - hash.update(commit_hash.encode("ascii")) - hash.update(b"\x00") - # include the hash of the rule contents - rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) - for rule_hash in rule_hashes: - hash.update(rule_hash.encode("ascii")) - hash.update(b"\x00") + # include the hash of the rule contents + rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") - logger.debug( - "developer environment detected, ruleset cache will be auto-generated upon each source modification" - ) - return hash.hexdigest() + 
logger.debug(
+                "developer environment detected, ruleset cache will be auto-generated upon each source modification"
+            )
+            return hash.hexdigest()
+    except Exception as e:
+        logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e))
+        logger.warning("falling back to default cache identifier based on rules contents")

     return compute_cache_identifier(rule_contents)
diff --git a/capa/rules/utils.py b/capa/rules/utils.py
new file mode 100644
index 000000000..2b353b0d8
--- /dev/null
+++ b/capa/rules/utils.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import sys
+import shutil
+import logging
+import subprocess
+from typing import List, Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def is_dev_environment() -> bool:
+    if getattr(sys, "frozen", False):
+        # running as a PyInstaller executable
+        return False
+
+    if "site-packages" in __file__:
+        # running from a site-packages installation
+        return False
+
+    capa_root = Path(__file__).resolve().parent.parent.parent
+    git_dir = capa_root / ".git"
+
+    if not git_dir.is_dir():
+        # .git directory doesn't exist
+        return False
+
+    git_exe = shutil.which("git")
+    if not git_exe:
+        # git is not found in PATH
+        return False
+
+    return True
+
+
+def get_modified_files() -> List[Path]:
+    try:
+        # use git status to retrieve tracked modified files
+        result = subprocess.run(
+            ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+
+        # retrieve .py source files
+        # ' M': the file has staged modifications
+        # 'M ': the file has unstaged modifications
+        # 'MM': the file has both staged and unstaged modifications
+        files: List[Path] = []
+        for line in result.stdout.splitlines():
+            if line.startswith(("M ", "MM", " M")) and line.endswith(".py"):
+                file_path = Path(line[3:])
+                files.append(file_path)
+
+        return sorted(files)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return []
+
+
+def get_git_commit_hash() -> Optional[str]:
+    try:
+        result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
+        commit_hash = result.stdout.strip()
+        logger.debug("git commit hash %s", commit_hash)
+        return commit_hash
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None

From 077d4bb75a17d5bd5d261a0c4093ab5c4c3d48b0 Mon Sep 17 00:00:00 2001
From: Soufiane Fariss
Date: Mon, 10 Jun 2024 13:41:41 +0200
Subject: [PATCH 09/17] refactor: group compute cache id logic into 1 function

This commit groups the cache identifier computation logic into a single
function: compute_cache_identifier. Initially, the cache id computation
was split into two functions, compute_ruleset_cache_identifier and
compute_cache_identifier.
--- capa/rules/cache.py | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 4d9bbe145..1452a6504 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -28,8 +28,6 @@ def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: - # this is not a development environment, only use rule contents in - # computing the cache identifier hash = hashlib.sha256() # note that this changes with each release, @@ -111,44 +109,6 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - try: - if capa.rules.utils.is_dev_environment(): - modified_files = capa.rules.utils.get_modified_files() - commit_hash = capa.rules.utils.get_git_commit_hash() - - if modified_files or commit_hash: - hash = hashlib.sha256() - hash.update(capa.version.__version__.encode("utf-8")) - hash.update(b"\x00") - - for file in modified_files: - try: - file_content = file.read_bytes() - logger.debug("found modified source file %s", file) - hash.update(file_content) - hash.update(b"\x00") - except FileNotFoundError as e: - logger.error("modified file not found: %s", file) - logger.error("%s", e) - - if commit_hash: - hash.update(commit_hash.encode("ascii")) - hash.update(b"\x00") - - # include the hash of the rule contents - rule_hashes = sorted([hashlib.sha256(buf).hexdigest() for buf in rule_contents]) - for rule_hash in rule_hashes: - hash.update(rule_hash.encode("ascii")) - hash.update(b"\x00") - - logger.debug( - "developer environment detected, ruleset cache will be auto-generated upon each source modification" - ) - return hash.hexdigest() - except Exception as e: - logger.warning("failed to compute ruleset cache identifier in developer mode: %s", str(e)) - logger.warning("falling back to default cache identifier based on rules contents") - return compute_cache_identifier(rule_contents) @@ -215,7 +175,8 @@ def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: logger.error("%s", str(e)) return False - id = capa.rules.cache.compute_ruleset_cache_identifier(rules) + content = capa.rules.cache.get_ruleset_content(rules) + id = capa.rules.cache.compute_cache_identifier(content) path = capa.rules.cache.get_cache_path(cache_dir, id) assert path.exists() From 919ce0fb304925df2646417ab980c540b8463335 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 3 Jul 2024 18:37:13 +0000 Subject: [PATCH 10/17] refactor cache check --- capa/helpers.py | 40 ++++++++++++++++++++++++++++++++++++++++ capa/main.py | 12 ++++-------- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/capa/helpers.py b/capa/helpers.py index 9f487d190..46395978b 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import os import sys import gzip import json @@ -14,6 +15,7 @@ import importlib.util from typing import NoReturn from pathlib import Path +from datetime import datetime import tqdm @@ -257,3 +259,41 @@ def is_dev_environment() -> bool: return False return True + + +def should_enable_cache(cache_dir: Path) -> bool: + """ + args: + cache_dir: the cache directory containing cache files + + return: + True if latest cache file is older than the newest relevant rule code, else False + """ + + def ts_to_str(ts): + return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S") + + # retrieve the latest modified cache file + cache_files = list(cache_dir.glob("*.cache")) + if not cache_files: + logger.debug("no rule cache files found") + return False + + latest_cache_file = max(cache_files, key=os.path.getmtime) + cache_timestamp = os.path.getmtime(latest_cache_file) + + # these are the relevant rules code files that could conflict with using an outdated cache + rule_code_timestamp = max( + os.path.getmtime(p) for p in [Path("capa/rules/__init__.py"), Path("capa/rules/cache.py")] + ) + + if rule_code_timestamp > cache_timestamp: + logger.warning( + "not using cache: latest rule code (%s) is newer than the latest rule cache file (%s)", + ts_to_str(rule_code_timestamp), + ts_to_str(cache_timestamp), + ) + return False + + logger.debug("no potentially outdated cache files found, cache can be used") + return True diff --git a/capa/main.py b/capa/main.py index 1c6ff447b..3ded60c79 100644 --- a/capa/main.py +++ b/capa/main.py @@ -16,7 +16,6 @@ import argparse import textwrap import contextlib -from glob import glob from types import TracebackType from typing import Any, Dict, List, Optional from pathlib import Path @@ -574,13 +573,10 @@ def get_rules_from_cli(args) -> RuleSet: cache_dir = capa.rules.cache.get_default_cache_directory() if capa.helpers.is_dev_environment(): - # get newest cache - newest_cache_ts = max([os.path.getmtime(f) for f in glob(f"{cache_dir}/*.cache")]) - for f in glob("rules/*.py"): - print(f) - if newest_cache_ts > os.path.getmtime(f): - logger.warning("found a modified source file {f} that's newer than the most recent cache") - enable_cache: bool = False + # using the rules cache during development may result in unexpected + # errors, this check tries to prevent issues if relevant rules code + # is newer than the newest rules cache + enable_cache = capa.helpers.should_enable_cache(cache_dir) rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir, enable_cache=enable_cache) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: From 681eac29639962ab4df551df7563719560687ecc Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 3 Jul 2024 18:39:48 +0000 Subject: [PATCH 11/17] remove unneded changes/file --- capa/rules/cache.py | 2 -- capa/rules/utils.py | 74 --------------------------------------------- 2 files changed, 76 deletions(-) delete mode 100644 capa/rules/utils.py diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 1452a6504..6f87570ef 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -18,7 +18,6 @@ import capa.rules import capa.helpers import capa.version -import capa.rules.utils logger = logging.getLogger(__name__) @@ -108,7 +107,6 @@ def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: rule_contents = get_ruleset_content(ruleset) - return compute_cache_identifier(rule_contents) diff --git a/capa/rules/utils.py 
b/capa/rules/utils.py deleted file mode 100644 index 2b353b0d8..000000000 --- a/capa/rules/utils.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. -import sys -import shutil -import logging -import subprocess -from typing import List, Optional -from pathlib import Path - -logger = logging.getLogger(__name__) - - -def is_dev_environment() -> bool: - if getattr(sys, "frozen", False): - # running as a PyInstaller executable - return False - - if "site-packages" in __file__: - # running from a site-packages installation - return False - - capa_root = Path(__file__).resolve().parent.parent.parent - git_dir = capa_root / ".git" - - if not git_dir.is_dir(): - # .git directory doesn't exist - return False - - git_exe = shutil.which("git") - if not git_exe: - # git is not found in PATH - return False - - return True - - -def get_modified_files() -> List[Path]: - try: - # use git status to retrieve tracked modified files - result = subprocess.run( - ["git", "--no-pager", "status", "--porcelain", "--untracked-files=no"], - capture_output=True, - text=True, - check=True, - ) - - # retrieve .py source files - # ' M': the file has staged modifications - # 'M ': the file has unstaged modifications - # 'MM': the file has both staged and unstaged modifications - files: List[Path] = [] - for line in result.stdout.splitlines(): - if line.startswith(("M ", "MM", " M")) and line.endswith(".py"): - file_path = Path(line[3:]) - files.append(file_path) - - return sorted(files) - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - -def get_git_commit_hash() -> Optional[str]: - try: - result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) - commit_hash = result.stdout.strip() - logger.debug("git commit hash %s", commit_hash) - return commit_hash - except (subprocess.CalledProcessError, FileNotFoundError): - return None From b741ab0957d8657116b15e5e8ae5d9adeb2258d1 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 4 Jul 2024 09:26:36 +0000 Subject: [PATCH 12/17] refactor --- capa/helpers.py | 26 +++++++++++++++----------- capa/main.py | 6 ++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/capa/helpers.py b/capa/helpers.py index 46395978b..58575198f 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -248,7 +248,6 @@ def is_dev_environment() -> bool: if "site-packages" in __file__: # running from a site-packages installation - # we may need to double check this return False capa_root = Path(__file__).resolve().parent.parent @@ -261,18 +260,17 @@ def is_dev_environment() -> bool: return True -def should_enable_cache(cache_dir: Path) -> bool: +def is_cache_newer_than_rule_code(cache_dir: Path) -> bool: """ + basic check to prevent issues if the rules cache is older than relevant rules code + args: cache_dir: the cache directory containing cache files - return: - True if latest cache file is older than the newest relevant rule code, else False + returns: + True if latest cache file 
is newer than relevant rule cache code """ - def ts_to_str(ts): - return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S") - # retrieve the latest modified cache file cache_files = list(cache_dir.glob("*.cache")) if not cache_files: @@ -283,16 +281,22 @@ def ts_to_str(ts): cache_timestamp = os.path.getmtime(latest_cache_file) # these are the relevant rules code files that could conflict with using an outdated cache - rule_code_timestamp = max( - os.path.getmtime(p) for p in [Path("capa/rules/__init__.py"), Path("capa/rules/cache.py")] - ) + latest_rule_code_file = max([Path("capa/rules/__init__.py"), Path("capa/rules/cache.py")], key=os.path.getmtime) + rule_code_timestamp = os.path.getmtime(latest_rule_code_file) if rule_code_timestamp > cache_timestamp: + + def ts_to_str(ts): + return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S") + logger.warning( - "not using cache: latest rule code (%s) is newer than the latest rule cache file (%s)", + "not using cache: latest rule code file %s (%s) is newer than the latest rule cache file %s (%s)", + latest_rule_code_file, ts_to_str(rule_code_timestamp), + latest_cache_file, ts_to_str(cache_timestamp), ) + logger.debug("delete the cache file manually to use rule caching again") return False logger.debug("no potentially outdated cache files found, cache can be used") diff --git a/capa/main.py b/capa/main.py index 3ded60c79..725dd926b 100644 --- a/capa/main.py +++ b/capa/main.py @@ -573,10 +573,8 @@ def get_rules_from_cli(args) -> RuleSet: cache_dir = capa.rules.cache.get_default_cache_directory() if capa.helpers.is_dev_environment(): - # using the rules cache during development may result in unexpected - # errors, this check tries to prevent issues if relevant rules code - # is newer than the newest rules cache - enable_cache = capa.helpers.should_enable_cache(cache_dir) + # using the rules cache during development may result in unexpected errors, see #1898 + enable_cache = capa.helpers.is_cache_newer_than_rule_code(cache_dir) rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir, enable_cache=enable_cache) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: From 03f26afc442dde29b0d8662b98a9fe50a5e67f9e Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 4 Jul 2024 10:30:55 +0000 Subject: [PATCH 13/17] add tests --- tests/test_helpers.py | 6 ++++++ tests/test_rule_cache.py | 26 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 3f3fc9325..1f291f7ee 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -8,6 +8,7 @@ import codecs +import capa.helpers from capa.features.extractors import helpers @@ -64,3 +65,8 @@ def test_generate_symbols(): symbols = list(helpers.generate_symbols("ws2_32", "#1", include_dll=False)) assert len(symbols) == 1 assert "ws2_32.#1" in symbols + + +def test_is_dev_environment(): + # testing environment should be a dev environment + assert capa.helpers.is_dev_environment() is True diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py index 0206e936d..e92aa81f8 100644 --- a/tests/test_rule_cache.py +++ b/tests/test_rule_cache.py @@ -6,10 +6,13 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
 
+import os
 import textwrap
 import contextlib
+from pathlib import Path
 
 import capa.rules
+import capa.helpers
 import capa.rules.cache
 
 R1 = capa.rules.Rule.from_yaml(
@@ -113,3 +116,26 @@ def test_ruleset_cache_invalid():
     assert capa.rules.cache.load_cached_ruleset(cache_dir, content) is None
     # the invalid cache should be deleted
     assert not path.exists()
+
+
+def test_rule_cache_dev_environment():
+    # generate rules cache
+    rs = capa.rules.RuleSet([R2])
+    content = capa.rules.cache.get_ruleset_content(rs)
+    id = capa.rules.cache.compute_cache_identifier(content)
+    cache_dir = capa.rules.cache.get_default_cache_directory()
+    cache_path = capa.rules.cache.get_cache_path(cache_dir, id)
+    with contextlib.suppress(OSError):
+        cache_path.unlink()
+    capa.rules.cache.cache_ruleset(cache_dir, rs)
+    assert cache_path.exists()
+
+    assert capa.helpers.is_cache_newer_than_rule_code(cache_dir) is True
+
+    capa_root = Path(__file__).resolve().parent.parent
+    cachepy = capa_root / "capa" / "rules" / "cache.py"  # alternative: capa_root / "capa" / "rules" / "__init__.py"
+
+    # set last modified time to older than code file
+    os.utime(cache_path, (cache_path.stat().st_atime, cachepy.stat().st_mtime - 100))
+    assert capa.helpers.is_dev_environment() is True
+    assert capa.helpers.is_cache_newer_than_rule_code(cache_dir) is False

From 5b2c5ea3ee1713dbc92cb9e36dfaa222e0d3c2ca Mon Sep 17 00:00:00 2001
From: mr-tz
Date: Thu, 4 Jul 2024 11:58:43 +0000
Subject: [PATCH 14/17] debug test

---
 tests/test_rule_cache.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py
index e92aa81f8..835761d08 100644
--- a/tests/test_rule_cache.py
+++ b/tests/test_rule_cache.py
@@ -135,7 +135,18 @@ def test_rule_cache_dev_environment():
     capa_root = Path(__file__).resolve().parent.parent
     cachepy = capa_root / "capa" / "rules" / "cache.py"  # alternative: capa_root / "capa" / "rules" / "__init__.py"
 
-    # set last modified time to older than code file
-    os.utime(cache_path, (cache_path.stat().st_atime, cachepy.stat().st_mtime - 100))
+    # set cache's last modified time prior to code file's modified time
+    os.utime(cache_path, (cache_path.stat().st_atime, cachepy.stat().st_mtime - 6000000))
+
+    # debug
+    def ts_to_str(ts):
+        from datetime import datetime
+
+        return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
+
+    for g in ((capa_root / "capa" / "rules").glob("*.py"), cache_dir.glob("*.cache")):
+        for p in g:
+            print(p, "\t", ts_to_str(p.stat().st_mtime))  # noqa: T201
+
     assert capa.helpers.is_dev_environment() is True
     assert capa.helpers.is_cache_newer_than_rule_code(cache_dir) is False

From 7c060a26aa3042662a4485572d06b564a086918c Mon Sep 17 00:00:00 2001
From: mr-tz
Date: Thu, 4 Jul 2024 12:50:57 +0000
Subject: [PATCH 15/17] fix test logic

---
 tests/test_rule_cache.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py
index 835761d08..b694b98c4 100644
--- a/tests/test_rule_cache.py
+++ b/tests/test_rule_cache.py
@@ -125,8 +125,11 @@ def test_rule_cache_dev_environment():
     id = capa.rules.cache.compute_cache_identifier(content)
     cache_dir = capa.rules.cache.get_default_cache_directory()
     cache_path = capa.rules.cache.get_cache_path(cache_dir, id)
-    with contextlib.suppress(OSError):
-        cache_path.unlink()
+
+    # clear existing cache files
+    for f in cache_dir.glob("*.cache"):
+        f.unlink()
+
     capa.rules.cache.cache_ruleset(cache_dir, rs)
     assert cache_path.exists()
 
@@ -136,7 +139,7 @@ def test_rule_cache_dev_environment():
     cachepy = capa_root / "capa" / "rules" / "cache.py"  # alternative: capa_root / "capa" / "rules" / "__init__.py"
 
     # set cache's last modified time prior to code file's modified time
-    os.utime(cache_path, (cache_path.stat().st_atime, cachepy.stat().st_mtime - 6000000))
+    os.utime(cache_path, (cache_path.stat().st_atime, cachepy.stat().st_mtime - 600000))
 
     # debug
     def ts_to_str(ts):
         from datetime import datetime
 
         return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")

From 6e000c67eca4f6f51b3c00c6cff19751ec261e45 Mon Sep 17 00:00:00 2001
From: Moritz
Date: Mon, 26 Aug 2024 13:24:30 +0200
Subject: [PATCH 16/17] Apply suggestions from code review

Co-authored-by: Willi Ballenthin
---
 capa/helpers.py        | 3 +--
 capa/main.py           | 2 ++
 capa/rules/__init__.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index 1fc072431..497a99e41 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -343,13 +343,12 @@ def ts_to_str(ts):
             return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
 
         logger.warning(
-            "not using cache: latest rule code file %s (%s) is newer than the latest rule cache file %s (%s)",
+            "latest rule code file %s (%s) is newer than the latest rule cache file %s (%s)",
             latest_rule_code_file,
             ts_to_str(rule_code_timestamp),
             latest_cache_file,
             ts_to_str(cache_timestamp),
         )
-        logger.debug("delete the cache file manually to use rule caching again")
         return False
 
     logger.debug("no potentially outdated cache files found, cache can be used")
diff --git a/capa/main.py b/capa/main.py
index 390a0cc26..f42ae8e1a 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -622,6 +622,8 @@ def get_rules_from_cli(args) -> RuleSet:
         if capa.helpers.is_dev_environment():
             # using the rules cache during development may result in unexpected errors, see #1898
             enable_cache = capa.helpers.is_cache_newer_than_rule_code(cache_dir)
+            if not enable_cache:
+                logger.debug("not using cache. delete the cache file manually to use rule caching again")
 
         rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir, enable_cache=enable_cache)
     except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index d5dfcf886..1d7f13ed1 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -2137,7 +2137,7 @@ def get_rules(
       rule_paths: list of paths to rules files or directories containing rules files
      cache_dir: directory to use for caching rules, or will use the default detected cache directory if None
      on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation
-      enable_cache: enable loading of a cached ruleset
+      enable_cache: enable loading of a cached ruleset (default: True)
     """
     if cache_dir is None:
         cache_dir = capa.rules.cache.get_default_cache_directory()

From 8b87a0fbf068549378d5f85960f1e6093e32e883 Mon Sep 17 00:00:00 2001
From: Moritz
Date: Mon, 26 Aug 2024 13:27:35 +0200
Subject: [PATCH 17/17] Apply suggestions from code review

---
 capa/helpers.py | 1 -
 capa/main.py    | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index 497a99e41..ef8e94c62 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -351,5 +351,4 @@ def ts_to_str(ts):
         )
         return False
 
-    logger.debug("no potentially outdated cache files found, cache can be used")
     return True
diff --git a/capa/main.py b/capa/main.py
index f42ae8e1a..f9e0ce249 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -624,6 +624,8 @@ def get_rules_from_cli(args) -> RuleSet:
             enable_cache = capa.helpers.is_cache_newer_than_rule_code(cache_dir)
             if not enable_cache:
                 logger.debug("not using cache. delete the cache file manually to use rule caching again")
+            else:
+                logger.debug("cache can be used, no potentially outdated cache files found")
 
         rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir, enable_cache=enable_cache)
     except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
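
Taken together, patches 12 through 17 reduce the developer-mode decision to a single timestamp comparison: if either relevant rules code file (capa/rules/__init__.py or capa/rules/cache.py) was modified after the newest *.cache file was written, the cached ruleset is not used for that run. The standalone sketch below distills that idea; cache_is_usable and its parameters are illustrative names invented for this note, not part of capa's API (the helpers actually introduced above are is_dev_environment() and is_cache_newer_than_rule_code()).

from pathlib import Path
from typing import Iterable


def cache_is_usable(cache_dir: Path, code_files: Iterable[Path]) -> bool:
    # a cache directory with no *.cache files has nothing stale in it,
    # so loading proceeds and a fresh cache can be written afterwards
    cache_files = list(cache_dir.glob("*.cache"))
    if not cache_files:
        return True

    # compare the newest cache artifact against the newest relevant code file
    newest_cache = max(p.stat().st_mtime for p in cache_files)
    newest_code = max((p.stat().st_mtime for p in code_files), default=0.0)
    return newest_cache >= newest_code

The tests added in patches 13 through 15 exercise the same property from the other direction: they backdate the cache file with os.utime() so that the rules code appears newer, then assert that the check refuses the cache.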