rules: match: optimize rule matching by better indexing rule by features

Implement the "tighten rule pre-selection" algorithm described here: #2063 (comment) In summary: > Rather than indexing all features from all rules, > we should pick and index the minimal set (ideally, one) of > features from each rule that must be present for the rule to match. > When we have multiple candidates, pick the feature that is > probably most uncommon and therefore "selective". This seems to work pretty well. Total evaluations when running against mimikatz drop from 19M to 1.1M (wow!) and capa seems to match around 3x more functions per second (wow wow). When doing large scale runs, capa is about 25% faster when using the vivisect backend (analysis heavy) or 3x faster when using the upcoming BinExport2 backend (minimal analysis).
mandiant · Jun 7, 2024 · b068890 · b068890
1 parent d10d282
commit b068890
Show file tree

Hide file tree

Showing 5 changed files with 737 additions and 226 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@
 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz
 - render maec/* fields #843 @s-ff
 - replace Halo spinner with Rich #2086 @s-ff
+- optimize rule matching #2080 @williballenthin
 
 ### Breaking Changes
 

diff --git a/capa/engine.py b/capa/engine.py
@@ -270,6 +270,14 @@ def evaluate(self, features: FeatureSet, short_circuit=True):
 MatchResults = Mapping[str, List[Tuple[Address, Result]]]
 
 
+def get_rule_namespaces(rule: "capa.rules.Rule") -> Iterator[str]:
+    namespace = rule.meta.get("namespace")
+    if namespace:
+        while namespace:
+            yield namespace
+            namespace, _, _ = namespace.rpartition("/")
+
+
 def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]):
     """
     record into the given featureset that the given rule matched at the given locations.
@@ -280,11 +288,8 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
     updates `features` in-place. doesn't modify the remaining arguments.
     """
     features[capa.features.common.MatchedRule(rule.name)].update(locations)
-    namespace = rule.meta.get("namespace")
-    if namespace:
-        while namespace:
-            features[capa.features.common.MatchedRule(namespace)].update(locations)
-            namespace, _, _ = namespace.rpartition("/")
+    for namespace in get_rule_namespaces(rule):
+        features[capa.features.common.MatchedRule(namespace)].update(locations)
 
 
 def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]:

diff --git a/capa/features/common.py b/capa/features/common.py
@@ -385,10 +385,12 @@ def __init__(self, value: bytes, description=None):
         self.value = value
 
     def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
+        assert isinstance(self.value, bytes)
+
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.bytes"] += 1
+        capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1
 
-        assert isinstance(self.value, bytes)
         for feature, locations in features.items():
             if not isinstance(feature, (Bytes,)):
                 continue
@@ -486,6 +488,6 @@ def __init__(self, value: str, description=None):
 def is_global_feature(feature):
     """
     is this a feature that is extracted at every scope?
-    today, these are OS and arch features.
+    today, these are OS, arch, and format features.
     """
-    return isinstance(feature, (OS, Arch))
+    return isinstance(feature, (OS, Arch, Format))