Skip to content

Commit

Permalink
chore(iast): redaction algorithms refactor II [backport 2.9] (#9509)
Browse files Browse the repository at this point in the history
Backport 8d67869 from #9163 to 2.9.

# Summary
Refactor of the IAST redaction system. The old algorithms had several
problems, which this refactor addresses.

## Description
This PR continues the work started in #9126:
- Migrate SQL Injection to this new algorithm
- Remove deprecated code

## Checklist

- [x] Change(s) are motivated and described in the PR description
- [x] Testing strategy is described if automated tests are not included
in the PR
- [x] Risks are described (performance impact, potential for breakage,
maintainability)
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] [Library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
are followed or label `changelog/no-changelog` is set
- [x] Documentation is included (in-code, generated user docs, [public
corp docs](https://github.com/DataDog/documentation/))
- [x] Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [x] If this PR changes the public interface, I've notified
`@DataDog/apm-tees`.
- [x] If change touches code that signs or publishes builds or packages,
or handles credentials of any kind, I've requested a review from
`@DataDog/security-design-and-guidance`.

## Reviewer Checklist

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications
of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: Alberto Vara <alberto.vara@datadoghq.com>
  • Loading branch information
github-actions[bot] and avara1986 authored Jun 12, 2024
1 parent 547bdd4 commit a568c63
Show file tree
Hide file tree
Showing 28 changed files with 476 additions and 807 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config

from .._utils import _get_source_index
from ..constants import VULN_CMDI
from ..constants import VULN_HEADER_INJECTION
from ..constants import VULN_SQL_INJECTION
from ..constants import VULN_SSRF
from .command_injection_sensitive_analyzer import command_injection_sensitive_analyzer
from .header_injection_sensitive_analyzer import header_injection_sensitive_analyzer
from .sql_sensitive_analyzer import sql_sensitive_analyzer
from .url_sensitive_analyzer import url_sensitive_analyzer


Expand All @@ -27,7 +30,7 @@ def __init__(self):

self._sensitive_analyzers = {
VULN_CMDI: command_injection_sensitive_analyzer,
# SQL_INJECTION: sql_sensitive_analyzer,
VULN_SQL_INJECTION: sql_sensitive_analyzer,
VULN_SSRF: url_sensitive_analyzer,
VULN_HEADER_INJECTION: header_injection_sensitive_analyzer,
}
Expand Down Expand Up @@ -178,7 +181,7 @@ def to_redacted_json(self, evidence_value, sensitive, tainted_ranges, sources):
if next_tainted and next_tainted["start"] == i:
self.write_value_part(value_parts, evidence_value[start:i], source_index)

source_index = next_tainted_index
source_index = _get_source_index(sources, next_tainted["source"])

while next_sensitive and self._contains(next_tainted, next_sensitive):
redaction_start = next_sensitive["start"] - next_tainted["start"]
Expand Down
70 changes: 70 additions & 0 deletions ddtrace/appsec/_iast/_evidence_redaction/sql_sensitive_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import re

from ddtrace.appsec._iast.constants import DBAPI_MARIADB
from ddtrace.appsec._iast.constants import DBAPI_MYSQL
from ddtrace.appsec._iast.constants import DBAPI_PSYCOPG
from ddtrace.appsec._iast.constants import DBAPI_SQLITE
from ddtrace.internal.logger import get_logger


log = get_logger(__name__)


# Regexes for the literal/comment tokens that can carry sensitive data inside a
# SQL statement.  All of these are raw strings, so a regex escape such as \d is
# written with a single backslash.
STRING_LITERAL = r"'(?:''|[^'])*'"  # standard SQL string; '' is an escaped quote
POSTGRESQL_ESCAPED_LITERAL = r"\$([^$]*)\$.*?\$\1\$"  # dollar-quoted string, e.g. $tag$...$tag$
# MySQL-family strings allow a backslash-escaped quote in both quoting styles.
# (fixed: `\\\\"` / `\\\\'` matched two literal backslashes instead of an escaped quote)
MYSQL_STRING_LITERAL = r'"(?:\\"|[^"])*"|\'(?:\\\'|[^\'])*\''
LINE_COMMENT = r"--.*$"
BLOCK_COMMENT = r"/\*[\s\S]*?\*/"
# Optional exponent suffix of a numeric literal, e.g. 1E10, 2e-3f.
# (fixed: `\\d` in a raw string matched a literal backslash followed by "d", not digits)
EXPONENT = r"(?:E[-+]?\d+[fd]?)?"
INTEGER_NUMBER = r"(?<!\w)\d+"
DECIMAL_NUMBER = r"\d*\.\d+"
HEX_NUMBER = r"x'[0-9a-f]+'|0x[0-9a-f]+"
BIN_NUMBER = r"b'[0-9a-f]+'|0b[0-9a-f]+"
NUMERIC_LITERAL = (
    r"[-+]?(?:" + "|".join([HEX_NUMBER, BIN_NUMBER, DECIMAL_NUMBER + EXPONENT, INTEGER_NUMBER + EXPONENT]) + r")"
)

# Token-matching regex shared by the MySQL-style dialects: SQLite and MariaDB
# follow MySQL quoting rules, so all three map to the same compiled pattern.
_MYSQL_FAMILY_PATTERN = re.compile(
    f"({NUMERIC_LITERAL})|({MYSQL_STRING_LITERAL})|({LINE_COMMENT})|({BLOCK_COMMENT})", re.IGNORECASE | re.MULTILINE
)

# Compiled token regex per DB-API dialect name.
patterns = {
    DBAPI_MYSQL: _MYSQL_FAMILY_PATTERN,
    DBAPI_PSYCOPG: re.compile(
        f"({NUMERIC_LITERAL})|({POSTGRESQL_ESCAPED_LITERAL})|({STRING_LITERAL})|({LINE_COMMENT})|({BLOCK_COMMENT})",
        re.IGNORECASE | re.MULTILINE,
    ),
}
patterns[DBAPI_SQLITE] = _MYSQL_FAMILY_PATTERN
patterns[DBAPI_MARIADB] = _MYSQL_FAMILY_PATTERN


def sql_sensitive_analyzer(evidence, name_pattern, value_pattern):
    """Locate sensitive tokens (literals and comments) in a SQL query.

    Scans ``evidence.value`` with the regex for ``evidence.dialect`` (falling
    back to the MySQL pattern for unknown dialects) and returns a list of
    ``{"start": int, "end": int}`` spans.  Each span is trimmed so that the
    quoting/comment delimiters themselves stay visible and only the payload
    between them gets redacted.

    ``name_pattern`` and ``value_pattern`` are not used here; presumably they
    are kept for signature parity with the other sensitive analyzers — confirm.
    """
    query = evidence.value
    dialect_pattern = patterns.get(evidence.dialect, patterns[DBAPI_MYSQL])
    tokens = []

    found = dialect_pattern.search(query)
    while found is not None:
        begin = found.start()
        finish = found.end()
        first = query[begin]
        if first in ("'", '"'):
            # Quoted string: keep the surrounding quotes.
            begin += 1
            finish -= 1
        elif finish > begin + 1:
            second = query[begin + 1]
            if first == "/" and second == "*":
                # Block comment: keep the /* and */ delimiters.
                begin += 2
                finish -= 2
            elif first == "-" and second == "-":
                # Line comment: keep the leading --.
                begin += 2
            elif second == "'" and first.lower() == "q":
                # Oracle-style q'(...)' literal: keep q'( and )'.
                begin += 3
                finish -= 2
            elif first == "$":
                # PostgreSQL dollar-quoted literal: strip the $tag$ delimiters.
                delimiter_len = found.group(0).find("$", 1) + 1
                if delimiter_len > 1:
                    begin += delimiter_len
                    finish -= delimiter_len
        tokens.append({"start": begin, "end": finish})
        found = dialect_pattern.search(query, found.end())
    return tokens
21 changes: 0 additions & 21 deletions ddtrace/appsec/_iast/_taint_dict.py

This file was deleted.

1 change: 0 additions & 1 deletion ddtrace/appsec/_iast/_taint_tracking/aspects.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,6 @@ def format_value_aspect(
if options == 115:
new_text = str_aspect(str, 0, element)
elif options == 114:
# TODO: use our repr once we have implemented it
new_text = repr_aspect(repr, 0, element)
elif options == 97:
new_text = ascii(element)
Expand Down
4 changes: 2 additions & 2 deletions ddtrace/appsec/_iast/_taint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from typing import Optional
from typing import Union

from ddtrace.appsec._iast.constants import DBAPI_INTEGRATIONS
from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config


DBAPI_INTEGRATIONS = ("sqlite", "psycopg", "mysql", "mariadb")
DBAPI_PREFIXES = ("django-",)

log = get_logger(__name__)
Expand Down Expand Up @@ -529,7 +529,7 @@ def supported_dbapi_integration(integration_name):
return integration_name in DBAPI_INTEGRATIONS or integration_name.startswith(DBAPI_PREFIXES)


def check_tainted_args(args, kwargs, tracer, integration_name, method):
def check_tainted_dbapi_args(args, kwargs, tracer, integration_name, method):
if supported_dbapi_integration(integration_name) and method.__name__ == "execute":
from ._taint_tracking import is_pyobject_tainted

Expand Down
90 changes: 8 additions & 82 deletions ddtrace/appsec/_iast/_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
import re
import string
import sys
from typing import TYPE_CHECKING # noqa:F401
from typing import List

from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config


if TYPE_CHECKING:
from typing import Any # noqa:F401
from typing import List # noqa:F401
from typing import Set # noqa:F401
from typing import Tuple # noqa:F401


def _is_python_version_supported():  # type: () -> bool
    """Return True when the running interpreter is in IAST's supported range (3.6 – 3.12)."""
    lower_bound = (3, 6, 0)
    upper_bound = (3, 13, 0)  # exclusive: anything below 3.13
    return lower_bound <= sys.version_info < upper_bound
Expand All @@ -31,78 +22,13 @@ def _is_iast_enabled():
return True


# Used to cache the compiled regular expression
# (compiled lazily, on first use, from the asm_config redaction patterns).
_SOURCE_NAME_SCRUB = None
_SOURCE_VALUE_SCRUB = None
_SOURCE_NUMERAL_SCRUB = None


def _has_to_scrub(s):  # type: (str) -> bool
    """Return True when *s* matches any configured redaction pattern
    (source name, value or numeral) and must therefore be scrubbed.
    """
    # TODO: This function is deprecated.
    # Redaction migrated to `ddtrace.appsec._iast._evidence_redaction._sensitive_handler` but we need to migrate
    # all vulnerabilities to use it first.
    global _SOURCE_NAME_SCRUB
    global _SOURCE_VALUE_SCRUB
    global _SOURCE_NUMERAL_SCRUB

    # Compile all three patterns once, on the first call.
    if _SOURCE_NAME_SCRUB is None:
        _SOURCE_NAME_SCRUB = re.compile(asm_config._iast_redaction_name_pattern)
        _SOURCE_VALUE_SCRUB = re.compile(asm_config._iast_redaction_value_pattern)
        _SOURCE_NUMERAL_SCRUB = re.compile(asm_config._iast_redaction_numeral_pattern)

    # NOTE: `match` only anchors at the beginning of the string.
    return (
        _SOURCE_NAME_SCRUB.match(s) is not None
        or _SOURCE_VALUE_SCRUB.match(s) is not None
        or _SOURCE_NUMERAL_SCRUB.match(s) is not None
    )


def _is_numeric(s):
    """Return True when *s* matches the configured numeral redaction pattern."""
    # TODO: This function is deprecated.
    # Redaction migrated to `ddtrace.appsec._iast._evidence_redaction._sensitive_handler` but we need to migrate
    # all vulnerabilities to use it first.
    global _SOURCE_NUMERAL_SCRUB

    # Compile lazily; the pattern may already have been compiled by _has_to_scrub.
    if _SOURCE_NUMERAL_SCRUB is None:
        _SOURCE_NUMERAL_SCRUB = re.compile(asm_config._iast_redaction_numeral_pattern)

    return _SOURCE_NUMERAL_SCRUB.match(s) is not None


# Alphabet used when scrubbing ranged values: position i maps to the ascii
# letter at i modulo the alphabet length, giving a deterministic output.
_REPLACEMENTS = string.ascii_letters
_LEN_REPLACEMENTS = len(_REPLACEMENTS)


def _scrub(s, has_range=False):  # type: (str, bool) -> str
    """Return a masked string with the same length as *s*.

    Plain values are fully masked with asterisks; ranged values get a
    deterministic letter sequence instead.
    """
    # TODO: deprecated — redaction moved to
    # `ddtrace.appsec._iast._evidence_redaction._sensitive_handler`; kept until
    # every vulnerability type is migrated.
    if not has_range:
        return "*" * len(s)
    return "".join(_REPLACEMENTS[position % _LEN_REPLACEMENTS] for position in range(len(s)))


def _is_evidence_value_parts(value):  # type: (Any) -> bool
    """Return True when *value* is a collection of evidence value parts (a set or a list)."""
    # TODO: deprecated — superseded by the `_evidence_redaction` handlers.
    return isinstance(value, set) or isinstance(value, list)


def _scrub_get_tokens_positions(text, tokens):
    # type: (str, Set[str]) -> List[Tuple[int, int]]
    """Return sorted ``(start, end)`` spans for the first occurrence of each
    token found in *text*; tokens that do not occur are skipped.
    """
    # TODO: deprecated — superseded by the `_evidence_redaction` handlers.
    spans = []
    for token in tokens:
        found_at = text.find(token)
        if found_at >= 0:
            spans.append((found_at, found_at + len(token)))
    return sorted(spans)
def _get_source_index(sources: List, source) -> int:
    """Return the index of *source* in *sources*, comparing by hash, or -1 if absent."""
    wanted = hash(source)
    for index, candidate in enumerate(sources):
        if hash(candidate) == wanted:
            return index
    return -1


def _get_patched_code(module_path, module_name): # type: (str, str) -> str
Expand Down
14 changes: 5 additions & 9 deletions ddtrace/appsec/_iast/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,6 @@

VULNERABILITY_TOKEN_TYPE = Dict[int, Dict[str, Any]]

EVIDENCE_ALGORITHM_TYPE = "ALGORITHM"
EVIDENCE_SQL_INJECTION = "SQL_INJECTION"
EVIDENCE_PATH_TRAVERSAL = "PATH_TRAVERSAL"
EVIDENCE_WEAK_RANDOMNESS = "WEAK_RANDOMNESS"
EVIDENCE_COOKIE = "COOKIE"
EVIDENCE_CMDI = "COMMAND"
EVIDENCE_HEADER_INJECTION = "HEADER_INJECTION"
EVIDENCE_SSRF = "SSRF"

HEADER_NAME_VALUE_SEPARATOR = ": "

MD5_DEF = "md5"
Expand Down Expand Up @@ -91,3 +82,8 @@
"tarfile": {"open"},
"zipfile": {"ZipFile"},
}
# DB-API integration names whose queries IAST analyzes for SQL injection.
DBAPI_SQLITE = "sqlite"
DBAPI_PSYCOPG = "psycopg"
DBAPI_MYSQL = "mysql"
DBAPI_MARIADB = "mariadb"
# All supported DB-API dialects; used to gate tainted-argument checks.
DBAPI_INTEGRATIONS = (DBAPI_SQLITE, DBAPI_PSYCOPG, DBAPI_MYSQL, DBAPI_MARIADB)
20 changes: 10 additions & 10 deletions ddtrace/appsec/_iast/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import attr

from ddtrace.appsec._iast._evidence_redaction import sensitive_handler
from ddtrace.appsec._iast._utils import _get_source_index
from ddtrace.appsec._iast.constants import VULN_INSECURE_HASHING_TYPE
from ddtrace.appsec._iast.constants import VULN_WEAK_CIPHER_TYPE
from ddtrace.appsec._iast.constants import VULN_WEAK_RANDOMNESS
Expand All @@ -26,8 +27,12 @@ def _only_if_true(value):
return value if value else None


ATTRS_TO_SKIP = frozenset({"_ranges", "_evidences_with_no_sources", "dialect"})


@attr.s(eq=False, hash=False)
class Evidence(object):
dialect = attr.ib(type=str, default=None) # type: Optional[str]
value = attr.ib(type=str, default=None) # type: Optional[str]
_ranges = attr.ib(type=dict, default={}) # type: Any
valueParts = attr.ib(type=list, default=None) # type: Any
Expand Down Expand Up @@ -143,14 +148,6 @@ def add_ranges_to_evidence_and_extract_sources(self, vuln):
if source not in self.sources:
self.sources = self.sources + [source]

def _get_source_index(self, sources: List[Source], source: Source) -> int:
i = 0
for source_ in sources:
if hash(source_) == hash(source):
return i
i += 1
return -1

def build_and_scrub_value_parts(self) -> Dict[str, Any]:
"""
Builds and scrubs value parts of vulnerabilities.
Expand Down Expand Up @@ -197,7 +194,7 @@ def get_unredacted_value_parts(self, evidence_value: str, ranges: List[dict], so
if from_index < range_["start"]:
value_parts.append({"value": evidence_value[from_index : range_["start"]]})

source_index = self._get_source_index(sources, range_["source"])
source_index = _get_source_index(sources, range_["source"])

value_parts.append(
{"value": evidence_value[range_["start"] : range_["end"]], "source": source_index} # type: ignore[dict-item]
Expand All @@ -217,7 +214,10 @@ def _to_dict(self) -> Dict[str, Any]:
Returns:
- Dict[str, Any]: Dictionary representation of the IAST span reporter.
"""
return attr.asdict(self, filter=lambda attr, x: x is not None and attr.name != "_ranges")
return attr.asdict(
self,
filter=lambda attr, x: x is not None and attr.name not in ATTRS_TO_SKIP,
)

def _to_str(self) -> str:
"""
Expand Down
Loading

0 comments on commit a568c63

Please sign in to comment.