Skip to content

Commit

Permalink
chore(iast): redaction algorithms refactor II [backport 2.9] (#9509)
Browse files Browse the repository at this point in the history
Backport 8d67869 from #9163 to 2.9.

# Summary
Refactor of the IAST redaction system. The old algorithms had several
problems, which this refactor addresses.

## Description
This PR continues the work started in #9126:
- Migrate SQL Injection to this new algorithm
- Remove deprecated code

## Checklist

- [x] Change(s) are motivated and described in the PR description
- [x] Testing strategy is described if automated tests are not included
in the PR
- [x] Risks are described (performance impact, potential for breakage,
maintainability)
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] [Library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
are followed or label `changelog/no-changelog` is set
- [x] Documentation is included (in-code, generated user docs, [public
corp docs](https://github.com/DataDog/documentation/))
- [x] Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [x] If this PR changes the public interface, I've notified
`@DataDog/apm-tees`.
- [x] If change touches code that signs or publishes builds or packages,
or handles credentials of any kind, I've requested a review from
`@DataDog/security-design-and-guidance`.

## Reviewer Checklist

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications
of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: Alberto Vara <alberto.vara@datadoghq.com>
  • Loading branch information
github-actions[bot] and avara1986 authored Jun 12, 2024
1 parent 547bdd4 commit a568c63
Show file tree
Hide file tree
Showing 28 changed files with 476 additions and 807 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config

from .._utils import _get_source_index
from ..constants import VULN_CMDI
from ..constants import VULN_HEADER_INJECTION
from ..constants import VULN_SQL_INJECTION
from ..constants import VULN_SSRF
from .command_injection_sensitive_analyzer import command_injection_sensitive_analyzer
from .header_injection_sensitive_analyzer import header_injection_sensitive_analyzer
from .sql_sensitive_analyzer import sql_sensitive_analyzer
from .url_sensitive_analyzer import url_sensitive_analyzer


Expand All @@ -27,7 +30,7 @@ def __init__(self):

self._sensitive_analyzers = {
VULN_CMDI: command_injection_sensitive_analyzer,
# SQL_INJECTION: sql_sensitive_analyzer,
VULN_SQL_INJECTION: sql_sensitive_analyzer,
VULN_SSRF: url_sensitive_analyzer,
VULN_HEADER_INJECTION: header_injection_sensitive_analyzer,
}
Expand Down Expand Up @@ -178,7 +181,7 @@ def to_redacted_json(self, evidence_value, sensitive, tainted_ranges, sources):
if next_tainted and next_tainted["start"] == i:
self.write_value_part(value_parts, evidence_value[start:i], source_index)

source_index = next_tainted_index
source_index = _get_source_index(sources, next_tainted["source"])

while next_sensitive and self._contains(next_tainted, next_sensitive):
redaction_start = next_sensitive["start"] - next_tainted["start"]
Expand Down
70 changes: 70 additions & 0 deletions ddtrace/appsec/_iast/_evidence_redaction/sql_sensitive_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import re

from ddtrace.appsec._iast.constants import DBAPI_MARIADB
from ddtrace.appsec._iast.constants import DBAPI_MYSQL
from ddtrace.appsec._iast.constants import DBAPI_PSYCOPG
from ddtrace.appsec._iast.constants import DBAPI_SQLITE
from ddtrace.internal.logger import get_logger


log = get_logger(__name__)


# Regexes for the literal/comment tokens that can carry sensitive data inside a
# SQL statement.  All of these are raw strings, so a regex escape such as \d is
# written with a single backslash.
STRING_LITERAL = r"'(?:''|[^'])*'"  # standard SQL string; '' is an escaped quote
POSTGRESQL_ESCAPED_LITERAL = r"\$([^$]*)\$.*?\$\1\$"  # dollar-quoted string, e.g. $tag$...$tag$
# MySQL-family strings allow a backslash-escaped quote in both quoting styles.
# (fixed: `\\\\"` / `\\\\'` matched two literal backslashes instead of an escaped quote)
MYSQL_STRING_LITERAL = r'"(?:\\"|[^"])*"|\'(?:\\\'|[^\'])*\''
LINE_COMMENT = r"--.*$"
BLOCK_COMMENT = r"/\*[\s\S]*?\*/"
# Optional exponent suffix of a numeric literal, e.g. 1E10, 2e-3f.
# (fixed: `\\d` in a raw string matched a literal backslash followed by "d", not digits)
EXPONENT = r"(?:E[-+]?\d+[fd]?)?"
INTEGER_NUMBER = r"(?<!\w)\d+"
DECIMAL_NUMBER = r"\d*\.\d+"
HEX_NUMBER = r"x'[0-9a-f]+'|0x[0-9a-f]+"
BIN_NUMBER = r"b'[0-9a-f]+'|0b[0-9a-f]+"
NUMERIC_LITERAL = (
    r"[-+]?(?:" + "|".join([HEX_NUMBER, BIN_NUMBER, DECIMAL_NUMBER + EXPONENT, INTEGER_NUMBER + EXPONENT]) + r")"
)

# Token-matching regex shared by the MySQL-style dialects: SQLite and MariaDB
# follow MySQL quoting rules, so all three map to the same compiled pattern.
_MYSQL_FAMILY_PATTERN = re.compile(
    f"({NUMERIC_LITERAL})|({MYSQL_STRING_LITERAL})|({LINE_COMMENT})|({BLOCK_COMMENT})", re.IGNORECASE | re.MULTILINE
)

# Compiled token regex per DB-API dialect name.
patterns = {
    DBAPI_MYSQL: _MYSQL_FAMILY_PATTERN,
    DBAPI_PSYCOPG: re.compile(
        f"({NUMERIC_LITERAL})|({POSTGRESQL_ESCAPED_LITERAL})|({STRING_LITERAL})|({LINE_COMMENT})|({BLOCK_COMMENT})",
        re.IGNORECASE | re.MULTILINE,
    ),
}
patterns[DBAPI_SQLITE] = _MYSQL_FAMILY_PATTERN
patterns[DBAPI_MARIADB] = _MYSQL_FAMILY_PATTERN


def sql_sensitive_analyzer(evidence, name_pattern, value_pattern):
    """Locate sensitive tokens (literals and comments) in a SQL query.

    Scans ``evidence.value`` with the regex for ``evidence.dialect`` (falling
    back to the MySQL pattern for unknown dialects) and returns a list of
    ``{"start": int, "end": int}`` spans.  Each span is trimmed so that the
    quoting/comment delimiters themselves stay visible and only the payload
    between them gets redacted.

    ``name_pattern`` and ``value_pattern`` are not used here; presumably they
    are kept for signature parity with the other sensitive analyzers — confirm.
    """
    query = evidence.value
    dialect_pattern = patterns.get(evidence.dialect, patterns[DBAPI_MYSQL])
    tokens = []

    found = dialect_pattern.search(query)
    while found is not None:
        begin = found.start()
        finish = found.end()
        first = query[begin]
        if first in ("'", '"'):
            # Quoted string: keep the surrounding quotes.
            begin += 1
            finish -= 1
        elif finish > begin + 1:
            second = query[begin + 1]
            if first == "/" and second == "*":
                # Block comment: keep the /* and */ delimiters.
                begin += 2
                finish -= 2
            elif first == "-" and second == "-":
                # Line comment: keep the leading --.
                begin += 2
            elif second == "'" and first.lower() == "q":
                # Oracle-style q'(...)' literal: keep q'( and )'.
                begin += 3
                finish -= 2
            elif first == "$":
                # PostgreSQL dollar-quoted literal: strip the $tag$ delimiters.
                delimiter_len = found.group(0).find("$", 1) + 1
                if delimiter_len > 1:
                    begin += delimiter_len
                    finish -= delimiter_len
        tokens.append({"start": begin, "end": finish})
        found = dialect_pattern.search(query, found.end())
    return tokens
21 changes: 0 additions & 21 deletions ddtrace/appsec/_iast/_taint_dict.py

This file was deleted.

1 change: 0 additions & 1 deletion ddtrace/appsec/_iast/_taint_tracking/aspects.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,6 @@ def format_value_aspect(
if options == 115:
new_text = str_aspect(str, 0, element)
elif options == 114:
# TODO: use our repr once we have implemented it
new_text = repr_aspect(repr, 0, element)
elif options == 97:
new_text = ascii(element)
Expand Down
4 changes: 2 additions & 2 deletions ddtrace/appsec/_iast/_taint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from typing import Optional
from typing import Union

from ddtrace.appsec._iast.constants import DBAPI_INTEGRATIONS
from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config


DBAPI_INTEGRATIONS = ("sqlite", "psycopg", "mysql", "mariadb")
DBAPI_PREFIXES = ("django-",)

log = get_logger(__name__)
Expand Down Expand Up @@ -529,7 +529,7 @@ def supported_dbapi_integration(integration_name):
return integration_name in DBAPI_INTEGRATIONS or integration_name.startswith(DBAPI_PREFIXES)


def check_tainted_args(args, kwargs, tracer, integration_name, method):
def check_tainted_dbapi_args(args, kwargs, tracer, integration_name, method):
if supported_dbapi_integration(integration_name) and method.__name__ == "execute":
from ._taint_tracking import is_pyobject_tainted

Expand Down
90 changes: 8 additions & 82 deletions ddtrace/appsec/_iast/_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
import re
import string
import sys
from typing import TYPE_CHECKING # noqa:F401
from typing import List

from ddtrace.internal.logger import get_logger
from ddtrace.settings.asm import config as asm_config


if TYPE_CHECKING:
from typing import Any # noqa:F401
from typing import List # noqa:F401
from typing import Set # noqa:F401
from typing import Tuple # noqa:F401


def _is_python_version_supported():  # type: () -> bool
    """Return True when the running interpreter is in IAST's supported range (3.6 – 3.12)."""
    lower_bound = (3, 6, 0)
    upper_bound = (3, 13, 0)  # exclusive: anything below 3.13
    return lower_bound <= sys.version_info < upper_bound
Expand All @@ -31,78 +22,13 @@ def _is_iast_enabled():
return True


# Used to cache the compiled regular expression
# (compiled lazily, on first use, from the asm_config redaction patterns).
_SOURCE_NAME_SCRUB = None
_SOURCE_VALUE_SCRUB = None
_SOURCE_NUMERAL_SCRUB = None


def _has_to_scrub(s):  # type: (str) -> bool
    """Return True when *s* matches any configured redaction pattern
    (source name, value or numeral) and must therefore be scrubbed.
    """
    # TODO: This function is deprecated.
    # Redaction migrated to `ddtrace.appsec._iast._evidence_redaction._sensitive_handler` but we need to migrate
    # all vulnerabilities to use it first.
    global _SOURCE_NAME_SCRUB
    global _SOURCE_VALUE_SCRUB
    global _SOURCE_NUMERAL_SCRUB

    # Compile all three patterns once, on the first call.
    if _SOURCE_NAME_SCRUB is None:
        _SOURCE_NAME_SCRUB = re.compile(asm_config._iast_redaction_name_pattern)
        _SOURCE_VALUE_SCRUB = re.compile(asm_config._iast_redaction_value_pattern)
        _SOURCE_NUMERAL_SCRUB = re.compile(asm_config._iast_redaction_numeral_pattern)

    # NOTE: `match` only anchors at the beginning of the string.
    return (
        _SOURCE_NAME_SCRUB.match(s) is not None
        or _SOURCE_VALUE_SCRUB.match(s) is not None
        or _SOURCE_NUMERAL_SCRUB.match(s) is not None
    )


def _is_numeric(s):
    """Return True when *s* matches the configured numeral redaction pattern."""
    # TODO: This function is deprecated.
    # Redaction migrated to `ddtrace.appsec._iast._evidence_redaction._sensitive_handler` but we need to migrate
    # all vulnerabilities to use it first.
    global _SOURCE_NUMERAL_SCRUB

    # Compile lazily; the pattern may already have been compiled by _has_to_scrub.
    if _SOURCE_NUMERAL_SCRUB is None:
        _SOURCE_NUMERAL_SCRUB = re.compile(asm_config._iast_redaction_numeral_pattern)

    return _SOURCE_NUMERAL_SCRUB.match(s) is not None


# Alphabet used when scrubbing ranged values: position i maps to the ascii
# letter at i modulo the alphabet length, giving a deterministic output.
_REPLACEMENTS = string.ascii_letters
_LEN_REPLACEMENTS = len(_REPLACEMENTS)


def _scrub(s, has_range=False):  # type: (str, bool) -> str
    """Return a masked string with the same length as *s*.

    Plain values are fully masked with asterisks; ranged values get a
    deterministic letter sequence instead.
    """
    # TODO: deprecated — redaction moved to
    # `ddtrace.appsec._iast._evidence_redaction._sensitive_handler`; kept until
    # every vulnerability type is migrated.
    if not has_range:
        return "*" * len(s)
    return "".join(_REPLACEMENTS[position % _LEN_REPLACEMENTS] for position in range(len(s)))


def _is_evidence_value_parts(value):  # type: (Any) -> bool
    """Return True when *value* is a collection of evidence value parts (a set or a list)."""
    # TODO: deprecated — superseded by the `_evidence_redaction` handlers.
    return isinstance(value, set) or isinstance(value, list)


def _scrub_get_tokens_positions(text, tokens):
    # type: (str, Set[str]) -> List[Tuple[int, int]]
    """Return sorted ``(start, end)`` spans for the first occurrence of each
    token found in *text*; tokens that do not occur are skipped.
    """
    # TODO: deprecated — superseded by the `_evidence_redaction` handlers.
    spans = []
    for token in tokens:
        found_at = text.find(token)
        if found_at >= 0:
            spans.append((found_at, found_at + len(token)))
    return sorted(spans)
def _get_source_index(sources: List, source) -> int:
    """Return the index of *source* in *sources*, comparing by hash, or -1 if absent."""
    wanted = hash(source)
    for index, candidate in enumerate(sources):
        if hash(candidate) == wanted:
            return index
    return -1


def _get_patched_code(module_path, module_name): # type: (str, str) -> str
Expand Down
14 changes: 5 additions & 9 deletions ddtrace/appsec/_iast/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,6 @@

VULNERABILITY_TOKEN_TYPE = Dict[int, Dict[str, Any]]

EVIDENCE_ALGORITHM_TYPE = "ALGORITHM"
EVIDENCE_SQL_INJECTION = "SQL_INJECTION"
EVIDENCE_PATH_TRAVERSAL = "PATH_TRAVERSAL"
EVIDENCE_WEAK_RANDOMNESS = "WEAK_RANDOMNESS"
EVIDENCE_COOKIE = "COOKIE"
EVIDENCE_CMDI = "COMMAND"
EVIDENCE_HEADER_INJECTION = "HEADER_INJECTION"
EVIDENCE_SSRF = "SSRF"

HEADER_NAME_VALUE_SEPARATOR = ": "

MD5_DEF = "md5"
Expand Down Expand Up @@ -91,3 +82,8 @@
"tarfile": {"open"},
"zipfile": {"ZipFile"},
}
# DB-API integration names whose queries IAST analyzes for SQL injection.
DBAPI_SQLITE = "sqlite"
DBAPI_PSYCOPG = "psycopg"
DBAPI_MYSQL = "mysql"
DBAPI_MARIADB = "mariadb"
# All supported DB-API dialects; used to gate tainted-argument checks.
DBAPI_INTEGRATIONS = (DBAPI_SQLITE, DBAPI_PSYCOPG, DBAPI_MYSQL, DBAPI_MARIADB)
20 changes: 10 additions & 10 deletions ddtrace/appsec/_iast/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import attr

from ddtrace.appsec._iast._evidence_redaction import sensitive_handler
from ddtrace.appsec._iast._utils import _get_source_index
from ddtrace.appsec._iast.constants import VULN_INSECURE_HASHING_TYPE
from ddtrace.appsec._iast.constants import VULN_WEAK_CIPHER_TYPE
from ddtrace.appsec._iast.constants import VULN_WEAK_RANDOMNESS
Expand All @@ -26,8 +27,12 @@ def _only_if_true(value):
return value if value else None


ATTRS_TO_SKIP = frozenset({"_ranges", "_evidences_with_no_sources", "dialect"})


@attr.s(eq=False, hash=False)
class Evidence(object):
dialect = attr.ib(type=str, default=None) # type: Optional[str]
value = attr.ib(type=str, default=None) # type: Optional[str]
_ranges = attr.ib(type=dict, default={}) # type: Any
valueParts = attr.ib(type=list, default=None) # type: Any
Expand Down Expand Up @@ -143,14 +148,6 @@ def add_ranges_to_evidence_and_extract_sources(self, vuln):
if source not in self.sources:
self.sources = self.sources + [source]

def _get_source_index(self, sources: List[Source], source: Source) -> int:
i = 0
for source_ in sources:
if hash(source_) == hash(source):
return i
i += 1
return -1

def build_and_scrub_value_parts(self) -> Dict[str, Any]:
"""
Builds and scrubs value parts of vulnerabilities.
Expand Down Expand Up @@ -197,7 +194,7 @@ def get_unredacted_value_parts(self, evidence_value: str, ranges: List[dict], so
if from_index < range_["start"]:
value_parts.append({"value": evidence_value[from_index : range_["start"]]})

source_index = self._get_source_index(sources, range_["source"])
source_index = _get_source_index(sources, range_["source"])

value_parts.append(
{"value": evidence_value[range_["start"] : range_["end"]], "source": source_index} # type: ignore[dict-item]
Expand All @@ -217,7 +214,10 @@ def _to_dict(self) -> Dict[str, Any]:
Returns:
- Dict[str, Any]: Dictionary representation of the IAST span reporter.
"""
return attr.asdict(self, filter=lambda attr, x: x is not None and attr.name != "_ranges")
return attr.asdict(
self,
filter=lambda attr, x: x is not None and attr.name not in ATTRS_TO_SKIP,
)

def _to_str(self) -> str:
"""
Expand Down
Loading

0 comments on commit a568c63

Please sign in to comment.