Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/skillspector/nodes/analyzers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
from skillspector.nodes.analyzers.static_patterns_data_exfiltration import (
node as static_patterns_data_exfiltration_node,
)
from skillspector.nodes.analyzers.static_patterns_deserialization import (
node as static_patterns_deserialization_node,
)
from skillspector.nodes.analyzers.static_patterns_excessive_agency import (
node as static_patterns_excessive_agency_node,
)
Expand Down Expand Up @@ -92,6 +95,7 @@
"static_patterns_agent_snooping",
"static_patterns_anti_refusal",
"static_patterns_ssrf",
"static_patterns_deserialization",
"static_yara",
"behavioral_ast",
"behavioral_taint_tracking",
Expand All @@ -118,6 +122,7 @@
"static_patterns_agent_snooping": static_patterns_agent_snooping_node,
"static_patterns_anti_refusal": static_patterns_anti_refusal_node,
"static_patterns_ssrf": static_patterns_ssrf_node,
"static_patterns_deserialization": static_patterns_deserialization_node,
"static_yara": static_yara_node,
"behavioral_ast": behavioral_ast_node,
"behavioral_taint_tracking": behavioral_taint_tracking_node,
Expand Down
85 changes: 85 additions & 0 deletions src/skillspector/nodes/analyzers/behavioral_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,34 @@
}
)

# Deserializers that reconstruct arbitrary objects (or execute code) from their
# input, regardless of arguments. Feeding attacker-controlled bytes to any of
# these is equivalent to code execution: pickle invokes ``__reduce__`` during
# unpickling, ``yaml.unsafe_load`` constructs arbitrary Python objects, etc.
# ``yaml.load``/``torch.load``/``numpy.load`` are handled separately because
# their safety depends on arguments (see ``_deserialization_message``).
_DESERIALIZATION_SINKS = frozenset(
    {
        "pickle.load",
        "pickle.loads",
        "cPickle.load",
        "cPickle.loads",
        "_pickle.load",
        "_pickle.loads",
        "marshal.load",
        "marshal.loads",
        "dill.load",
        "dill.loads",
        "jsonpickle.decode",
        "pandas.read_pickle",
        "joblib.load",
        "yaml.unsafe_load",
        # Multi-document twin of ``yaml.unsafe_load``: same unsafe loader,
        # applied to every document in the stream.
        "yaml.unsafe_load_all",
    }
)

# Loader classes that make ``yaml.load`` safe (no arbitrary object construction).
# ``CSafeLoader``/``CBaseLoader`` are the LibYAML-backed counterparts of
# ``SafeLoader``/``BaseLoader`` and are just as restricted, so both must be
# listed to avoid flagging the hardened C-accelerated forms.
_SAFE_YAML_LOADERS = frozenset({"SafeLoader", "CSafeLoader", "BaseLoader", "CBaseLoader"})

_RULE_MESSAGES: dict[str, str] = {
"AST1": "exec() call detected",
"AST2": "eval() call detected",
Expand All @@ -94,6 +122,7 @@
"AST7": "Dynamic attribute access via getattr()",
"AST8": "Dangerous execution chain",
"AST9": "Reflective dangerous call via getattr() with a literal sink name",
"AST10": "Insecure deserialization of untrusted data",
}

_RULE_SEVERITIES: dict[str, Severity] = {
Expand All @@ -106,6 +135,7 @@
"AST7": Severity.LOW,
"AST8": Severity.CRITICAL,
"AST9": Severity.HIGH,
"AST10": Severity.MEDIUM,
}

_RULE_CONFIDENCES: dict[str, float] = {
Expand All @@ -118,6 +148,7 @@
"AST7": 0.50,
"AST8": 0.95,
"AST9": 0.85,
"AST10": 0.70,
}

_TAG = "Dangerous Code Execution"
Expand Down Expand Up @@ -148,6 +179,57 @@ def _contains_dangerous_source(node: ast.AST, aliases: dict[str, str] | None = N
return None


def _loader_arg_name(node: ast.expr) -> str | None:
"""Return the trailing name of a yaml ``Loader`` argument (``yaml.SafeLoader`` → 'SafeLoader')."""
if isinstance(node, ast.Attribute):
return node.attr
if isinstance(node, ast.Name):
return node.id
return None


def _kwarg_is_true(node: ast.Call, name: str) -> bool:
    """Report whether call *node* passes keyword *name* as the literal ``True``.

    Only an ``ast.Constant`` whose value is the ``True`` singleton counts.
    Truthy values such as ``1`` and non-literal expressions do not, and neither
    does a ``**kwargs`` expansion (its keyword name is ``None``).
    """
    for keyword in node.keywords:
        if keyword.arg != name:
            continue
        value = keyword.value
        if isinstance(value, ast.Constant) and value.value is True:
            return True
    return False


def _deserialization_message(call_name: str, node: ast.Call) -> str | None:
    """Return an AST10 message if *node* is an unsafe deserialization call, else None.

    Names in ``_DESERIALIZATION_SINKS`` are unconditionally unsafe. The remaining
    calls are argument-dependent, and their hardened forms must not be flagged
    (avoids false positives):

    * ``yaml.load`` / ``yaml.load_all`` are safe only when a loader from
      ``_SAFE_YAML_LOADERS`` is passed, either as the ``Loader`` keyword or as
      the second positional argument.
    * ``torch.load`` is treated as safe only with a literal ``weights_only=True``.
    * ``numpy.load`` is unsafe only when ``allow_pickle`` is a literal ``True``,
      passed by keyword or as the third positional argument.

    Args:
        call_name: Dotted name of the callee (e.g. ``"yaml.load"``).
        node: The call expression whose arguments are inspected.

    Returns:
        A human-readable finding message, or ``None`` when the call is not an
        insecure deserialization.
    """
    if call_name in _DESERIALIZATION_SINKS:
        return f"Insecure deserialization: {call_name}()"

    # ``yaml.load_all`` takes the same (stream, Loader) arguments as ``yaml.load``
    # and is exactly as dangerous with an unsafe loader.
    if call_name in ("yaml.load", "yaml.load_all"):
        for kw in node.keywords:
            if kw.arg == "Loader":
                if _loader_arg_name(kw.value) in _SAFE_YAML_LOADERS:
                    return None
                return f"Insecure deserialization: {call_name}() with an unsafe Loader"
        # Loader may also be given positionally: yaml.load(stream, SafeLoader).
        if len(node.args) >= 2 and _loader_arg_name(node.args[1]) in _SAFE_YAML_LOADERS:
            return None
        return f"Insecure deserialization: {call_name}() without SafeLoader"

    if call_name == "torch.load":
        if _kwarg_is_true(node, "weights_only"):
            return None
        return "Insecure deserialization: torch.load() without weights_only=True"

    if call_name == "numpy.load":
        # Signature is numpy.load(file, mmap_mode, allow_pickle, ...), so a
        # literal True in the third positional slot also enables pickle.
        positional_allow_pickle = (
            len(node.args) >= 3
            and isinstance(node.args[2], ast.Constant)
            and node.args[2].value is True
        )
        if positional_allow_pickle or _kwarg_is_true(node, "allow_pickle"):
            return "Insecure deserialization: numpy.load(allow_pickle=True)"
        return None

    return None


def _analyze_python(content: str, file_path: str) -> list[AnalyzerFinding]:
try:
tree = ast.parse(content, filename=file_path)
Expand Down Expand Up @@ -223,6 +305,9 @@ def _emit(
if attr in _OS_EXEC_CALLS:
_emit("AST5", lineno, end_lineno)

elif (deser_msg := _deserialization_message(call_name, ast_node)) is not None:
_emit("AST10", lineno, end_lineno, deser_msg)

elif call_name == "getattr" and len(ast_node.args) >= 2:
second_arg = ast_node.args[1]
if not isinstance(second_arg, ast.Constant):
Expand Down
34 changes: 33 additions & 1 deletion src/skillspector/nodes/analyzers/behavioral_taint_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,32 @@
}
)

_ALL_SINKS = _NETWORK_OUTPUT_SINKS | _EXEC_SINKS | _FILE_WRITE_SINKS
# Deserializers that reconstruct arbitrary objects / execute code on their input.
# When untrusted data (network, user, or a bundled/downloaded file) reaches one of
# these, it is an RCE-class flow — the deserialization analogue of _EXEC_SINKS.
# Only unconditionally-unsafe names are listed; argument-dependent forms
# (yaml.load / torch.load / numpy.load) are handled by behavioral_ast (AST10) where
# keyword arguments can be inspected without false positives on the hardened forms.
_DESERIALIZATION_SINKS = frozenset(
    {
        "pickle.load",
        "pickle.loads",
        "cPickle.load",
        "cPickle.loads",
        "_pickle.load",
        "_pickle.loads",
        "marshal.load",
        "marshal.loads",
        "dill.load",
        "dill.loads",
        "jsonpickle.decode",
        "pandas.read_pickle",
        "joblib.load",
        "yaml.unsafe_load",
        # Multi-document twin of ``yaml.unsafe_load``: same unsafe loader,
        # applied to every document in the stream.
        "yaml.unsafe_load_all",
    }
)

# Union of the four sink sets (network output, code execution, file write,
# deserialization), so one membership test covers every sink category.
_ALL_SINKS = _NETWORK_OUTPUT_SINKS | _EXEC_SINKS | _FILE_WRITE_SINKS | _DESERIALIZATION_SINKS

# Pre-computed for _pick_rule — avoids rebuilding the union on every call.
# Combines the network-input and user-input source sets ("external" input).
_EXTERNAL_INPUT_SOURCES = _NETWORK_INPUT_SOURCES | _USER_INPUT_SOURCES
Expand All @@ -145,6 +170,7 @@
"TT3": Severity.CRITICAL,
"TT4": Severity.HIGH,
"TT5": Severity.CRITICAL,
"TT6": Severity.HIGH,
}

_RULE_CONFIDENCES: dict[str, float] = {
Expand All @@ -153,6 +179,7 @@
"TT3": 0.90,
"TT4": 0.80,
"TT5": 0.90,
"TT6": 0.85,
}

_TAG = "Data Flow"
Expand All @@ -168,6 +195,7 @@
(_NETWORK_OUTPUT_SINKS, "network output"),
(_EXEC_SINKS, "code execution"),
(_FILE_WRITE_SINKS, "file write"),
(_DESERIALIZATION_SINKS, "deserialization"),
]


Expand Down Expand Up @@ -204,6 +232,10 @@ def _pick_rule(source_name: str, sink_name: str, is_direct: bool) -> str:
return "TT4"
if source_name in _EXTERNAL_INPUT_SOURCES and sink_name in _EXEC_SINKS:
return "TT5"
if sink_name in _DESERIALIZATION_SINKS and (
source_name in _EXTERNAL_INPUT_SOURCES or source_name in _FILE_READ_SOURCES
):
return "TT6"
return "TT1" if is_direct else "TT2"


Expand Down
29 changes: 29 additions & 0 deletions src/skillspector/nodes/analyzers/pattern_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class PatternCategory(StrEnum):
AGENT_SNOOPING = "Agent Snooping"
ANTI_REFUSAL = "Anti-Refusal"
SERVER_SIDE_REQUEST_FORGERY = "Server-Side Request Forgery"
DESERIALIZATION = "Insecure Deserialization"


# Pattern-specific explanations (why the finding is dangerous)
Expand Down Expand Up @@ -100,6 +101,7 @@ class PatternCategory(StrEnum):
"TT3": "Credentials or environment variables flow to a network sink. This is a high-confidence indicator of credential exfiltration.",
"TT4": "File contents flow to a network sink. This may indicate data exfiltration of sensitive files.",
"TT5": "External input (network, user) flows to a code execution sink. This enables remote code execution or command injection.",
"TT6": "External input or file contents flow to an insecure deserializer (pickle, marshal, dill, jsonpickle, joblib, yaml.unsafe_load). Deserializing untrusted data reconstructs arbitrary objects and enables remote code execution.",
# Behavioral AST (B.2.1)
"AST1": "Direct exec() call allows arbitrary code execution. An attacker can inject code that runs with the full privileges of the process.",
"AST2": "Direct eval() call evaluates arbitrary expressions. This can be exploited to execute malicious code or exfiltrate data.",
Expand All @@ -110,6 +112,7 @@ class PatternCategory(StrEnum):
"AST7": "Dynamic getattr() with a non-literal attribute name can access arbitrary object attributes, potentially bypassing access controls.",
"AST8": "A dangerous execution chain combines code execution (exec/eval) with a dynamic source (network, encoded data, dynamic import), creating a high-confidence attack vector.",
"AST9": "Reflective access to an execution sink via getattr() with a constant name (e.g. getattr(os, 'system'), getattr(builtins, 'exec')) is functionally identical to a direct exec/os.system call but evades name-based detection. This is a deliberate evasion technique rather than idiomatic code.",
"AST10": "Untrusted data is passed to an insecure deserializer (pickle, marshal, dill, jsonpickle, joblib, yaml.load without a safe Loader, or torch.load without weights_only). These deserializers reconstruct arbitrary objects and invoke callables during loading, so deserializing attacker-controlled bytes is equivalent to arbitrary code execution.",
# YARA (B.1.12)
"YR1": "YARA rule matched a known malware signature (reverse shell, backdoor, ransomware, C2 framework, or info stealer).",
"YR2": "YARA rule matched a known webshell pattern (PHP, Python, JSP, or ASPX webshell).",
Expand Down Expand Up @@ -137,6 +140,11 @@ class PatternCategory(StrEnum):
"SSRF1": "Code accesses a cloud instance metadata endpoint (e.g. 169.254.169.254). A single request can return temporary IAM credentials, making this a high-value SSRF target for credential theft.",
"SSRF2": "Code issues a request to a loopback, link-local, or private-range host. This can reach internal services not meant to be exposed and is a common SSRF pivot.",
"SSRF3": "Request target host is built from a dynamic or untrusted value. If the host is attacker-influenced, this enables SSRF to arbitrary internal or metadata endpoints.",
# Insecure Deserialization (multi-language)
"DS1": "PHP unserialize() on untrusted input enables object injection: crafted serialized data instantiates arbitrary classes and triggers magic methods (__wakeup/__destruct), leading to POP-chain code execution.",
"DS2": "Ruby Marshal.load / Marshal.restore reconstructs arbitrary objects from a binary blob. On attacker-controlled input this is a well-known remote code execution vector.",
"DS3": "Ruby YAML.load / Psych.load / Oj.load (in object mode) can instantiate arbitrary Ruby objects from untrusted YAML/JSON, enabling code execution. Use YAML.safe_load or Psych.safe_load instead.",
"DS4": "JavaScript deserialization via node-serialize/funcster/serialize-to-js evaluates embedded functions (the _$$ND_FUNC$$_ marker), so unserializing attacker input executes arbitrary code in the Node process.",
}

# Rule ID -> category (for report output)
Expand Down Expand Up @@ -187,6 +195,7 @@ class PatternCategory(StrEnum):
"TT3": PatternCategory.DATA_EXFILTRATION.value,
"TT4": PatternCategory.DATA_EXFILTRATION.value,
"TT5": PatternCategory.PRIVILEGE_ESCALATION.value,
"TT6": PatternCategory.DESERIALIZATION.value,
# YARA (B.1.12)
"YR1": PatternCategory.YARA_MATCH.value,
"YR2": PatternCategory.YARA_MATCH.value,
Expand Down Expand Up @@ -214,6 +223,11 @@ class PatternCategory(StrEnum):
"SSRF1": PatternCategory.SERVER_SIDE_REQUEST_FORGERY.value,
"SSRF2": PatternCategory.SERVER_SIDE_REQUEST_FORGERY.value,
"SSRF3": PatternCategory.SERVER_SIDE_REQUEST_FORGERY.value,
# Insecure Deserialization (multi-language)
"DS1": PatternCategory.DESERIALIZATION.value,
"DS2": PatternCategory.DESERIALIZATION.value,
"DS3": PatternCategory.DESERIALIZATION.value,
"DS4": PatternCategory.DESERIALIZATION.value,
}

# Rule ID -> pattern display name (for report output)
Expand Down Expand Up @@ -291,6 +305,14 @@ class PatternCategory(StrEnum):
"SSRF1": "Cloud Metadata Access",
"SSRF2": "Internal Network Request",
"SSRF3": "Dynamic Request Target",
# Behavioral AST / Taint deserialization
"AST10": "Insecure Deserialization",
"TT6": "Untrusted Data to Deserializer Flow",
# Insecure Deserialization (multi-language)
"DS1": "PHP Object Injection",
"DS2": "Ruby Marshal Deserialization",
"DS3": "Unsafe Ruby YAML Deserialization",
"DS4": "Unsafe JavaScript Deserialization",
}

# Pattern-specific remediations (how to fix the issue)
Expand Down Expand Up @@ -354,12 +376,14 @@ class PatternCategory(StrEnum):
"AST7": "Replace dynamic getattr() with explicit attribute access or a dictionary lookup with an allowlist of permitted attributes.",
"AST8": "Remove the execution chain entirely. Never pass network data, decoded bytes, or dynamically imported code to exec()/eval(). Use structured data formats instead.",
"AST9": "Call the function directly instead of reflectively (write exec(...) / os.system(...) explicitly), or remove it. If reflection is genuinely required, restrict it to an allowlist of safe attribute names that excludes execution sinks.",
"AST10": "Never deserialize untrusted input with pickle/marshal/dill/jsonpickle/joblib. Use a data-only format such as JSON. For YAML use yaml.safe_load; for PyTorch use torch.load(..., weights_only=True); for numpy avoid allow_pickle=True. If a binary format is unavoidable, verify an HMAC/signature over the bytes before loading.",
# Behavioral Taint Tracking (B.2.2)
"TT1": "Add validation or sanitization between the data source and sink. Never pass raw source data directly to a sink without checking its content.",
"TT2": "Validate tainted variables before passing them to sinks. Use allowlists, type checks, or sanitization functions on data from external sources.",
"TT3": "Never send credentials or environment variables over the network. Use secure credential stores and avoid transmitting secrets in request bodies or URLs.",
"TT4": "Validate and filter file contents before sending over the network. Ensure sensitive files (credentials, configs) are never transmitted to external endpoints.",
"TT5": "Never pass external input to exec(), eval(), os.system(), or subprocess without strict validation. Use allowlists and parameterized commands instead.",
"TT6": "Do not deserialize external input or bundled/downloaded files with pickle/marshal/dill/jsonpickle/joblib/yaml.unsafe_load. Use JSON or another data-only format, and verify integrity (HMAC/signature) before loading any binary blob.",
# YARA (B.1.12)
"YR1": "Remove the malware payload or compromised file entirely. Investigate how it entered the skill and audit all other artifacts for additional indicators of compromise.",
"YR2": "Remove the webshell code immediately. Webshells provide unauthorized remote command execution. Audit the skill for additional backdoors or persistence mechanisms.",
Expand Down Expand Up @@ -387,6 +411,11 @@ class PatternCategory(StrEnum):
"SSRF1": "Remove access to cloud metadata endpoints unless strictly required. If metadata is needed, restrict it (e.g. IMDSv2 with hop limit) and never expose returned credentials.",
"SSRF2": "Avoid requests to loopback/link-local/private hosts from skill code. If internal access is intended, document it and validate the target against an allowlist.",
"SSRF3": "Do not build request URLs from untrusted input. Validate the host against an allowlist and reject internal/metadata addresses before issuing the request.",
# Insecure Deserialization (multi-language)
"DS1": "Avoid unserialize() on untrusted PHP input. Use json_decode() for data, or restrict allowed classes via the second argument: unserialize($data, ['allowed_classes' => false]).",
"DS2": "Never call Marshal.load/Marshal.restore on untrusted data. Use JSON.parse for data exchange; Marshal is only safe for data you produced and trust.",
"DS3": "Replace YAML.load/Psych.load with YAML.safe_load (or Psych.safe_load) and pass an explicit permitted-classes allowlist. For Oj, avoid :object mode on untrusted input.",
"DS4": "Do not use node-serialize/funcster/serialize-to-js to deserialize untrusted input. Use JSON.parse for data, which never executes embedded code.",
}


Expand Down
Loading
Loading