From 60f351cd2e6ac595f84721c30f0725c530760421 Mon Sep 17 00:00:00 2001 From: Adam Sachs Date: Wed, 10 Dec 2025 07:37:28 -0500 Subject: [PATCH 1/2] initial commit for mssql query hints --- src/fides/api/graph/config.py | 3 + src/fides/api/models/datasetconfig.py | 20 +++ src/fides/api/schemas/query_hints/__init__.py | 11 ++ src/fides/api/schemas/query_hints/base.py | 100 +++++++++++++++ .../schemas/query_hints/mssql_query_hints.py | 64 ++++++++++ .../microsoft_sql_server_query_config.py | 26 ++++ tests/ops/schemas/test_query_hints.py | 116 +++++++++++++++++ .../connectors/test_mssql_query_config.py | 120 ++++++++++++++++++ 8 files changed, 460 insertions(+) create mode 100644 src/fides/api/schemas/query_hints/__init__.py create mode 100644 src/fides/api/schemas/query_hints/base.py create mode 100644 src/fides/api/schemas/query_hints/mssql_query_hints.py create mode 100644 tests/ops/schemas/test_query_hints.py create mode 100644 tests/ops/service/connectors/test_mssql_query_config.py diff --git a/src/fides/api/graph/config.py b/src/fides/api/graph/config.py index 3716676fb65..b2cbbe94535 100644 --- a/src/fides/api/graph/config.py +++ b/src/fides/api/graph/config.py @@ -99,6 +99,7 @@ from fides.api.schemas.partitioning.time_based_partitioning import ( validate_partitioning_list, ) +from fides.api.schemas.query_hints.base import QueryHints from fides.api.util.collection_util import merge_dicts from fides.api.util.querytoken import QueryToken @@ -464,6 +465,8 @@ class Collection(BaseModel): data_categories: Set[FidesKey] = set() masking_strategy_override: Optional[MaskingStrategyOverride] = None partitioning: Optional[Union[List[TimeBasedPartitioning], Dict[str, Any]]] = None + # Query hints for optimizing database queries (e.g., MAXDOP for MSSQL) + query_hints: Optional[QueryHints] = None @property def field_dict(self) -> Dict[FieldPath, Field]: diff --git a/src/fides/api/models/datasetconfig.py b/src/fides/api/models/datasetconfig.py index 835c9d91762..1727e4ac152 
100644 --- a/src/fides/api/models/datasetconfig.py +++ b/src/fides/api/models/datasetconfig.py @@ -19,6 +19,7 @@ ) from fides.api.graph.data_type import parse_data_type_string from fides.api.models.connectionconfig import ConnectionConfig, ConnectionType +from fides.api.schemas.query_hints.base import QueryHints from fides.api.service.masking.strategy.masking_strategy import MaskingStrategy from fides.api.util.saas_util import merge_datasets @@ -336,6 +337,24 @@ def convert_dataset_to_graph( if collection.fides_meta and collection.fides_meta.partitioning: collection_partitioning = collection.fides_meta.partitioning + # Extract query hints from collection metadata if present + collection_query_hints = None + if ( + collection.fides_meta + and hasattr(collection.fides_meta, "query_hints") + and collection.fides_meta.query_hints + ): + try: + collection_query_hints = QueryHints( + hints=collection.fides_meta.query_hints + ) + except Exception: + logger.warning( + "Invalid query_hints on collection {}.{}, ignoring", + dataset_name, + collection.name, + ) + graph_collection = Collection( name=collection.name, fields=graph_fields, @@ -347,6 +366,7 @@ def convert_dataset_to_graph( set(collection.data_categories) if collection.data_categories else set() ), partitioning=collection_partitioning, + query_hints=collection_query_hints, ) graph_collections.append(graph_collection) logger.debug( diff --git a/src/fides/api/schemas/query_hints/__init__.py b/src/fides/api/schemas/query_hints/__init__.py new file mode 100644 index 00000000000..acd6472368c --- /dev/null +++ b/src/fides/api/schemas/query_hints/__init__.py @@ -0,0 +1,11 @@ +"""Query hints schemas for database-specific query optimization.""" + +from fides.api.schemas.query_hints.base import QueryHint, QueryHints +from fides.api.schemas.query_hints.mssql_query_hints import MSSQLHintType, MSSQLQueryHint + +__all__ = [ + "QueryHint", + "QueryHints", + "MSSQLHintType", + "MSSQLQueryHint", +] diff --git 
a/src/fides/api/schemas/query_hints/base.py b/src/fides/api/schemas/query_hints/base.py
new file mode 100644
index 00000000000..f07dc8c8451
--- /dev/null
+++ b/src/fides/api/schemas/query_hints/base.py
@@ -0,0 +1,100 @@
+"""Base classes for query hints."""
+
+from abc import ABC, abstractmethod
+from typing import Any, ClassVar, Dict, List, Optional, Set, Type
+
+from loguru import logger
+from pydantic import BaseModel, ValidationError
+
+
+class QueryHint(BaseModel, ABC):
+    """
+    Base class for database-specific query hints.
+
+    Each database implementation must define:
+    - The hint type enum
+    - Validation for hint values
+    - How to render the hint as SQL
+    """
+
+    # Registry of implementations by connection type
+    _implementations: ClassVar[Dict[str, Type["QueryHint"]]] = {}
+
+    # The connection types this hint applies to
+    connection_types: ClassVar[Set[str]] = set()
+
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        super().__init_subclass__(**kwargs)
+        for conn_type in cls.connection_types:
+            cls._implementations[conn_type] = cls
+
+    @classmethod
+    def get_implementation(cls, connection_type: str) -> Optional[Type["QueryHint"]]:
+        """Get the QueryHint implementation for a connection type."""
+        return cls._implementations.get(connection_type)
+
+    @classmethod
+    def get_supported_connection_types(cls) -> Set[str]:
+        """Get all connection types that support query hints."""
+        return set(cls._implementations.keys())
+
+    @abstractmethod
+    def to_sql_option(self) -> str:
+        """
+        Render this hint as a SQL OPTION clause component.
+
+        Returns the hint without the OPTION() wrapper, e.g., "MAXDOP 1"
+        """
+
+
+class QueryHints(BaseModel):
+    """
+    Container for multiple query hints that can be specified on a Collection.
+
+    Example YAML:
+        fides_meta:
+          query_hints:
+            - hint_type: maxdop
+              value: 1
+    """
+
+    hints: List[Dict[str, Any]] = []
+
+    def get_hints_for_connection_type(self, connection_type: str) -> List[QueryHint]:
+        """
+        Parse and validate hints for a specific connection type.
+        Returns only hints that are valid for this connection type.
+        """
+        implementation = QueryHint.get_implementation(connection_type)
+        if implementation is None:
+            return []
+
+        valid_hints = []
+        for hint_dict in self.hints:
+            try:
+                hint = implementation.model_validate(hint_dict)
+            except ValidationError as exc:
+                # Only validation failures are skipped; other errors propagate
+                logger.debug(
+                    "Skipping invalid query hint for connection type {}: {}",
+                    connection_type,
+                    exc,
+                )
+            else:
+                valid_hints.append(hint)
+
+        return valid_hints
+
+    def to_sql_option_clause(self, connection_type: str) -> Optional[str]:
+        """
+        Generate the full SQL OPTION clause for this connection type.
+
+        Returns None if no valid hints exist for this connection type.
+        Returns e.g., "OPTION (MAXDOP 1)" for MSSQL.
+        """
+        hints = self.get_hints_for_connection_type(connection_type)
+        if not hints:
+            return None
+
+        hint_parts = [hint.to_sql_option() for hint in hints]
+        return f"OPTION ({', '.join(hint_parts)})"
diff --git a/src/fides/api/schemas/query_hints/mssql_query_hints.py b/src/fides/api/schemas/query_hints/mssql_query_hints.py
new file mode 100644
index 00000000000..9aee619af38
--- /dev/null
+++ b/src/fides/api/schemas/query_hints/mssql_query_hints.py
@@ -0,0 +1,64 @@
+"""Microsoft SQL Server specific query hints."""
+
+from enum import Enum
+from typing import ClassVar, Optional, Set
+
+from pydantic import model_validator
+
+from fides.api.schemas.query_hints.base import QueryHint
+
+
+class MSSQLHintType(str, Enum):
+    """
+    Supported Microsoft SQL Server query hints.
+
+    Reference: https://learn.microsoft.com/en-us/sql/t-sql/queries/hints-transact-sql-query
+
+    We explicitly enumerate only safe, performance-related hints.
+    This prevents SQL injection by only allowing known hint types.
+    """
+
+    # Parallelism hints
+    MAXDOP = "maxdop"
+
+    # Future hints can be added here as needed:
+    # RECOMPILE = "recompile"
+    # OPTIMIZE_FOR_UNKNOWN = "optimize_for_unknown"
+    # FAST = "fast"
+    # MAXRECURSION = "maxrecursion"
+
+
+class MSSQLQueryHint(QueryHint):
+    """
+    Microsoft SQL Server query hint.
+
+    Example usage in Dataset YAML:
+        fides_meta:
+          query_hints:
+            - hint_type: maxdop
+              value: 1
+    """
+
+    connection_types: ClassVar[Set[str]] = {"mssql"}
+
+    hint_type: MSSQLHintType
+    value: Optional[int] = None
+
+    @model_validator(mode="after")
+    def validate_hint_value(self) -> "MSSQLQueryHint":
+        """Validate the already type-checked value against the hint's allowed range."""
+        if self.hint_type == MSSQLHintType.MAXDOP:
+            if self.value is None:
+                raise ValueError("MAXDOP hint requires a value")
+            if not 0 <= self.value <= 64:
+                raise ValueError("MAXDOP value must be an integer between 0 and 64")
+
+        return self
+
+    def to_sql_option(self) -> str:
+        """Render as SQL OPTION clause component."""
+        if self.hint_type == MSSQLHintType.MAXDOP:
+            return f"MAXDOP {self.value}"
+
+        # Future hints would be handled here
+        raise ValueError(f"Unknown hint type: {self.hint_type}")
diff --git a/src/fides/api/service/connectors/query_configs/microsoft_sql_server_query_config.py b/src/fides/api/service/connectors/query_configs/microsoft_sql_server_query_config.py
index 427f2b2e223..631e85dd274 100644
--- a/src/fides/api/service/connectors/query_configs/microsoft_sql_server_query_config.py
+++ b/src/fides/api/service/connectors/query_configs/microsoft_sql_server_query_config.py
@@ -1,3 +1,5 @@
+from typing import List
+
 from fides.api.service.connectors.query_configs.query_config import (
     QueryStringWithoutTuplesOverrideQueryConfig,
 )
@@ -7,3
+9,27 @@ class MicrosoftSQLServerQueryConfig(QueryStringWithoutTuplesOverrideQueryConfig)
     """
     Generates SQL valid for SQLServer.
     """
+
+    def get_formatted_query_string(
+        self,
+        field_list: str,
+        clauses: List[str],
+    ) -> str:
+        """
+        Build the SELECT for this collection, OR-joining `clauses` in the WHERE.
+
+        If the collection's query_hints yield an OPTION clause for "mssql", it is
+        appended to the statement. Example output:
+            SELECT a, b FROM table WHERE x IN (:x) OPTION (MAXDOP 1)
+        """
+        base_query = f"SELECT {field_list} FROM {self.node.collection.name} WHERE {' OR '.join(clauses)}"
+
+        # query_hints is optional on the collection; no OPTION clause when unset
+        if self.node.collection.query_hints:
+            option_clause = self.node.collection.query_hints.to_sql_option_clause(
+                "mssql"
+            )
+            if option_clause:
+                return f"{base_query} {option_clause}"
+
+        return base_query
diff --git a/tests/ops/schemas/test_query_hints.py b/tests/ops/schemas/test_query_hints.py
new file mode 100644
index 00000000000..ebdfc8d15d1
--- /dev/null
+++ b/tests/ops/schemas/test_query_hints.py
@@ -0,0 +1,116 @@
+"""Tests for query hints schemas."""
+
+import pytest
+from pydantic import ValidationError
+
+from fides.api.schemas.query_hints.base import QueryHint, QueryHints
+from fides.api.schemas.query_hints.mssql_query_hints import MSSQLHintType, MSSQLQueryHint
+
+
+class TestMSSQLQueryHint:
+    """Tests for Microsoft SQL Server query hints."""
+
+    def test_maxdop_hint_valid(self):
+        """Test that a valid MAXDOP hint is created successfully."""
+        hint = MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP, value=1)
+        assert hint.hint_type == MSSQLHintType.MAXDOP
+        assert hint.value == 1
+        assert hint.to_sql_option() == "MAXDOP 1"
+
+    def test_maxdop_hint_zero(self):
+        """Test that MAXDOP 0, the lower bound of the accepted range, is valid."""
+        hint = MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP, value=0)
+        assert hint.to_sql_option() == "MAXDOP 0"
+
+    def test_maxdop_hint_max_value(self):
+        """Test that MAXDOP 64 is valid (max allowed)."""
+        hint = MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP, value=64)
+        assert hint.to_sql_option() == "MAXDOP 64"
+
+    def test_maxdop_hint_missing_value(self):
+        """Test that MAXDOP hint requires a value."""
+        with pytest.raises(ValidationError) as exc_info:
+            MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP)
+        assert "MAXDOP hint requires a value" in str(exc_info.value)
+
+    def test_maxdop_hint_negative_value(self):
+        """Test that negative MAXDOP values are rejected."""
+        with pytest.raises(ValidationError) as exc_info:
+            MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP, value=-1)
+        assert "MAXDOP value must be an integer between 0 and 64" in str(exc_info.value)
+
+    def test_maxdop_hint_value_too_high(self):
+        """Test that MAXDOP values > 64 are rejected."""
+        with pytest.raises(ValidationError) as exc_info:
+            MSSQLQueryHint(hint_type=MSSQLHintType.MAXDOP, value=65)
+        assert "MAXDOP value must be an integer between 0 and 64" in str(exc_info.value)
+
+    def test_from_dict(self):
+        """Test creating hint from dictionary (as would come from YAML)."""
+        hint_dict = {"hint_type": "maxdop", "value": 1}
+        hint = MSSQLQueryHint.model_validate(hint_dict)
+        assert hint.hint_type == MSSQLHintType.MAXDOP
+        assert hint.value == 1
+
+
+class TestQueryHint:
+    """Tests for the base QueryHint class."""
+
+    def test_mssql_implementation_registered(self):
+        """Test that MSSQLQueryHint is the implementation registered for "mssql"."""
+        impl = QueryHint.get_implementation("mssql")
+        assert impl == MSSQLQueryHint
+
+    def test_unknown_connection_type_returns_none(self):
+        """Test that unknown connection types return None."""
+        impl = QueryHint.get_implementation("unknown_db")
+        assert impl is None
+
+    def test_supported_connection_types(self):
+        """Test that "mssql" is among the supported connection types."""
+        supported = QueryHint.get_supported_connection_types()
+        assert "mssql" in supported
+
+
+class TestQueryHints:
+    """Tests for the QueryHints container."""
+
+    def test_empty_hints(self):
+        """Test that an empty container yields no hints and no OPTION clause."""
+        hints = QueryHints(hints=[])
+        assert hints.get_hints_for_connection_type("mssql") == []
+        assert hints.to_sql_option_clause("mssql") is None
+
+    def test_single_mssql_hint(self):
+        """Test single MSSQL hint."""
+        hints = QueryHints(hints=[{"hint_type": "maxdop", "value": 1}])
+        mssql_hints = hints.get_hints_for_connection_type("mssql")
+        assert len(mssql_hints) == 1
+        assert mssql_hints[0].to_sql_option() == "MAXDOP 1"
+
+    def test_to_sql_option_clause(self):
+        """Test generating full OPTION clause."""
+        hints = QueryHints(hints=[{"hint_type": "maxdop", "value": 1}])
+        clause = hints.to_sql_option_clause("mssql")
+        assert clause == "OPTION (MAXDOP 1)"
+
+    def test_invalid_hints_skipped(self):
+        """Test that invalid hints are skipped instead of raising."""
+        hints = QueryHints(
+            hints=[
+                {"hint_type": "maxdop", "value": 1},  # Valid
+                {"hint_type": "invalid_hint", "value": 99},  # Unknown hint type
+                {"hint_type": "maxdop", "value": -1},  # Value out of range
+            ]
+        )
+        mssql_hints = hints.get_hints_for_connection_type("mssql")
+        # Only the valid hint should be returned
+        assert len(mssql_hints) == 1
+        assert mssql_hints[0].value == 1
+
+    def test_hints_for_unsupported_connection_type(self):
+        """Test that unsupported connection types return no hints."""
+        hints = QueryHints(hints=[{"hint_type": "maxdop", "value": 1}])
+        postgres_hints = hints.get_hints_for_connection_type("postgres")
+        assert postgres_hints == []
+        assert hints.to_sql_option_clause("postgres") is None
diff --git a/tests/ops/service/connectors/test_mssql_query_config.py b/tests/ops/service/connectors/test_mssql_query_config.py
new file mode 100644
index 00000000000..a7b3b1d517e
--- /dev/null
+++ b/tests/ops/service/connectors/test_mssql_query_config.py
@@ -0,0 +1,120 @@
+"""Tests for Microsoft SQL Server query configuration."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from fides.api.graph.config import Collection, FieldPath, ScalarField
+from fides.api.schemas.query_hints.base import QueryHints
+from fides.api.service.connectors.query_configs.microsoft_sql_server_query_config import (
+    MicrosoftSQLServerQueryConfig,
+)
+
+
+@pytest.fixture
+def mock_execution_node():
+    """Create a MagicMock node whose collection is a real Collection without hints."""
+    node = MagicMock()
+    node.collection = Collection(
+        name="test_table",
+        fields=[
+            ScalarField(name="id", primary_key=True),
+            ScalarField(name="email"),
+            ScalarField(name="name"),
+        ],
+    )
+    node.address = MagicMock()
+    node.address.value = "test_dataset:test_table"
+    return node
+
+
+@pytest.fixture
+def mock_execution_node_with_hints():
+    """Create a MagicMock node whose collection carries a MAXDOP 1 query hint."""
+    node = MagicMock()
+    node.collection = Collection(
+        name="test_table",
+        fields=[
+            ScalarField(name="id", primary_key=True),
+            ScalarField(name="email"),
+            ScalarField(name="name"),
+        ],
+        query_hints=QueryHints(hints=[{"hint_type": "maxdop", "value": 1}]),
+    )
+    node.address = MagicMock()
+    node.address.value = "test_dataset:test_table"
+    return node
+
+
+class TestMicrosoftSQLServerQueryConfig:
+    """Tests for MSSQL query configuration."""
+
+    def test_get_formatted_query_string_without_hints(self, mock_execution_node):
+        """Test query string generation without hints."""
+        config = MicrosoftSQLServerQueryConfig(mock_execution_node)
+
+        query = config.get_formatted_query_string(
+            field_list="id, email, name",
+            clauses=["email = :email"],
+        )
+
+        assert query == "SELECT id, email, name FROM test_table WHERE email = :email"
+        assert "OPTION" not in query
+
+    def test_get_formatted_query_string_with_maxdop_hint(
+        self, mock_execution_node_with_hints
+    ):
+        """Test query string generation with MAXDOP hint."""
+        config = MicrosoftSQLServerQueryConfig(mock_execution_node_with_hints)
+
+        query = config.get_formatted_query_string(
+            field_list="id, email, name",
+            clauses=["email = :email"],
+        )
+
+        assert (
+            query
+            == "SELECT id, email, name FROM test_table WHERE email = :email OPTION (MAXDOP 1)"
+        )
+
+    def test_get_formatted_query_string_with_multiple_clauses(
+        self, mock_execution_node_with_hints
+    ):
+        """Test that multiple clauses are OR-joined and the hint is still appended."""
+        config = MicrosoftSQLServerQueryConfig(mock_execution_node_with_hints)
+
+        query = config.get_formatted_query_string(
+            field_list="id, email, name",
+            clauses=["email = :email", "id IN (:id_0, :id_1)"],
+        )
+
+        expected = "SELECT id, email, name FROM test_table WHERE email = :email OR id IN (:id_0, :id_1) OPTION (MAXDOP 1)"
+        assert query == expected
+
+    def test_get_formatted_query_string_empty_hints(self, mock_execution_node):
+        """Test that empty hints don't add OPTION clause."""
+        mock_execution_node.collection.query_hints = QueryHints(hints=[])
+        config = MicrosoftSQLServerQueryConfig(mock_execution_node)
+
+        query = config.get_formatted_query_string(
+            field_list="id, email",
+            clauses=["email = :email"],
+        )
+
+        assert "OPTION" not in query
+
+    def test_get_formatted_query_string_invalid_hints_ignored(self, mock_execution_node):
+        """Test that invalid hints are ignored and don't break query generation."""
+        mock_execution_node.collection.query_hints = QueryHints(
+            hints=[{"hint_type": "invalid", "value": 999}]
+        )
+        config = MicrosoftSQLServerQueryConfig(mock_execution_node)
+
+        query = config.get_formatted_query_string(
+            field_list="id, email",
+            clauses=["email = :email"],
+        )
+
+        # Invalid hints should be ignored, no OPTION clause added
+        assert "OPTION" not in query
+        assert query == "SELECT id, email FROM test_table WHERE email = :email"

From aed7f95d6a504792ffc5b54d039b036c879427a1 Mon Sep 17 00:00:00 2001
From: Adam Sachs
Date: Wed, 10 Dec 2025 21:55:40 -0500
Subject: [PATCH 2/2] use fideslang alpha tag with query hint support

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 9a634e40946..dffde2befbb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -77,4 +77,4 @@ toml==0.10.2
 twilio==7.15.0
 typing-extensions==4.12.2
 versioneer==0.19
-fideslang==3.1.2
+fideslang==3.1.3a0