Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,39 @@ def optimized_get_pk_constraint(
return {"constrained_columns": index_columns, "name": index_name}


def _read_padded_char_field(row: Any, field: str) -> Optional[str]:
"""Read a CHAR(N) field that Teradata returns space-padded, return it stripped.

Returns None when the field is missing or not a string — callers should
leave any existing value alone in that case."""
raw = getattr(row, field, None) if not isinstance(row, dict) else row.get(field)
return raw.strip() if isinstance(raw, str) else None


def _strip_padded_nullable(row: Any, col_info: Dict[str, Any]) -> None:
"""Re-derive col_info['nullable'] by stripping CHAR(1) padding from row.Nullable.

teradatasqlalchemy does a strict `row['Nullable'] == 'Y'` check, but
Teradata returns CHAR(1) values space-padded ('Y '/'N '), so the check
evaluates False for every nullable column. No-op when the row's Nullable
isn't a string (e.g. None for view columns without QVCI enabled)."""
val = _read_padded_char_field(row, "Nullable")
if val is not None:
col_info["nullable"] = val == "Y"


def _strip_padded_autoincrement(row: Any, col_info: Dict[str, Any]) -> None:
"""Re-derive col_info['autoincrement'] by stripping padding from row.IdColType.

Same root cause as _strip_padded_nullable: teradatasqlalchemy checks
`row['IdColType'] in ('GA', 'GD')` strictly, but Teradata returns IdColType
space-padded ('GA '), so the check evaluates False for every identity
column. No-op when the raw value isn't a string."""
val = _read_padded_char_field(row, "IdColType")
if val is not None:
col_info["autoincrement"] = val in ("GA", "GD")


def optimized_get_columns(
self: Any,
connection: Connection,
Expand Down Expand Up @@ -749,6 +782,8 @@ def optimized_get_columns(
for row in res:
try:
col_info = self._get_column_info(row)
_strip_padded_nullable(row, col_info)
_strip_padded_autoincrement(row, col_info)

# Add CommentString as comment field for column description
if hasattr(row, "CommentString") and row.CommentString:
Expand Down
97 changes: 97 additions & 0 deletions metadata-ingestion/tests/unit/test_teradata_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4108,3 +4108,100 @@ def test_disabled_cursor_skips_streaming_entirely(self) -> None:

conn.execution_options.assert_not_called()
assert result is expected


class TestCharPaddingFixes:
"""Teradata returns CHAR(N) values space-padded over the wire. The library's
strict comparisons (`row['Nullable'] == 'Y'`, `row['IdColType'] in ('GA',
'GD')`) then evaluate False for every column, so genuinely nullable columns
hydrate as nullable=False and identity columns hydrate as
autoincrement=False. Exercise the fix through optimized_get_columns so we
catch a regression at the actual code path."""

def _call_with_row(self, row, library_col_info=None):
"""Drive optimized_get_columns with a single mocked row.

library_col_info simulates what teradatasqlalchemy's _get_column_info
would return; the test then asserts our padding fixes override it."""
mock_dialect = MagicMock()
mock_dialect.default_schema_name = "mydb"
mock_dialect._get_column_info.return_value = library_col_info or {
"name": "col1",
"nullable": False,
"autoincrement": False,
}
mock_dialect.get_schema_columns.return_value = {"my_table": [row]}

tables_cache: Dict[str, List[TeradataTable]] = {
"mydb": [
TeradataTable(
database="mydb",
name="my_table",
description=None,
object_type="Table",
create_timestamp=datetime.now(),
last_alter_name=None,
last_alter_timestamp=None,
request_text=None,
)
]
}
return optimized_get_columns(
mock_dialect,
MagicMock(),
"my_table",
"mydb",
tables_cache=tables_cache,
)

@staticmethod
def _row(**fields):
"""SQLAlchemy Row-like object with attribute access for given fields."""
row = MagicMock(spec=list(fields.keys()) + ["CommentString"])
for k, v in fields.items():
setattr(row, k, v)
row.CommentString = None
return row

# Nullable: customer-reported bug
def test_padded_nullable_y_hydrates_as_true(self):
cols = self._call_with_row(self._row(Nullable="Y "))
assert cols[0]["nullable"] is True

def test_padded_nullable_n_hydrates_as_false(self):
cols = self._call_with_row(self._row(Nullable="N "))
assert cols[0]["nullable"] is False

def test_clean_nullable_y_unchanged(self):
cols = self._call_with_row(self._row(Nullable="Y"))
assert cols[0]["nullable"] is True

def test_dict_row_padded_nullable(self):
"""HELP-derived dict rows must work alongside SQLAlchemy Row objects."""
cols = self._call_with_row({"Nullable": "Y ", "ColumnName": "col1"})
assert cols[0]["nullable"] is True

# IdColType: latent sibling bug, same root cause
def test_padded_idcoltype_hydrates_as_autoincrement_true(self):
"""The library does `row['IdColType'] in ('GA','GD')`; Teradata returns
'GA ' padded, so the check fails for every identity column."""
cols = self._call_with_row(
self._row(Nullable="Y", IdColType="GA "),
library_col_info={"name": "col1", "nullable": True, "autoincrement": False},
)
assert cols[0]["autoincrement"] is True

def test_padded_idcoltype_gd_hydrates_as_autoincrement_true(self):
cols = self._call_with_row(
self._row(Nullable="Y", IdColType="GD "),
library_col_info={"name": "col1", "nullable": True, "autoincrement": False},
)
assert cols[0]["autoincrement"] is True

def test_no_idcoltype_leaves_autoincrement_alone(self):
"""Non-identity columns have IdColType=None; we must not overwrite."""
cols = self._call_with_row(
self._row(Nullable="Y", IdColType=None),
library_col_info={"name": "col1", "nullable": True, "autoincrement": False},
)
assert cols[0]["autoincrement"] is False
Loading