diff --git a/Dockerfile b/Dockerfile
index 75bd0ac..faef76c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,6 +29,16 @@ COPY collectors/ ${LAMBDA_TASK_ROOT}/collectors/
 COPY polygon_client.py ${LAMBDA_TASK_ROOT}/
 COPY weekly_collector.py ${LAMBDA_TASK_ROOT}/
 COPY store/ ${LAMBDA_TASK_ROOT}/store/
+# validators/ — top-level imports in collectors/alternative.py +
+# collectors/fundamentals.py (added 2026-05-16 via PR #254 per-collector
+# value-range validation) require the package present in the Lambda image.
+# Latent CI failure mode 2026-05-18 → 2026-05-19 (10 consecutive canary
+# rollbacks to v87) surfaced by the Wave-3 PR3-wave-2 deploy (#273) —
+# every push that touched a deploy-triggering path since #254 failed at
+# canary with ``No module named 'validators'``. The canary correctly
+# rolled back each time so prod (v87) was unaffected; the latent break
+# only blocked any new code from ever reaching ``live``.
+COPY validators/ ${LAMBDA_TASK_ROOT}/validators/
 
 # flow-doctor.yaml at LAMBDA_TASK_ROOT is loaded by setup_logging() at
 # module-top of lambda/handler.py. The path resolves via:
diff --git a/tests/test_dockerfile_copies_match_deployed_imports.py b/tests/test_dockerfile_copies_match_deployed_imports.py
new file mode 100644
index 0000000..974a778
--- /dev/null
+++ b/tests/test_dockerfile_copies_match_deployed_imports.py
@@ -0,0 +1,197 @@
+"""Pin ``Dockerfile`` to COPY every top-level package imported from the
+Lambda-deployed Python modules.
+
+Background
+----------
+The Phase 2 Lambda image only contains the directories the Dockerfile
+explicitly ``COPY``s into ``${LAMBDA_TASK_ROOT}``. When a deployed
+module gains a top-level ``from X import ...`` for a local package ``X``
+that is NOT in the COPY list, the canary fails at module-load time with
+``No module named 'X'`` — but only AFTER the image has been built,
+pushed to ECR, the new version published, and the alias swapped. The
+canary correctly rolls back so production is unaffected, but the latent
+break blocks ANY new code from ever reaching ``live``.
+
+This exact failure mode bit production:
+
+  - 2026-05-16 (PR #254 per-collector value-range validation): added
+    top-level ``from validators.price_validator import ...`` to
+    ``collectors/alternative.py`` + ``collectors/fundamentals.py`` but
+    did not add ``COPY validators/`` to the Dockerfile. CI rolled back
+    every push for 10 consecutive deploys (5/18-18:20Z through
+    5/20-00:25Z) until the Wave-3 PR3-wave-2 deploy (#273) surfaced
+    the gap to the operator.
+
+This test scans every Lambda-deployed module's top-level imports for
+``from X import ...`` / ``import X`` where ``X`` resolves to a local
+package directory under the repo root. Every such ``X`` must appear in
+the Dockerfile's COPY directives.
+Future PRs that introduce a new local-package import without the
+matching Dockerfile COPY fail this test in CI, not in the post-merge
+canary.
+"""
+
+from __future__ import annotations
+
+import ast
+import re
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+_DOCKERFILE = _REPO_ROOT / "Dockerfile"
+
+# Modules / dirs that DON'T need to be in the image because the deploy
+# isn't Lambda-bound, or they're stdlib / third-party (caught by
+# requirements.txt). Keep this list tight — additions here are an
+# escape hatch and should be justified inline.
+#
+# Do NOT list a package the Lambda code imports at module scope (e.g.
+# ``validators``): an entry here makes the scan below skip it, so a
+# dropped ``COPY`` would go undetected until the canary.
+_NON_LAMBDA_PACKAGES = frozenset({
+    "tests",  # not deployed
+    "builders",  # currently NOT deployed; Lambda Phase-2 path doesn't
+    # reach builders code. If a Phase-2 import path ever needs builders
+    # (e.g. via collectors → weekly_collector → builders), add the
+    # COPY here AND remove from this allowlist.
+    "infrastructure",  # deploy scripts, never run in Lambda
+    "rag",  # RAG ingestion is an EC2 spot stage, not a Lambda
+    "features",  # spot-only feature compute
+})
+
+# The actual files copied into the Lambda image. Mirror the Dockerfile
+# COPY directives' top-level entries.
+_LAMBDA_DEPLOYED_FILES = (
+    "lambda/handler.py",
+    "weekly_collector.py",
+    "polygon_client.py",
+)
+_LAMBDA_DEPLOYED_DIRS = (
+    "collectors",
+    "store",
+    "validators",
+)
+
+
+def _local_packages() -> set[str]:
+    """Set of top-level directory names that contain ``__init__.py`` at
+    the repo root — i.e. the local packages a deployed module might
+    legitimately import from."""
+    return {
+        p.name for p in _REPO_ROOT.iterdir()
+        if p.is_dir() and (p / "__init__.py").exists() and not p.name.startswith(".")
+    }
+
+
+def _toplevel_imports(py_file: Path) -> set[str]:
+    """Parse ``py_file`` with ast and return the set of top-level
+    package names referenced by ``import X`` / ``from X.Y import ...``
+    at MODULE scope (not inside functions/classes). Relative imports
+    (``from . import x``) are skipped.
+    """
+    # Explicit encoding: Python source is UTF-8 by default, and the
+    # locale default used by a bare ``read_text()`` may not be.
+    tree = ast.parse(py_file.read_text(encoding="utf-8"), filename=str(py_file))
+    out: set[str] = set()
+    for node in tree.body:  # module-scope only — deferred imports are fine
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                root = alias.name.split(".")[0]
+                out.add(root)
+        elif isinstance(node, ast.ImportFrom):
+            if node.level == 0 and node.module:
+                root = node.module.split(".")[0]
+                out.add(root)
+    return out
+
+
+def _walk_python_files(paths: tuple[str, ...]) -> list[Path]:
+    """Expand the deployed-paths tuple into actual ``.py`` Paths."""
+    out: list[Path] = []
+    for p in paths:
+        full = _REPO_ROOT / p
+        if full.is_file() and full.suffix == ".py":
+            out.append(full)
+        elif full.is_dir():
+            out.extend(full.rglob("*.py"))
+    return out
+
+
+def _dockerfile_copied_dirs() -> set[str]:
+    """Parse the Dockerfile and return the set of directory names
+    explicitly COPY'd into ``${LAMBDA_TASK_ROOT}``."""
+    # Explicit encoding: the Dockerfile comments contain non-ASCII
+    # characters, so do not depend on the locale's default encoding.
+    text = _DOCKERFILE.read_text(encoding="utf-8")
+    out: set[str] = set()
+    # Match: ``COPY X/ ${LAMBDA_TASK_ROOT}/X/`` (the trailing
+    # slash on the source convention denotes a directory copy).
+    for m in re.finditer(
+        r"^COPY\s+([A-Za-z_][A-Za-z0-9_]*)/\s+\${LAMBDA_TASK_ROOT}/",
+        text,
+        flags=re.MULTILINE,
+    ):
+        out.add(m.group(1))
+    return out
+
+
+def test_dockerfile_copies_validators_for_collectors_imports():
+    """``collectors/alternative.py`` + ``collectors/fundamentals.py``
+    have top-level imports from ``validators.price_validator`` since
+    PR #254. The Dockerfile MUST COPY ``validators/`` so the canary
+    can resolve those imports at Lambda load.
+    """
+    deployed = _dockerfile_copied_dirs()
+    assert "validators" in deployed, (
+        "Dockerfile does not COPY ``validators/``. "
+        "``collectors/alternative.py`` + ``collectors/fundamentals.py`` "
+        "have top-level ``from validators.price_validator import ...`` "
+        "since PR #254. Without this COPY the canary fails with "
+        "``No module named 'validators'`` and rolls back to the prior "
+        "version — every push since 2026-05-18. Add "
+        "``COPY validators/ ${LAMBDA_TASK_ROOT}/validators/`` to the "
+        "Dockerfile next to the other application-code COPY lines."
+    )
+
+
+def test_every_toplevel_local_import_in_lambda_code_is_dockerfile_copied():
+    """Scan every deployed Python file's MODULE-SCOPE imports. Any
+    ``from X import ...`` / ``import X`` where ``X`` is a
+    local directory under the repo root MUST be in the Dockerfile's
+    COPY list (or in the explicitly-non-deployed allowlist).
+
+    Catches the 2026-05-18 regression class — top-level import added
+    to a deployed module without the matching Dockerfile COPY — at PR
+    time, not in the post-merge canary rollback.
+    """
+    local_pkgs = _local_packages()
+    deployed_dirs = _dockerfile_copied_dirs()
+    deployed_files = _walk_python_files(_LAMBDA_DEPLOYED_FILES) + \
+        _walk_python_files(_LAMBDA_DEPLOYED_DIRS)
+
+    missing: dict[str, list[str]] = {}
+    for py in deployed_files:
+        for imp in _toplevel_imports(py):
+            if imp not in local_pkgs:
+                continue
+            if imp in _NON_LAMBDA_PACKAGES:
+                continue
+            if imp in deployed_dirs:
+                continue
+            missing.setdefault(imp, []).append(
+                str(py.relative_to(_REPO_ROOT))
+            )
+
+    assert not missing, (
+        "Deployed Lambda code has top-level imports of local packages "
+        "that the Dockerfile does NOT COPY. The canary will fail at "
+        "load time with ``No module named 'X'``.\n\nMissing:\n"
+        + "\n".join(
+            f" - {pkg}/ (imported by: {', '.join(sorted(set(files)))})"
+            for pkg, files in sorted(missing.items())
+        )
+        + "\n\nEither add ``COPY X/ ${LAMBDA_TASK_ROOT}/X/`` to "
+        "the Dockerfile, or — if the import is intentionally deferred "
+        "and never reached in the Lambda path — move it inside the "
+        "function that needs it so it isn't a module-scope import."
+    )