From 1d9933751520a6e6deff6aa9de414600f9632c24 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro.molina@posit.co>
Date: Fri, 19 Dec 2025 15:56:15 +0100
Subject: [PATCH] initial poc

---
 src/scandeps.py                      |  93 +++++++++++++++++++
 tests/fixtures/sample_notebook.ipynb |  42 +++++++++
 tests/fixtures/sample_report.qmd     |  19 ++++
 tests/test_scandeps.py               | 131 +++++++++++++++++++++++++++
 4 files changed, 285 insertions(+)
 create mode 100644 src/scandeps.py
 create mode 100644 tests/fixtures/sample_notebook.ipynb
 create mode 100644 tests/fixtures/sample_report.qmd
 create mode 100644 tests/test_scandeps.py

diff --git a/src/scandeps.py b/src/scandeps.py
new file mode 100644
index 0000000..c98d5aa
--- /dev/null
+++ b/src/scandeps.py
@@ -0,0 +1,93 @@
+import json
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Iterable, Set
+
+
+# ``import a.b, c`` at the start of a line (indentation allowed); group 1 is the module list.
+IMPORT_RE = re.compile(
+    r"^\s*import\s+([A-Za-z_][\w\.]*\s*(?:,\s*[A-Za-z_][\w\.]*)*)", re.MULTILINE
+)
+# ``from a.b import ...`` at the start of a line; group 1 is the dotted module path.
+FROM_RE = re.compile(r"^\s*from\s+([A-Za-z_][\w\.]*)\s+import", re.MULTILINE)
+# Standard-library and built-in module names plus two pseudo-modules.
+STD_LIB = set(getattr(sys, "stdlib_module_names", [])).union(
+    sys.builtin_module_names, {"__future__", "__main__"}
+)
+
+
+def _base_package(module: str) -> str:  # "a.b.c" -> "a", surrounding whitespace removed
+    return module.partition(".")[0].strip()
+
+
+def scan_code(code: str) -> Set[str]:
+    """Return non-stdlib top-level packages imported in *code* (RuntimeError on Python < 3.10)."""
+    if sys.version_info < (3, 10):
+        raise RuntimeError("Python 3.10+ required for stdlib_module_names")
+    pkgs = {_base_package(match.group(1)) for match in FROM_RE.finditer(code)}
+    rest_of_statement = re.compile(r"[^\n#;]*")  # text up to end of line, comment or ";"
+    for match in IMPORT_RE.finditer(code):
+        # IMPORT_RE stops at the first ``as`` alias; re-read so ``import a as b, c`` yields ``c``.
+        clause = rest_of_statement.match(code, match.start(1)).group()
+        pkgs.update(_base_package((item.split() or [""])[0]) for item in clause.split(","))
+    return {pkg for pkg in pkgs if pkg.isidentifier() and pkg not in STD_LIB}
+
+
+def read_notebook(path: str) -> str:
+    """Return the source of every code cell in a Jupyter notebook.
+
+    The file at *path* is read as UTF-8 JSON; cells whose ``cell_type`` is not
+    ``"code"`` are skipped and the collected pieces are joined with newlines.
+    """
+    notebook = json.loads(Path(path).read_text(encoding="utf-8"))
+    chunks: list[str] = []
+    for cell in notebook.get("cells") or []:
+        if cell.get("cell_type") == "code":
+            source = cell.get("source", [])
+            # ``source`` may be a single string or a list of line strings.
+            chunks += [source] if isinstance(source, str) else [str(s) for s in source]
+    return "\n".join(chunks)
+
+
+def read_qmd(path: str) -> str:
+    """Return the code inside every ```{python} cell of the Quarto file at *path*.
+
+    Fences may be indented; an empty line is emitted after each cell.
+    """
+    code: list[str] = []
+    in_block = False
+    for line in Path(path).read_text(encoding="utf-8").splitlines():
+        if in_block:
+            in_block = not line.lstrip().startswith("```")
+            code.append(line if in_block else "")
+        elif line.lstrip().startswith("```{python"):
+            in_block = True
+    return "\n".join(code)
+
+
+def _local_package_exists(dirpath: str, name: str) -> bool:
+    """Tell whether *dirpath* holds a directory *name* or a file *name*.py."""
+    package_dir = Path(dirpath, name)
+    return package_dir.is_dir() or Path(dirpath, f"{name}.py").is_file()
+
+
+def scan_project(root: str) -> list[str]:
+    """Return the sorted non-stdlib, non-local packages imported under *root*.
+
+    Scans .py, .ipynb and .qmd files. Hidden directories, virtualenvs and caches
+    are not descended into: they hold installed or generated code.
+    """
+    readers = {".py": lambda p: Path(p).read_text(encoding="utf-8"), ".ipynb": read_notebook, ".qmd": read_qmd}
+    skipped = {"venv", "node_modules", "site-packages", "__pycache__"}
+    imports: Set[str] = set()
+    for dirpath, dirnames, files in os.walk(root):
+        # Prune in place: os.walk only descends into what remains in dirnames.
+        dirnames[:] = [d for d in dirnames if d not in skipped and not d.startswith(".")]
+        for filename in files:
+            reader = readers.get(os.path.splitext(filename)[1].lower())
+            code = reader(os.path.join(dirpath, filename)) if reader else ""
+            if code:
+                imports.update(n for n in scan_code(code) if not _local_package_exists(dirpath, n))
+    return sorted(imports)
diff --git a/tests/fixtures/sample_notebook.ipynb b/tests/fixtures/sample_notebook.ipynb
new file mode 100644
index 0000000..6b38be0
--- /dev/null
+++ b/tests/fixtures/sample_notebook.ipynb
@@ -0,0 +1,42 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sample Notebook",
+    "This cell is ignored"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas\n",
+    "import numpy as np\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": "from sklearn.model_selection import train_test_split\nprint('done')"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/fixtures/sample_report.qmd b/tests/fixtures/sample_report.qmd
new file mode 100644
index 0000000..3932567
--- /dev/null
+++ b/tests/fixtures/sample_report.qmd
@@ -0,0 +1,19 @@
+---
+title: Sample Report
+---
+
+Some prose before code.
+
+```{python}
+import pandas as pd
+print(pd.__version__)
+```
+
+```{r}
+# not python, should be ignored
+print("r block")
+```
+
+```{python}
+from sklearn import metrics
+```
diff --git a/tests/test_scandeps.py b/tests/test_scandeps.py
new file mode 100644
index 0000000..5609c60
--- /dev/null
+++ b/tests/test_scandeps.py
@@ -0,0 +1,131 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from scandeps import read_notebook, read_qmd, scan_code, scan_project
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def test_scan_code_parses_various_imports():
+    # Mixed indentation (spaces, a tab) and a parenthesised from-import.
+    snippet = """
+import pandas.core.frame, requests
+    from sklearn.model_selection import train_test_split
+\tfrom torch import (
+        nn,
+        optim,
+    )
+    import typing  # stdlib should be dropped
+    """
+    assert scan_code(snippet) == {"pandas", "requests", "sklearn", "torch"}
+
+
+def test_scan_code_filters_stdlib_and_builtins():
+    """Stdlib modules, ``builtins`` and ``__future__`` are never reported."""
+    stdlib_only = (
+        "import sys, json\nfrom builtins import open\nfrom __future__ import annotations\n"
+    )
+    found = scan_code(stdlib_only)
+    assert found == set()
+
+
+def test_scan_code_requires_python_310(monkeypatch):  # scan_code must refuse to run below 3.10
+    monkeypatch.setattr("scandeps.sys.version_info", (3, 9, 9))  # pretend to be Python 3.9.9
+    with pytest.raises(RuntimeError):
+        scan_code("import requests")  # input is valid; only the version check can fail
+
+
+def test_read_notebook_collects_only_code_cells():
+    """Markdown cells are dropped; list- and string-form sources both count."""
+    text = read_notebook(str(FIXTURES / "sample_notebook.ipynb"))
+    # Empty lines carry no information here, so compare only the non-empty ones.
+    non_empty = list(filter(None, text.splitlines()))
+    expected = [
+        "import pandas",
+        "import numpy as np",
+        "from sklearn.model_selection import train_test_split",
+        "print('done')",
+    ]
+    assert non_empty == expected
+
+
+def test_read_qmd_extracts_python_blocks_only():
+    extracted = read_qmd(str(FIXTURES / "sample_report.qmd"))
+    # The {r} block must not appear; "" is the separator between the two python cells.
+    first_cell = ["import pandas as pd", "print(pd.__version__)"]
+    second_cell = ["from sklearn import metrics"]
+    expected = first_cell + [""] + second_cell
+    assert extracted.splitlines() == expected
+
+
+def test_scan_project_collects_imports_across_formats(tmp_path):
+    """Scan a project holding one script, one notebook and one Quarto report.
+
+    The script and the notebook also import modules that sit next to them;
+    those local names and the stdlib ``typing`` must not be reported.
+    """
+    project = tmp_path / "proj"
+    notebook_dir = project / "notebooks"
+    docs = project / "docs"
+    for folder in (project, project / "localpkg", notebook_dir, docs):
+        folder.mkdir()
+    (project / "localpkg" / "__init__.py").write_text("")
+    (project / "localmod.py").write_text("# local module\n")
+    (notebook_dir / "helpers.py").write_text("# helper\n")
+
+    # Script at the project root, beside localpkg/ and localmod.py.
+    app_source = """
+import requests, typing
+from pandas.core.frame import DataFrame
+from torch import nn
+import localpkg
+from localmod import value
+"""
+    (project / "app.py").write_text(app_source)
+
+    # Notebook in its own directory, beside helpers.py.
+    code_cell = {
+        "cell_type": "code",
+        "source": [
+            "import matplotlib.pyplot\n",
+            "from scipy import stats\n",
+            "import helpers\n",
+        ],
+    }
+    notebook = {
+        "cells": [code_cell],
+        "metadata": {},
+        "nbformat": 4,
+        "nbformat_minor": 5,
+    }
+    (notebook_dir / "analysis.ipynb").write_text(json.dumps(notebook), encoding="utf-8")
+
+    report_source = """\
+```{python}
+import seaborn
+from sklearn.linear_model import LogisticRegression
+```
+"""
+    (docs / "report.qmd").write_text(report_source)
+
+    expected = [
+        "matplotlib",
+        "pandas",
+        "requests",
+        "scipy",
+        "seaborn",
+        "sklearn",
+        "torch",
+    ]
+    assert scan_project(str(project)) == expected
+
+
+def test_scan_project_dedupes_and_orders(tmp_path):  # repeated imports collapse into one entry
+    project = tmp_path / "proj2"
+    project.mkdir()
+    (project / "module.py").write_text("import requests\nimport requests\nfrom pandas import DataFrame\n")
+    (project / "another.py").write_text("from pandas.io import json\nimport requests\n")
+    assert scan_project(str(project)) == ["pandas", "requests"]  # each package once, sorted