From 1d9933751520a6e6deff6aa9de414600f9632c24 Mon Sep 17 00:00:00 2001
From: Alessandro Molina
Date: Fri, 19 Dec 2025 15:56:15 +0100
Subject: [PATCH] initial poc

---
 src/scandeps.py                      | 159 +++++++++++++++++++++++++++
 tests/fixtures/sample_notebook.ipynb |  42 +++++++
 tests/fixtures/sample_report.qmd     |  19 ++++
 tests/test_scandeps.py               | 149 +++++++++++++++++++++++++
 4 files changed, 369 insertions(+)
 create mode 100644 src/scandeps.py
 create mode 100644 tests/fixtures/sample_notebook.ipynb
 create mode 100644 tests/fixtures/sample_report.qmd
 create mode 100644 tests/test_scandeps.py

diff --git a/src/scandeps.py b/src/scandeps.py
new file mode 100644
index 0000000..c98d5aa
--- /dev/null
+++ b/src/scandeps.py
@@ -0,0 +1,159 @@
+"""Discover the third-party packages imported by a project.
+
+Python files, Jupyter notebooks (``.ipynb``) and Quarto documents (``.qmd``)
+are scanned for ``import`` / ``from ... import`` statements.  Matching is
+regex based on purpose: notebook cells and indented snippets are frequently
+not valid stand-alone Python, so a real parser would reject them.  The price
+is that an import-looking line inside a string literal is counted as well.
+"""
+
+import json
+import os
+import re
+import sys
+import warnings
+from pathlib import Path
+from typing import Iterable, Set
+
+
+# A dotted module name, optionally followed by "as <alias>".
+_MODULE = r"[A-Za-z_][\w\.]*(?:[ \t]+as[ \t]+[A-Za-z_]\w*)?"
+IMPORT_RE = re.compile(
+    rf"^\s*import\s+({_MODULE}(?:\s*,\s*{_MODULE})*)",
+    re.MULTILINE,
+)
+FROM_RE = re.compile(r"^\s*from\s+([A-Za-z_][\w\.]*)\s+import", re.MULTILINE)
+STD_LIB = (
+    set(getattr(sys, "stdlib_module_names", []))
+    | set(sys.builtin_module_names)
+    | {"__future__", "__main__"}
+)
+# Directories whose contents are never part of the project's own sources.
+# Hidden directories (".git", ".venv", ".ipynb_checkpoints", ...) are skipped
+# separately by name prefix.
+_SKIP_DIRS = frozenset({"__pycache__", "node_modules", "site-packages", "venv"})
+
+
+def _base_package(module: str) -> str:
+    """Return the top-level package of a dotted module path."""
+    return module.split(".", 1)[0].strip()
+
+
+def scan_code(code: str) -> Set[str]:
+    """Return the non-stdlib top-level packages imported by *code*.
+
+    Handles ``import a.b, c as d`` and ``from a.b import c``, indented or not.
+    Relative imports (``from . import x``) never match and are ignored.
+
+    Raises:
+        RuntimeError: on Python < 3.10, where ``sys.stdlib_module_names`` is
+            missing and standard-library imports could not be filtered out.
+    """
+    if sys.version_info < (3, 10):
+        raise RuntimeError("Python 3.10+ required for stdlib_module_names")
+    pkgs: Set[str] = set()
+    for match in IMPORT_RE.finditer(code):
+        for clause in match.group(1).split(","):
+            # "numpy as np" -> "numpy": keep the module, drop the alias.
+            tokens = clause.split()
+            if tokens:
+                pkgs.add(_base_package(tokens[0]))
+    for match in FROM_RE.finditer(code):
+        pkgs.add(_base_package(match.group(1)))
+    return {pkg for pkg in pkgs if pkg and pkg not in STD_LIB}
+
+
+def read_notebook(path: str) -> str:
+    """Return the source of every code cell in the notebook at *path*.
+
+    A cell's ``source`` may be a single string or a list of lines; both forms
+    are accepted.  Pieces are joined with newlines, so blank lines may appear
+    between them.
+    """
+    with open(path, "r", encoding="utf-8") as handle:
+        notebook = json.load(handle)
+    cells = notebook.get("cells") or []
+    lines: list[str] = []
+    for cell in cells:
+        if cell.get("cell_type") != "code":
+            continue
+        source = cell.get("source", [])
+        if isinstance(source, str):
+            lines.append(source)
+        else:
+            lines.extend(str(line) for line in source)
+    return "\n".join(lines)
+
+
+def read_qmd(path: str) -> str:
+    """Return the code inside the ``{python}`` fenced blocks of a Quarto file.
+
+    Prose and blocks in other languages are dropped.  An empty line is emitted
+    after every closed block so consecutive blocks stay separated.
+    """
+    code: list[str] = []
+    in_block = False
+    # splitlines() already strips "\r\n" / "\r" terminators.
+    for line in Path(path).read_text(encoding="utf-8").splitlines():
+        if in_block:
+            if line.startswith("```"):
+                in_block = False
+                code.append("")
+            else:
+                code.append(line)
+        elif line.startswith("```{python"):
+            in_block = True
+    return "\n".join(code)
+
+
+def _local_package_exists(dirpath: str, name: str) -> bool:
+    """Tell whether *dirpath* holds a ``name/`` directory or ``name.py`` file."""
+    return os.path.isdir(os.path.join(dirpath, name)) or os.path.isfile(
+        os.path.join(dirpath, f"{name}.py")
+    )
+
+
+def _read_source(path: str, ext: str) -> str:
+    """Return the Python code held in *path*, or ``""`` for other file types."""
+    if ext == ".py":
+        return Path(path).read_text(encoding="utf-8")
+    if ext == ".ipynb":
+        return read_notebook(path)
+    if ext == ".qmd":
+        return read_qmd(path)
+    return ""
+
+
+def scan_project(root: str) -> list[str]:
+    """Return the sorted third-party packages imported anywhere under *root*.
+
+    Names that resolve to a module or package next to the importing file, or
+    directly under *root*, are treated as local code and left out.  Files that
+    cannot be read or parsed are skipped with a warning instead of aborting
+    the whole scan.
+    """
+    imports: Set[str] = set()
+    for dirpath, dirnames, files in os.walk(root):
+        # Prune in place so os.walk never descends into VCS metadata,
+        # virtualenvs or caches; their imports are not the project's own.
+        dirnames[:] = [
+            d for d in dirnames if not d.startswith(".") and d not in _SKIP_DIRS
+        ]
+        for filename in files:
+            ext = os.path.splitext(filename)[1].lower()
+            path = os.path.join(dirpath, filename)
+            try:
+                code = _read_source(path, ext)
+            except (OSError, ValueError) as exc:
+                # UnicodeDecodeError and json.JSONDecodeError are ValueErrors.
+                warnings.warn(f"skipping unreadable file {path}: {exc}")
+                continue
+            if not code:
+                continue
+            imports.update(
+                name
+                for name in scan_code(code)
+                if not _local_package_exists(dirpath, name)
+                and not _local_package_exists(root, name)
+            )
+    return sorted(imports)
diff --git a/tests/fixtures/sample_notebook.ipynb b/tests/fixtures/sample_notebook.ipynb
new file mode 100644
index 0000000..6b38be0
--- /dev/null
+++ b/tests/fixtures/sample_notebook.ipynb
@@ -0,0 +1,42 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sample Notebook",
+    "This cell is ignored"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas\n",
+    "import numpy as np\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": "from sklearn.model_selection import train_test_split\nprint('done')"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/fixtures/sample_report.qmd b/tests/fixtures/sample_report.qmd
new file mode 100644
index 0000000..3932567
--- /dev/null
+++ b/tests/fixtures/sample_report.qmd
@@ -0,0 +1,19 @@
+---
+title: Sample Report
+---
+
+Some prose before code.
+
+```{python}
+import pandas as pd
+print(pd.__version__)
+```
+
+```{r}
+# not python, should be ignored
+print("r block")
+```
+
+```{python}
+from sklearn import metrics
+```
diff --git a/tests/test_scandeps.py b/tests/test_scandeps.py
new file mode 100644
index 0000000..5609c60
--- /dev/null
+++ b/tests/test_scandeps.py
@@ -0,0 +1,149 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from scandeps import read_notebook, read_qmd, scan_code, scan_project
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def test_scan_code_parses_various_imports():
+    assert scan_code(
+        """
+import pandas.core.frame, requests
+    from sklearn.model_selection import train_test_split
+\tfrom torch import (
+        nn,
+        optim,
+    )
+    import typing  # stdlib should be dropped
+    """
+    ) == {"pandas", "requests", "sklearn", "torch"}
+
+
+def test_scan_code_filters_stdlib_and_builtins():
+    assert (
+        scan_code(
+            "import sys, json\nfrom builtins import open\nfrom __future__ import annotations\n"
+        )
+        == set()
+    )
+
+
+def test_scan_code_requires_python_310(monkeypatch):
+    monkeypatch.setattr("scandeps.sys.version_info", (3, 9, 9))
+    with pytest.raises(RuntimeError):
+        scan_code("import requests")
+
+
+def test_read_notebook_collects_only_code_cells():
+    lines = [
+        line
+        for line in read_notebook(str(FIXTURES / "sample_notebook.ipynb")).splitlines()
+        if line
+    ]
+    assert lines == [
+        "import pandas",
+        "import numpy as np",
+        "from sklearn.model_selection import train_test_split",
+        "print('done')",
+    ]
+
+
+def test_read_qmd_extracts_python_blocks_only():
+    assert read_qmd(str(FIXTURES / "sample_report.qmd")).splitlines() == [
+        "import pandas as pd",
+        "print(pd.__version__)",
+        "",
+        "from sklearn import metrics",
+    ]
+
+
+def test_scan_project_collects_imports_across_formats(tmp_path):
+    project = tmp_path / "proj"
+    project.mkdir()
+    (project / "localpkg").mkdir()
+    (project / "localpkg" / "__init__.py").write_text("")
+    (project / "localmod.py").write_text("# local module\n")
+
+    (project / "app.py").write_text(
+        """
+import requests, typing
+from pandas.core.frame import DataFrame
+from torch import nn
+import localpkg
+from localmod import value
+"""
+    )
+
+    notebook_dir = project / "notebooks"
+    notebook_dir.mkdir()
+    (notebook_dir / "helpers.py").write_text("# helper\n")
+    with (notebook_dir / "analysis.ipynb").open("w", encoding="utf-8") as handle:
+        json.dump(
+            {
+                "cells": [
+                    {
+                        "cell_type": "code",
+                        "source": [
+                            "import matplotlib.pyplot\n",
+                            "from scipy import stats\n",
+                            "import helpers\n",
+                        ],
+                    }
+                ],
+                "metadata": {},
+                "nbformat": 4,
+                "nbformat_minor": 5,
+            },
+            handle,
+        )
+
+    docs = project / "docs"
+    docs.mkdir()
+    (docs / "report.qmd").write_text(
+        """\
+```{python}
+import seaborn
+from sklearn.linear_model import LogisticRegression
+```
+"""
+    )
+
+    assert scan_project(str(project)) == [
+        "matplotlib",
+        "pandas",
+        "requests",
+        "scipy",
+        "seaborn",
+        "sklearn",
+        "torch",
+    ]
+
+
+def test_scan_project_dedupes_and_orders(tmp_path):
+    project = tmp_path / "proj2"
+    project.mkdir()
+    (project / "module.py").write_text("import requests\nimport requests\nfrom pandas import DataFrame\n")
+    (project / "another.py").write_text("from pandas.io import json\nimport requests\n")
+    assert scan_project(str(project)) == ["pandas", "requests"]
+
+
+def test_scan_code_keeps_modules_after_an_alias():
+    assert scan_code("import numpy as np, requests as rq, torch\n") == {
+        "numpy",
+        "requests",
+        "torch",
+    }
+
+
+def test_scan_project_skips_hidden_dirs_and_root_level_locals(tmp_path):
+    project = tmp_path / "proj3"
+    (project / ".venv").mkdir(parents=True)
+    (project / ".venv" / "vendored.py").write_text("import flask\n")
+    (project / "localpkg").mkdir()
+    (project / "scripts").mkdir()
+    (project / "scripts" / "run.py").write_text("import localpkg\nimport requests\n")
+    assert scan_project(str(project)) == ["requests"]