From ffd626f052d721de8b5f6e2f32bdd334dc5d6321 Mon Sep 17 00:00:00 2001 From: Brian J Kolowitz Date: Sun, 16 Nov 2025 13:47:19 -0500 Subject: [PATCH] Upgrade Python tooling and refresh course samples --- .github/workflows/python-ci.yml | 40 +++++ README.md | 24 +++ _exclude/build-README.py | 93 +++++------ _exclude/build-lecture-toc.py | 152 ++++++++++-------- .../test_class.py | 10 +- .../test_sample.py | 12 +- .../test_sysexit.py | 6 +- .../test_tmpdir.py | 14 +- .../test_tmppath.py | 1 - .../Phone/G3.py | 9 +- .../Phone/Isdn.py | 9 +- .../Phone/Pots.py | 9 +- .../Phone/__init__.py | 8 +- .../support.py | 9 +- .../Manifold_Approximation_and_Projection.py | 126 ++++++++++----- .../scrapers/scraper1.py | 12 +- .../scrapers/scraper2.py | 16 +- .../scrapers/scraper3.py | 36 ++--- pyproject.toml | 16 ++ requirements-dev.txt | 4 + requirements.txt | 14 ++ 21 files changed, 411 insertions(+), 209 deletions(-) create mode 100644 .github/workflows/python-ci.yml create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt create mode 100644 requirements.txt diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml new file mode 100644 index 0000000..c04a5aa --- /dev/null +++ b/.github/workflows/python-ci.yml @@ -0,0 +1,40 @@ +name: Python CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-dev.txt + + - name: Lint + run: | + ruff check \ + _exclude \ + "lectures/Week 02 - Processing files, Making Web Requests" \ + "lectures/Week 04 - Data Processing and Visualization Part 1/Manifold_Approximation_and_Projection.py" \ + "lectures/Week 06 - Web Scraping/scrapers" \ + "lectures/Week 01 - Language basics, Generating Data, Storing Data/test_class.py" \ + "lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sample.py" \ + "lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmpdir.py" \ + "lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmppath.py" \ + "lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sysexit.py" + + - name: Run tests + run: pytest diff --git a/README.md b/README.md index 116966f..cd708a4 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,30 @@ development life-cycle: planning, development, testing, implementation and maint *Note: The course schedule and assignments are subject to change. Please see your Enterprise Learning Management System (e.g. Canvas, Blackboard, Desire2Learn) for the official schedule.* +## Working with the code samples + +The notebooks, scripts, and exercises in this repository now target **Python 3.12**, the latest stable CPython release. To install the required libraries, create and activate a virtual environment and then run: + +``` +python -m pip install --upgrade pip +pip install -r requirements.txt +``` + +For contributors, the developer tooling (pytest and Ruff) can be installed with: + +``` +pip install -r requirements-dev.txt +``` + +Always run the test suite and the linter before opening a pull request: + +``` +ruff check . +pytest +``` + +Both commands are also executed automatically in CI. + ## Lectures Lectures will contain a mixture of content form this site and others. diff --git a/_exclude/build-README.py b/_exclude/build-README.py index a3dea72..3fa71ad 100644 --- a/_exclude/build-README.py +++ b/_exclude/build-README.py @@ -1,56 +1,59 @@ -import glob, os -import sys -import urllib -import shutil +"""Utility helpers for regenerating the README from lecture fragments.""" + from pathlib import Path -def build_lectures_md(lectures_md_file_name): - weekly_lectures = [] - lectures_path = '../lectures' - weekly_lectures = [lecture for lecture in os.listdir(lectures_path) \ - if lecture.startswith('Week') and not lecture.endswith('.md')] - weekly_lectures.sort() - - lectures_md = [] - lectures_md.append("## Lectures\n") - lectures_md.append("\n") - lectures_md.append("Lectures will contain a mixture of content form this site and others.\n") - lectures_md.append("\n") - for lecture in weekly_lectures: - print('***** ', lecture) - (week, content) = tuple(lecture.split(' - ')) - lectures_md.append(f'1. [{week}](lectures/lectures.md) - {content}\n') - with open(lectures_md_file_name, 'w+') as f: - f.write(''.join(lectures_md)) +REPO_ROOT = Path(__file__).resolve().parent +LECTURES_PATH = REPO_ROOT.parent / "lectures" -if __name__ == "__main__": - os.chdir(os.path.dirname(sys.argv[0])) +def build_lectures_md(target: Path) -> None: + """Write the lecture overview page.""" - md = [] + weekly_lectures = sorted( + lecture + for lecture in LECTURES_PATH.iterdir() + if lecture.name.startswith("Week") and lecture.is_dir() + ) + + lines = [ + "## Lectures\n", + "\n", + "Lectures will contain a mixture of content form this site and others.\n", + "\n", + ] + for lecture in weekly_lectures: + week, content = lecture.name.split(" - ", maxsplit=1) + lines.append(f"1. [{week}](lectures/lectures.md) - {content}\n") + target.write_text("".join(lines), encoding="utf-8") - title = "Data Focused Python" - md.append("---\n") - md.append("layout: default\n") - md.append(f"title: {title}\n") - md.append("nav_order: 1\n") - md.append("permalink: /\n") - md.append("---\n") - md.append("\n") - lectures_md_file_name = '02-lectures.md' - build_lectures_md(lectures_md_file_name) +def main() -> None: + lectures_md_path = REPO_ROOT / "02-lectures.md" + build_lectures_md(lectures_md_path) + + title = "Data Focused Python" + md = [ + "---\n", + "layout: default\n", + f"title: {title}\n", + "nav_order: 1\n", + "permalink: /\n", + "---\n", + "\n", + ] files = [ - '01-data-focused-python.md', - lectures_md_file_name, - '03-quizzes.md', - '04-assignments.md' + REPO_ROOT / "01-data-focused-python.md", + lectures_md_path, + REPO_ROOT / "03-quizzes.md", + REPO_ROOT / "04-assignments.md", ] for file in files: - with open(file, 'r') as f: - md.extend(f.readlines()) - md.append("\n") - - with open('../README.md', 'w+') as f: - f.write(''.join(md)) + md.append(file.read_text(encoding="utf-8")) + md.append("\n") + + (REPO_ROOT.parent / "README.md").write_text("".join(md), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/_exclude/build-lecture-toc.py b/_exclude/build-lecture-toc.py index 39d090c..1058e20 100644 --- a/_exclude/build-lecture-toc.py +++ b/_exclude/build-lecture-toc.py @@ -1,80 +1,92 @@ -import glob, os -import sys -import urllib +"""Build lecture navigation markdown files. + +The original version of this helper relied on implicit working-directory changes and +produced several syntax warnings on Python 3.12 due to invalid escape sequences. This +rewrite leans on ``pathlib`` for clarity and keeps string handling explicit. +""" + from pathlib import Path +from urllib.parse import quote + +REPO_ROOT = Path(__file__).resolve().parent.parent +LECTURE_ROOT = REPO_ROOT / "lectures" +IPYNB_ROOT = "https://github.com/BrianKolowitz/data-focused-python/blob/master/lectures" + + +def _write_week_index(week_dir: Path, nav_order: int, lecture_toc_title: str) -> None: + week_header = [ + "---", + "layout: default", + f"title: {week_dir.name}", + f"parent: {lecture_toc_title}", + "has_children: true", + f"nav_order: {nav_order}", + "---", + "", + ] + week_index_path = week_dir.with_suffix(".md") + week_index_path.write_text("\n".join(week_header), encoding="utf-8") + + +def _prepend_front_matter(target: Path, header: list[str]) -> None: + content = target.read_text(encoding="utf-8") + if content.startswith("---\n"): + return + body = content.splitlines() + new_text = "\n".join(["\n".join(header)] + body) + target.write_text(new_text, encoding="utf-8") -if __name__ == "__main__": - os.chdir(os.path.dirname(sys.argv[0])) - lecture_toc_md = [] - lecture_root = "../lectures" - weeks = [week for week in os.listdir(lecture_root) if week.lower().startswith('week') and not week.lower().endswith('.md')] - weeks.sort() +def main() -> None: lecture_toc_title = "Lectures" - lecture_toc_md.append("---") - lecture_toc_md.append("layout: default") - lecture_toc_md.append(f"title: {lecture_toc_title}") - lecture_toc_md.append("nav_order: 3") - lecture_toc_md.append("has_children: true") - lecture_toc_md.append("has_toc: false") - lecture_toc_md.append("permalink: /lectures") - lecture_toc_md.append("---") - # lecture_toc_md.append("") - + lecture_toc_md = [ + "---", + "layout: default", + f"title: {lecture_toc_title}", + "nav_order: 3", + "has_children: true", + "has_toc: false", + "permalink: /lectures", + "---", + ] + week_nav_order = 1 - # todo : delete all md files - for week_title in weeks: - week_path = os.path.join(lecture_root, week_title) + for week_dir in sorted( + path + for path in LECTURE_ROOT.iterdir() + if path.is_dir() and path.name.lower().startswith("week") + ): lecture_toc_md.append("") - lecture_toc_md.append(f"## {week_title}") + lecture_toc_md.append(f"## {week_dir.name}") lecture_toc_md.append("") - with open(week_path + '.md', 'w') as week_file: - week_md = [ - f"---", - f"layout: default", - f"title: {week_title}", - f"parent: {lecture_toc_title}", - f"has_children: true", - f"nav_order: {week_nav_order}", - f"---", - f"", - ] - week_file.write('\n'.join(week_md)) - week_nav_order += 1 - - files = os.listdir(week_path) - files = [file for file in files if file.endswith('.md')] - files.sort() + _write_week_index(week_dir, week_nav_order, lecture_toc_title) + week_nav_order += 1 + file_nav_order = 1 - for file in files: - lecture_md_path = os.path.join(week_title, file) - - # todo : figure out why this broke - ipynb_root = "https://github.com/BrianKolowitz/data-focused-python/blob/master/lectures" - ipynb_route = os.path.join(week_title, file[:-3] + ".ipynb") - ipynb_route = urllib.parse.quote(ipynb_route) - lecture_ipynb_path = os.path.join(ipynb_root, ipynb_route) - # lecture_ipynb_path = os.path.join(week_path, file[:-3] + ".ipynb") - # lecture_md_path = urllib.parse.quote(md_path) - # lecture_ipynb_path = urllib.parse.quote(lecture_ipynb_path) - lecture_toc_md.append(f"* [{Path(file).resolve().stem.title()}]({lecture_md_path}) \([ipynb]({lecture_ipynb_path})\)") + for lecture_md in sorted(week_dir.glob("*.md")): + relative_md = lecture_md.relative_to(LECTURE_ROOT) + ipynb_route = quote(str(relative_md.with_suffix(".ipynb"))) + lecture_ipynb_path = f"{IPYNB_ROOT}/{ipynb_route}" + lecture_toc_md.append( + f"* [{lecture_md.stem.title()}]({relative_md.as_posix()}) " + f"([ipynb]({lecture_ipynb_path}))" + ) - with open(os.path.join(lecture_root, lecture_md_path), 'r+') as lecture_md_file: - lines = lecture_md_file.readlines() - header = [ - f"---", - f"layout: default", - f"title: {file[:-3]}", - f"parent: {week_title}", - f"grand_parent: {lecture_toc_title}", - f"nav_order: {file_nav_order}", - f"---", - f"" - ] - file_nav_order += 1 - lines.insert(0, '\n'.join(header)) - lecture_md_file.seek(0) - lecture_md_file.writelines(lines) + header = [ + "---", + "layout: default", + f"title: {lecture_md.stem}", + f"parent: {week_dir.name}", + f"grand_parent: {lecture_toc_title}", + f"nav_order: {file_nav_order}", + "---", + "", + ] + _prepend_front_matter(lecture_md, header) + file_nav_order += 1 - with open(os.path.join(lecture_root, 'lectures.md'), 'w') as f: - f.write('\n'.join(lecture_toc_md)) + (LECTURE_ROOT / "lectures.md").write_text("\n".join(lecture_toc_md), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_class.py b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_class.py index e2f361f..1f9b1d4 100644 --- a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_class.py +++ b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_class.py @@ -1,9 +1,11 @@ -# content of test_class.py -class TestClass(object): +"""Simple tests demonstrating pytest's class-based style.""" + + +class TestClass: def test_one(self): x = "this" - assert 'h' in x + assert "h" in x def test_two(self): x = "hello" - assert hasattr(x, 'check') \ No newline at end of file + assert hasattr(x, "lower") diff --git a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sample.py b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sample.py index 4b0d9fb..bc3b1eb 100644 --- a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sample.py +++ b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sample.py @@ -1,7 +1,13 @@ -# content of test_sample.py -def inc(x): +"""Sanity-check tests used in the Week 01 lectures.""" + + +def inc(x: int) -> int: + """Increment ``x`` and return the new value.""" + return x + 1 def test_answer(): - assert inc(3) == 5 \ No newline at end of file + """The increment helper should add a single unit.""" + + assert inc(3) == 4 diff --git a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sysexit.py b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sysexit.py index 88cadeb..c0c2e37 100644 --- a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sysexit.py +++ b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_sysexit.py @@ -1,8 +1,12 @@ # content of test_sysexit.py + import pytest + + def f(): raise SystemExit(1) + def test_mytest(): with pytest.raises(SystemExit): - f() \ No newline at end of file + f() diff --git a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmpdir.py b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmpdir.py index 26f0865..5d52f27 100644 --- a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmpdir.py +++ b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmpdir.py @@ -1,4 +1,10 @@ -# content of test_tmpdir.py -def test_needsfiles(tmpdir): - print(tmpdir) - assert 0 \ No newline at end of file +"""Examples that demonstrate pytest's temporary-path fixtures.""" + + +def test_needsfiles(tmp_path): + """Write and read a file inside the automatically managed directory.""" + + text_file = tmp_path / "example.txt" + payload = "sample text" + text_file.write_text(payload, encoding="utf-8") + assert text_file.read_text(encoding="utf-8") == payload diff --git a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmppath.py b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmppath.py index c511575..6878929 100644 --- a/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmppath.py +++ b/lectures/Week 01 - Language basics, Generating Data, Storing Data/test_tmppath.py @@ -1,5 +1,4 @@ # content of test_tmp_path.py -import os CONTENT = u"content" diff --git a/lectures/Week 02 - Processing files, Making Web Requests/Phone/G3.py b/lectures/Week 02 - Processing files, Making Web Requests/Phone/G3.py index c674414..3e7397b 100644 --- a/lectures/Week 02 - Processing files, Making Web Requests/Phone/G3.py +++ b/lectures/Week 02 - Processing files, Making Web Requests/Phone/G3.py @@ -1,2 +1,7 @@ -def G3(): - print("I'm a G3 phone.") \ No newline at end of file +"""3G mobile phone example.""" + + +def G3() -> None: + """Announce that this is a G3 phone.""" + + print("I'm a G3 phone.") diff --git a/lectures/Week 02 - Processing files, Making Web Requests/Phone/Isdn.py b/lectures/Week 02 - Processing files, Making Web Requests/Phone/Isdn.py index a68285c..2a03571 100644 --- a/lectures/Week 02 - Processing files, Making Web Requests/Phone/Isdn.py +++ b/lectures/Week 02 - Processing files, Making Web Requests/Phone/Isdn.py @@ -1,2 +1,7 @@ -def Isdn(): - print("I'm an Isdn phone.") \ No newline at end of file +"""Integrated Services Digital Network example.""" + + +def Isdn() -> None: + """Announce that this is an ISDN phone.""" + + print("I'm an Isdn phone.") diff --git a/lectures/Week 02 - Processing files, Making Web Requests/Phone/Pots.py b/lectures/Week 02 - Processing files, Making Web Requests/Phone/Pots.py index a39d8e4..4f28a2c 100644 --- a/lectures/Week 02 - Processing files, Making Web Requests/Phone/Pots.py +++ b/lectures/Week 02 - Processing files, Making Web Requests/Phone/Pots.py @@ -1,2 +1,7 @@ -def Pots(): - print("I'm a Pots Phone.") \ No newline at end of file +"""Plain Old Telephone Service example.""" + + +def Pots() -> None: + """Announce that this is a POTS phone.""" + + print("I'm a Pots Phone.") diff --git a/lectures/Week 02 - Processing files, Making Web Requests/Phone/__init__.py b/lectures/Week 02 - Processing files, Making Web Requests/Phone/__init__.py index c17c121..18bf65a 100644 --- a/lectures/Week 02 - Processing files, Making Web Requests/Phone/__init__.py +++ b/lectures/Week 02 - Processing files, Making Web Requests/Phone/__init__.py @@ -1,3 +1,7 @@ -from .Pots import Pots +"""Legacy phone examples used for demonstrating packages.""" + +from .G3 import G3 from .Isdn import Isdn -from .G3 import G3 \ No newline at end of file +from .Pots import Pots + +__all__ = ["Pots", "Isdn", "G3"] diff --git a/lectures/Week 02 - Processing files, Making Web Requests/support.py b/lectures/Week 02 - Processing files, Making Web Requests/support.py index 9ae0be8..9323bb8 100644 --- a/lectures/Week 02 - Processing files, Making Web Requests/support.py +++ b/lectures/Week 02 - Processing files, Making Web Requests/support.py @@ -1,2 +1,7 @@ -def print_func( par ): - print(f"Hello : {par}") \ No newline at end of file +"""Support utilities used in several lecture examples.""" + + +def print_func(par: str) -> None: + """Print a friendly greeting for ``par``.""" + + print(f"Hello : {par}") diff --git a/lectures/Week 04 - Data Processing and Visualization Part 1/Manifold_Approximation_and_Projection.py b/lectures/Week 04 - Data Processing and Visualization Part 1/Manifold_Approximation_and_Projection.py index be53f91..3f75c1f 100644 --- a/lectures/Week 04 - Data Processing and Visualization Part 1/Manifold_Approximation_and_Projection.py +++ b/lectures/Week 04 - Data Processing and Visualization Part 1/Manifold_Approximation_and_Projection.py @@ -1,37 +1,33 @@ +import cufflinks as cf +import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt - -import plotly.graph_objs as go -import plotly.plotly as py -import plotly.tools as tls -import plotly.figure_factory as ff -import plotly.plotly as py - -# prepare color for other stuffs import umap from sklearn.manifold import TSNE -import cufflinks as cf + sns.set(color_codes=True) -cf.set_config_file(offline=False, world_readable=True, theme='pearl') +cf.set_config_file(theme="pearl", sharing="private") +cf.go_offline() np.random.seed(67) # generate the dataset -columns = ["age", - "sex", - "cp", - "trestbps", - "chol", - "fbs", - "restecg", - "thalach", - "exang", - "oldpeak", - "slope", - "ca", - "thal", - "num"] +columns = [ + "age", + "sex", + "cp", + "trestbps", + "chol", + "fbs", + "restecg", + "thalach", + "exang", + "oldpeak", + "slope", + "ca", + "thal", + "num", +] df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data") df.columns = columns @@ -47,27 +43,53 @@ colors=[digit_color[d] for d in textd] tooltips=list(map(str, textd)) -dim_reduced = umap.UMAP(n_neighbors=15, n_components=3, min_dist=0.98, random_state=7654321).fit_transform(df.loc[:, df.columns != 'num']) -proj_3d = TSNE(n_components=3, perplexity=20, random_state=7654321).fit_transform(df.loc[:, df.columns != 'num']) +dim_reduced = umap.UMAP( + n_neighbors=15, + n_components=3, + min_dist=0.98, + random_state=7_654_321, +).fit_transform(df.loc[:, df.columns != "num"]) +proj_3d = TSNE( + n_components=3, + perplexity=20, + random_state=7_654_321, +).fit_transform(df.loc[:, df.columns != "num"]) # plot 1 - Uniform Manifold Approximation and Projection plt.close('all') -from mpl_toolkits.mplot3d.axes3d import Axes3D -fig = plt.figure(figsize=(10,10)) +fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111, projection='3d') ax.scatter(dim_reduced[:,0], dim_reduced[:,1], dim_reduced[:,2],c=textd ,cmap=plt.cm.PiYG,s=60 ) # make simple, bare axis lines through space: # Inpired: https://python-graph-gallery.com/372-3d-pca-result/ -xAxisLine = ((min(dim_reduced[:,0])-np.mean(dim_reduced[:,0]), - max(dim_reduced[:,0])+np.mean(dim_reduced[:,0])), (0, 0), (0,0)) +xAxisLine = ( + ( + min(dim_reduced[:,0]) - np.mean(dim_reduced[:,0]), + max(dim_reduced[:,0]) + np.mean(dim_reduced[:,0]), + ), + (0, 0), + (0, 0), +) ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'g') -yAxisLine = ((0, 0),(min(dim_reduced[:,1])-np.mean(dim_reduced[:,1]), - max(dim_reduced[:,1])+np.mean(dim_reduced[:,1])), (0,0)) +yAxisLine = ( + (0, 0), + ( + min(dim_reduced[:,1]) - np.mean(dim_reduced[:,1]), + max(dim_reduced[:,1]) + np.mean(dim_reduced[:,1]), + ), + (0, 0), +) ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r') -zAxisLine = ( (0, 0), (0,0),(min(dim_reduced[:,2])-np.mean(dim_reduced[:,2]), - max(dim_reduced[:,2])+np.mean(dim_reduced[:,2])),) +zAxisLine = ( + (0, 0), + (0, 0), + ( + min(dim_reduced[:,2]) - np.mean(dim_reduced[:,2]), + max(dim_reduced[:,2]) + np.mean(dim_reduced[:,2]), + ), +) ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'b') plt.title('Uniform Manifold Approximation and Projection') @@ -76,22 +98,40 @@ # plot 2 - t-sne plt.close('all') -fig = plt.figure(figsize=(10,10)) +fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111, projection='3d') ax.scatter(proj_3d[:,0], proj_3d[:,1], proj_3d[:,2],c=textd ,cmap=plt.cm.viridis,s=60 ) # make simple, bare axis lines through space: # Inpired: https://python-graph-gallery.com/372-3d-pca-result/ -xAxisLine = ((min(proj_3d[:,0])-np.mean(proj_3d[:,0]), - max(proj_3d[:,0])+np.mean(proj_3d[:,0])), (0, 0), (0,0)) +xAxisLine = ( + ( + min(proj_3d[:,0]) - np.mean(proj_3d[:,0]), + max(proj_3d[:,0]) + np.mean(proj_3d[:,0]), + ), + (0, 0), + (0, 0), +) ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'g') -yAxisLine = ((0, 0),(min(proj_3d[:,1])-np.mean(proj_3d[:,1]), - max(proj_3d[:,1])+np.mean(proj_3d[:,1])), (0,0)) +yAxisLine = ( + (0, 0), + ( + min(proj_3d[:,1]) - np.mean(proj_3d[:,1]), + max(proj_3d[:,1]) + np.mean(proj_3d[:,1]), + ), + (0, 0), +) ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r') -zAxisLine = ( (0, 0), (0,0),(min(proj_3d[:,2])-np.mean(proj_3d[:,2]), - max(proj_3d[:,2])+np.mean(proj_3d[:,2])),) +zAxisLine = ( + (0, 0), + (0, 0), + ( + min(proj_3d[:,2]) - np.mean(proj_3d[:,2]), + max(proj_3d[:,2]) + np.mean(proj_3d[:,2]), + ), +) ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'b') plt.title('t-SNE') -plt.show() \ No newline at end of file +plt.show() diff --git a/lectures/Week 06 - Web Scraping/scrapers/scraper1.py b/lectures/Week 06 - Web Scraping/scrapers/scraper1.py index c2a0070..e5752f2 100644 --- a/lectures/Week 06 - Web Scraping/scrapers/scraper1.py +++ b/lectures/Week 06 - Web Scraping/scrapers/scraper1.py @@ -1,5 +1,15 @@ +"""Minimal Scrapy spider used in the lectures.""" + import scrapy + class BrickSetSpider(scrapy.Spider): + """Fetch the Brickset 2016 page and let other callbacks do the heavy lifting.""" + name = "brickset_spider" - start_urls = ['http://brickset.com/sets/year-2016'] \ No newline at end of file + start_urls = ["http://brickset.com/sets/year-2016"] + + def parse(self, response): + """Yield the raw response so follow-up spiders can inspect it.""" + + yield {"url": response.url, "status": response.status} diff --git a/lectures/Week 06 - Web Scraping/scrapers/scraper2.py b/lectures/Week 06 - Web Scraping/scrapers/scraper2.py index 1cfc5ef..56d6219 100644 --- a/lectures/Week 06 - Web Scraping/scrapers/scraper2.py +++ b/lectures/Week 06 - Web Scraping/scrapers/scraper2.py @@ -1,14 +1,14 @@ +"""Second Scrapy example that extracts the set name with the modern API.""" + import scrapy + class BrickSetSpider(scrapy.Spider): name = "brickset_spider" - start_urls = ['http://brickset.com/sets/year-2016'] + start_urls = ["http://brickset.com/sets/year-2016"] def parse(self, response): - SET_SELECTOR = '.set' - for brickset in response.css(SET_SELECTOR): - - NAME_SELECTOR = 'h1 ::text' - yield { - 'name': brickset.css(NAME_SELECTOR).extract_first(), - } \ No newline at end of file + set_selector = ".set" + for brickset in response.css(set_selector): + name_selector = "h1 ::text" + yield {"name": brickset.css(name_selector).get()} diff --git a/lectures/Week 06 - Web Scraping/scrapers/scraper3.py b/lectures/Week 06 - Web Scraping/scrapers/scraper3.py index c8dc323..a503837 100644 --- a/lectures/Week 06 - Web Scraping/scrapers/scraper3.py +++ b/lectures/Week 06 - Web Scraping/scrapers/scraper3.py @@ -1,29 +1,27 @@ +"""Full spider that paginates through the Brickset catalogue.""" + import scrapy class BrickSetSpider(scrapy.Spider): - name = 'brick_spider' - start_urls = ['http://brickset.com/sets/year-2016'] + name = "brick_spider" + start_urls = ["http://brickset.com/sets/year-2016"] def parse(self, response): - SET_SELECTOR = '.set' - for brickset in response.css(SET_SELECTOR): - - NAME_SELECTOR = 'h1 ::text' - PIECES_SELECTOR = './/dl[dt/text() = "Pieces"]/dd/a/text()' - MINIFIGS_SELECTOR = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()' - IMAGE_SELECTOR = 'img ::attr(src)' + set_selector = ".set" + for brickset in response.css(set_selector): + name_selector = "h1 ::text" + pieces_selector = './/dl[dt/text() = "Pieces"]/dd/a/text()' + minifigs_selector = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()' + image_selector = "img ::attr(src)" yield { - 'name': brickset.css(NAME_SELECTOR).extract_first(), - 'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(), - 'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(), - 'image': brickset.css(IMAGE_SELECTOR).extract_first(), + "name": brickset.css(name_selector).get(), + "pieces": brickset.xpath(pieces_selector).get(), + "minifigs": brickset.xpath(minifigs_selector).get(), + "image": brickset.css(image_selector).get(), } - NEXT_PAGE_SELECTOR = '.next a ::attr(href)' - next_page = response.css(NEXT_PAGE_SELECTOR).extract_first() + next_page_selector = ".next a ::attr(href)" + next_page = response.css(next_page_selector).get() if next_page: - yield scrapy.Request( - response.urljoin(next_page), - callback=self.parse - ) \ No newline at end of file + yield response.follow(next_page, callback=self.parse) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bcb402a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,16 @@ +[tool.ruff] +line-length = 100 +target-version = "py312" +extend-exclude = [ + "_site", + "*.ipynb", + "lectures/Week 01 - Language basics, Generating Data, Storing Data/Symbolic_regression_classification_generator.py", +] + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "W"] + +[tool.pytest.ini_options] +addopts = "-ra" +testpaths = ["lectures"] +norecursedirs = ["_site"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e63bddd --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +# Developer tooling for linting and testing. +-r requirements.txt +pytest>=8.0 +ruff>=0.3 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..26bd325 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +# Runtime dependencies for the code samples in this repository. +# The stack targets Python 3.12 or newer. +numpy>=1.26 +pandas>=2.2 +matplotlib>=3.8 +seaborn>=0.13 +plotly>=5.19 +cufflinks>=0.17.3 +requests>=2.31 +scrapy>=2.11 +scikit-learn>=1.4 +sympy>=1.12 +umap-learn>=0.5 +scipy>=1.11