diff --git a/routes/main_routes.py b/routes/main_routes.py index f023c32..db273bb 100644 --- a/routes/main_routes.py +++ b/routes/main_routes.py @@ -136,6 +136,34 @@ def recommend(): return jsonify({"projects": results}), 200 +@main.route("/api/project//resources") +def project_resources(project_id): + """Return the validated resource list for a project. + + Each resource is parsed from its raw "Label: URL" string format and + returned as a structured object so the frontend can render broken + links differently from valid ones. + + Response shape: + { + "project_id": 1, + "resources": [ + {"label": "Python official docs", "url": "https://docs.python.org", "valid": true}, + {"label": "Broken link", "url": "not-a-url", "valid": false} + ] + } + """ + from utils.url_validator import validate_resources + + project = find_project_by_id(project_id) + if not project: + return jsonify({"error": "Project not found."}), 404 + + validated = validate_resources(project.get("resources", [])) + return jsonify({ + "project_id": project_id, + "resources": validated + }), 200 @main.route("/project/") def project_detail(project_id): diff --git a/tests/test_resource_url_validation.py b/tests/test_resource_url_validation.py new file mode 100644 index 0000000..26022d3 --- /dev/null +++ b/tests/test_resource_url_validation.py @@ -0,0 +1,267 @@ +# tests/test_resource_url_validation.py +# Tests for bug #781 — learning resource URLs must be validated and +# served as structured objects so broken links can be handled gracefully. +# +# Run with: python -m pytest tests/test_resource_url_validation.py -v + +import sys +import os +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from utils.url_validator import is_valid_url, parse_resource, validate_resource, validate_resources +from utils.data_loader import load_all_projects, clear_cache +from app import app + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def client(): + app.config["TESTING"] = True + with app.test_client() as c: + yield c + + +@pytest.fixture(autouse=True) +def reset_cache(): + clear_cache() + yield + clear_cache() + + +# --------------------------------------------------------------------------- +# is_valid_url +# --------------------------------------------------------------------------- + +def test_valid_https_url(): + assert is_valid_url("https://docs.python.org") is True + +def test_valid_http_url(): + assert is_valid_url("http://example.com") is True + +def test_valid_url_with_path(): + assert is_valid_url("https://docs.python.org/3/library/csv.html") is True + +def test_valid_url_with_query(): + assert is_valid_url("https://example.com/search?q=python") is True + +def test_valid_url_with_fragment(): + assert is_valid_url("https://example.com/page#section") is True + +def test_valid_url_with_port(): + assert is_valid_url("https://localhost:5000/api") is True + +def test_invalid_bare_domain(): + assert is_valid_url("example.com") is False + +def test_invalid_ftp_scheme(): + assert is_valid_url("ftp://example.com") is False + +def test_invalid_mailto(): + assert is_valid_url("mailto:test@example.com") is False + +def test_invalid_empty_string(): + assert is_valid_url("") is False + +def test_invalid_none(): + assert is_valid_url(None) is False + +def test_invalid_plain_text(): + assert is_valid_url("not a url at all") is False + +def test_invalid_missing_scheme(): + assert is_valid_url("//example.com/path") is False + + +# --------------------------------------------------------------------------- +# parse_resource +# --------------------------------------------------------------------------- + +def test_parse_label_and_url(): + result = parse_resource("Python official docs: https://docs.python.org") + assert result["label"] == "Python official docs" + assert result["url"] == "https://docs.python.org" + +def test_parse_url_only(): + result = parse_resource("https://realpython.com") + assert result["label"] == "https://realpython.com" + assert result["url"] == "https://realpython.com" + +def test_parse_label_with_https_url_and_path(): + result = parse_resource("CSV module guide: https://docs.python.org/3/library/csv.html") + assert result["label"] == "CSV module guide" + assert result["url"] == "https://docs.python.org/3/library/csv.html" + +def test_parse_does_not_split_on_url_colon(): + """Colons inside the URL (e.g. https://) must not split the label.""" + result = parse_resource("MDN Fetch API: https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API") + assert result["label"] == "MDN Fetch API" + assert "developer.mozilla.org" in result["url"] + +def test_parse_empty_string(): + result = parse_resource("") + assert result["label"] == "" + assert result["url"] == "" + +def test_parse_none(): + result = parse_resource(None) + assert result["label"] == "" + assert result["url"] == "" + +def test_parse_strips_whitespace(): + result = parse_resource(" Label : https://example.com ") + assert result["url"] == "https://example.com" + + +# --------------------------------------------------------------------------- +# validate_resource +# --------------------------------------------------------------------------- + +def test_validate_resource_valid(): + result = validate_resource("Python docs: https://docs.python.org") + assert result["valid"] is True + assert result["label"] == "Python docs" + assert result["url"] == "https://docs.python.org" + +def test_validate_resource_invalid_url(): + result = validate_resource("Broken link: not-a-url") + assert result["valid"] is False + +def test_validate_resource_empty(): + result = validate_resource("") + assert result["valid"] is False + +def test_validate_resource_has_all_keys(): + result = validate_resource("Label: https://example.com") + assert "label" in result + assert "url" in result + assert "valid" in result + + +# --------------------------------------------------------------------------- +# validate_resources (list) +# --------------------------------------------------------------------------- + +def test_validate_resources_all_valid(): + raw = [ + "Python docs: https://docs.python.org", + "MDN: https://developer.mozilla.org", + ] + results = validate_resources(raw) + assert len(results) == 2 + assert all(r["valid"] for r in results) + +def test_validate_resources_mixed(): + raw = [ + "Good link: https://docs.python.org", + "Bad link: not-a-url", + ] + results = validate_resources(raw) + assert results[0]["valid"] is True + assert results[1]["valid"] is False + +def test_validate_resources_empty_list(): + assert validate_resources([]) == [] + +def test_validate_resources_not_a_list(): + assert validate_resources(None) == [] + + +# --------------------------------------------------------------------------- +# All resources in projects.json must have valid URL format +# --------------------------------------------------------------------------- + +def test_all_project_resources_have_valid_urls(): + """Every resource URL in projects.json must pass format validation.""" + from utils.url_validator import parse_resource, is_valid_url + projects = load_all_projects() + broken = [] + for project in projects: + for raw in project.get("resources", []): + parsed = parse_resource(raw) + url = parsed.get("url", "") + if url and not is_valid_url(url): + broken.append((project["id"], project["title"], url)) + + assert broken == [], ( + "Malformed resource URLs found in projects.json:\n" + + "\n".join(f" project id={pid} '{title}': {url}" for pid, title, url in broken) + ) + +def test_all_projects_have_at_least_one_resource(): + """Every project must have at least one learning resource.""" + projects = load_all_projects() + missing = [p for p in projects if not p.get("resources")] + assert missing == [], ( + "Projects with no resources: " + + ", ".join(str(p["id"]) for p in missing) + ) + +def test_all_resource_strings_are_non_empty(): + """No resource entry should be an empty string.""" + projects = load_all_projects() + for project in projects: + for raw in project.get("resources", []): + assert raw.strip(), ( + f"Empty resource string in project id={project['id']}" + ) + + +# --------------------------------------------------------------------------- +# /api/project//resources route +# --------------------------------------------------------------------------- + +def test_resources_route_returns_200(client): + response = client.get("/api/project/1/resources") + assert response.status_code == 200 + +def test_resources_route_returns_json(client): + response = client.get("/api/project/1/resources") + data = response.get_json() + assert data is not None + +def test_resources_route_has_project_id(client): + data = client.get("/api/project/1/resources").get_json() + assert "project_id" in data + assert data["project_id"] == 1 + +def test_resources_route_has_resources_list(client): + data = client.get("/api/project/1/resources").get_json() + assert "resources" in data + assert isinstance(data["resources"], list) + +def test_resources_route_each_item_has_label(client): + data = client.get("/api/project/1/resources").get_json() + for item in data["resources"]: + assert "label" in item + +def test_resources_route_each_item_has_url(client): + data = client.get("/api/project/1/resources").get_json() + for item in data["resources"]: + assert "url" in item + +def test_resources_route_each_item_has_valid_flag(client): + data = client.get("/api/project/1/resources").get_json() + for item in data["resources"]: + assert "valid" in item + assert isinstance(item["valid"], bool) + +def test_resources_route_project_1_all_valid(client): + """Project 1 resources in the dataset must all pass URL validation.""" + data = client.get("/api/project/1/resources").get_json() + invalid = [r for r in data["resources"] if not r["valid"]] + assert invalid == [], f"Invalid resources in project 1: {invalid}" + +def test_resources_route_invalid_project_returns_404(client): + response = client.get("/api/project/99999/resources") + assert response.status_code == 404 + assert "error" in response.get_json() + +def test_resources_route_security_headers(client): + response = client.get("/api/project/1/resources") + assert response.headers.get("X-Frame-Options") == "DENY" + assert response.headers.get("X-Content-Type-Options") == "nosniff" diff --git a/utils/data_loader.py b/utils/data_loader.py index bc529a3..7d70254 100644 --- a/utils/data_loader.py +++ b/utils/data_loader.py @@ -1,11 +1,18 @@ +# utils/data_loader.py import json import os import threading +import logging + +from utils.url_validator import is_valid_url, parse_resource DATA_FILE = os.path.join(os.path.dirname(__file__), "..", "data", "projects.json") _projects_cache = None _cache_lock = threading.Lock() +logger = logging.getLogger("devpath.data_loader") + + def validate_projects(projects): """ Validate project dataset integrity. @@ -15,9 +22,10 @@ def validate_projects(projects): - Duplicate project titles (case-insensitive) - Missing required fields - Empty required string fields + - Malformed resource URLs (logs a warning, does not raise) Raises: - ValueError: If any validation rule is violated. + ValueError: If any structural validation rule is violated. """ seen_ids = set() seen_titles = set() @@ -51,6 +59,20 @@ def validate_projects(projects): raise ValueError(f"Duplicate project title found: {project['title']}") seen_titles.add(title) + # Resource URL format validation — warn, do not raise + # Broken URLs in production are logged so they can be fixed in the + # data file without crashing the application. + for raw in project.get("resources", []): + parsed = parse_resource(raw) + url = parsed.get("url", "") + if url and not is_valid_url(url): + logger.warning( + "Malformed resource URL in project '%s' (id=%s): %r", + project.get("title", "Unknown"), + project_id, + url, + ) + def load_all_projects(): """Read and return the full list of projects from the JSON file. @@ -66,11 +88,13 @@ def load_all_projects(): validate_projects(_projects_cache) return _projects_cache + def get_available_levels(): """Return all unique project levels.""" projects = load_all_projects() return sorted({p["level"] for p in projects}) + def find_project_by_id(project_id): """Return the project whose 'id' matches project_id, or None.""" for project in load_all_projects(): diff --git a/utils/url_validator.py b/utils/url_validator.py new file mode 100644 index 0000000..7a82fae --- /dev/null +++ b/utils/url_validator.py @@ -0,0 +1,111 @@ +# utils/url_validator.py +# Utilities for parsing and validating learning resource URLs. +# +# Resources in projects.json are stored as plain strings in one of +# two formats: +# "Label text: https://example.com/path" (label + URL) +# "https://example.com/path" (URL only) +# +# This module provides: +# parse_resource(raw) → {"label": str, "url": str} +# is_valid_url(url) → bool (format check only, no HTTP request) +# validate_resource(raw) → {"label": str, "url": str, "valid": bool} +# +# Deliberately no outbound HTTP requests — format validation only. +# This keeps startup fast and avoids network dependencies in tests. + +import re + +# Matches http:// or https:// followed by a domain and optional path. +# Intentionally strict: rejects bare domains, ftp://, mailto:, etc. +_URL_RE = re.compile( + r'^https?://' # scheme + r'(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,}' # domain + r'(?::\d+)?' # optional port + r'(?:[/?#][^\s]*)?$', # optional path/query/fragment + re.IGNORECASE, +) + + +def is_valid_url(url: str) -> bool: + """Return True if *url* is a well-formed http/https URL. + + Only checks format — does not make any network request. + + Args: + url: The URL string to validate. + + Returns: + True if the URL matches the expected http/https pattern. + """ + if not url or not isinstance(url, str): + return False + return bool(_URL_RE.match(url.strip())) + + +def parse_resource(raw: str) -> dict: + """Split a raw resource string into a label and URL. + + Handles two formats: + "Label text: https://example.com" → label="Label text", url="https://..." + "https://example.com" → label="https://example.com", url="https://..." + + If the string contains ": http" the part before the colon becomes the + label and everything from "http" onward becomes the URL. This avoids + splitting on colons that appear inside URLs (e.g. "https://..."). + + Args: + raw: The raw resource string from projects.json. + + Returns: + A dict with keys "label" (str) and "url" (str). + Both values are stripped of leading/trailing whitespace. + If no URL can be parsed, "url" is an empty string. + """ + if not raw or not isinstance(raw, str): + return {"label": "", "url": ""} + + raw = raw.strip() + + # Find the first occurrence of ": http" to split label from URL + split_marker = ": http" + idx = raw.find(split_marker) + if idx != -1: + label = raw[:idx].strip() + url = raw[idx + 2:].strip() # skip ": " → starts at "http" + return {"label": label, "url": url} + + # No label prefix — treat the entire string as a URL + return {"label": raw, "url": raw} + + +def validate_resource(raw: str) -> dict: + """Parse a raw resource string and validate its URL format. + + Args: + raw: The raw resource string from projects.json. + + Returns: + A dict with keys: + "label" (str) — human-readable link text + "url" (str) — the URL (may be empty if unparseable) + "valid" (bool) — True if the URL passes format validation + """ + parsed = parse_resource(raw) + parsed["valid"] = is_valid_url(parsed["url"]) + return parsed + + +def validate_resources(resources: list) -> list: + """Validate a list of raw resource strings. + + Args: + resources: List of raw resource strings from a project's + "resources" field. + + Returns: + List of dicts, each with "label", "url", and "valid" keys. + """ + if not isinstance(resources, list): + return [] + return [validate_resource(raw) for raw in resources]