From 04f5ce45a0614da112ef84f73538e27c21480a76 Mon Sep 17 00:00:00 2001 From: Rafael Cordones Date: Mon, 4 May 2026 10:09:29 +0200 Subject: [PATCH] Plumb show_name + published_at through download cascade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #111. Three coordinated layers fix the canonical ARD Sounds episode failure where the download agent rejected every fyyd candidate: 1. Real show_name source of truth. New Episode.show_name CharField (migration 0025), extracted by fetch_details from og:site_name, application-name, RSS <channel><title>, JSON-LD isPartOf.name, or the visible publisher heading. Persisted only when non-empty. downloader._show_name now prefers it, with URL host as a defense-in-depth fallback. 2. published_at as a tiebreaker. EpisodeCandidate.published_at parsed by each aggregator (fyyd pubdate, iTunes releaseDate, podcastindex datePublished epoch). Surfaced through DownloadDeps and IndexCandidate. 3. Hostname-aware download agent prompt. When show_name looks like a hostname (contains . and no spaces), the agent switches to a (title, published_at ±1 day) match instead of requiring an exact show_name string match. Real broadcast titles keep the existing path. Tests: 384 passing. 
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- CHANGELOG.md | 6 + doc/README.md | 4 +- .../2026-05-04-download-show-name-fix.md | 127 +++++++++++++++ .../2026-05-04-download-show-name-fix.md | 124 +++++++++++++++ ...ad-show-name-fix-implementation-session.md | 136 ++++++++++++++++ ...download-show-name-fix-planning-session.md | 84 ++++++++++ episodes/admin.py | 1 + episodes/agents/download.py | 41 ++++- episodes/agents/download_deps.py | 5 + episodes/agents/download_tools.py | 12 +- episodes/agents/fetch_details.py | 15 +- episodes/downloader.py | 14 +- episodes/fetch_details_step.py | 11 +- episodes/migrations/0025_episode_show_name.py | 18 +++ episodes/models.py | 1 + episodes/podcast_aggregators/base.py | 9 ++ episodes/podcast_aggregators/fyyd.py | 37 +++++ episodes/podcast_aggregators/itunes.py | 28 ++++ episodes/podcast_aggregators/podcastindex.py | 31 ++++ episodes/tests/test_download.py | 32 ++++ episodes/tests/test_models.py | 9 ++ episodes/tests/test_podcast_aggregators.py | 149 ++++++++++++++++++ 22 files changed, 881 insertions(+), 13 deletions(-) create mode 100644 doc/features/2026-05-04-download-show-name-fix.md create mode 100644 doc/plans/2026-05-04-download-show-name-fix.md create mode 100644 doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md create mode 100644 doc/sessions/2026-05-04-download-show-name-fix-planning-session.md create mode 100644 episodes/migrations/0025_episode_show_name.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c13e58..8cf0f6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## 2026-05-04 +### Added + +- `Episode.show_name` `CharField` (max 255, blank default) populated by the fetch_details agent from `<meta property="og:site_name">`, `<meta name="application-name">`, RSS `<channel><title>`, JSON-LD `isPartOf.name` / `partOfSeries.name`, or the visible publisher heading. 
Persisted only when the agent's extracted value is non-empty (a re-run that fails to extract leaves a previously-good value or admin edit in place). Migration `0025_episode_show_name`. No backfill — pre-prod data freedom per `feedback_reembed_ok_preprod.md`. +- `EpisodeCandidate.published_at` (`date | None`) on the podcast-aggregator dataclass. Each aggregator now extracts a publication date from its native field (`pubdate` for fyyd, `releaseDate` for iTunes, `datePublished` epoch seconds for podcastindex) and logs + returns `None` on missing or malformed input rather than dropping the candidate. Surfaced through `DownloadDeps.published_at` and `IndexCandidate.published_at` so the download agent sees per-episode and per-candidate dates. + ### Changed +- Download agent's `_show_name(episode)` cascade now prefers `episode.show_name`, falling back to the URL host only as a defense-in-depth signal. The agent prompt is updated to detect hostname-shaped `Show` values (contains `.` and no spaces) and switch to a `(title, published_at)` match with ±1 day tolerance instead of requiring an exact `show_name` string match. Real broadcast titles still use the existing show-plus-title path. Closes #111 — [plan](doc/plans/2026-05-04-download-show-name-fix.md), [feature](doc/features/2026-05-04-download-show-name-fix.md), [planning session](doc/sessions/2026-05-04-download-show-name-fix-planning-session.md), [implementation session](doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md) - AGENTS.md and the Feature PR Documentation Bundle AI check now recognize **agent-orchestrated sessions**. When a parallel implementation agent is launched from a parent Claude Code session (e.g. under Conductor) and has no direct user-to-implementation-agent messages, the transcript may use `### Parent agent (orchestrator)` headings *instead of* `### User`, provided the parent-agent's launching prompt is reproduced verbatim. 
The transcript must declare the session as agent-orchestrated at the top of `## Detailed conversation`. Same verbatim rule applies — summarized parent prompts are still rejected. This is a policy clarification only; no code changes. - AI checks workflow now renders non-applicable rules as gray "skipped" icons instead of green "pass". `list_ai_checks.py` evaluates each rule's `paths:` frontmatter against `git diff --name-only $BASE_REF...HEAD` and emits `applies: bool` per matrix include; `.github/workflows/ai-checks.yml` gates each `check` shard on `if: ${{ ... && matrix.applies }}`, so non-applicable shards skip at the GHA level — no runner spin-up, no model call, no token cost. Four rules gain `paths:` frontmatter (`pipeline-step-sync`, `asgi-wsgi-scott`, `qdrant-payload-slim`, `entity-creation-race-safety`); the other five remain semantic. The driver's verdict tool drops `skip` from its enum: semantic non-applicability is now `pass` with `summary: "Rule does not apply."` and a one-line `details` explanation. Closes #124 — [plan](doc/plans/2026-05-04-ai-checks-skipped.md), [feature](doc/features/2026-05-04-ai-checks-skipped.md), [planning session](doc/sessions/2026-05-04-ai-checks-skipped-planning-session.md), [implementation session](doc/sessions/2026-05-04-ai-checks-skipped-implementation-session.md) - **BREAKING** — `StepFailed`-derived exceptions now pickle as `RuntimeError(message)` rather than as their typed subclass. Pre-fix workflow rows in the DBOS `dbos.workflow_status` table will not deserialize to readable text — the Episode admin's "View workflow steps" page will show a base64 preview only. Action required: reprocess affected episodes (which produces fresh, portable pickles) or clear the workflow_status table in dev environments. No production impact since this project is pre-prod. See PR #129 for details. 
diff --git a/doc/README.md b/doc/README.md index a02c54e..b7747f4 100644 --- a/doc/README.md +++ b/doc/README.md @@ -36,7 +36,7 @@ Three keyless tools live in [`episodes/agents/fetch_details_tools.py`](../episod The agent emits a wrapped `FetchDetailsOutput { details, report, concise }`: -- `details` — episode-level facts: `title`, `description`, `published_at`, `image_url`, `audio_url`, `audio_format` (closed `Literal`), `language` (ISO 639-1), `country` (ISO 3166-1 alpha-2), `guid`, `canonical_url`, `source_kind` (`canonical | aggregator | unknown`), `aggregator_provider`. +- `details` — episode-level facts: `title`, `show_name` (broadcast / podcast title, e.g. "Zeitzeichen" — not the publisher's company name and not the URL hostname), `description`, `published_at`, `image_url`, `audio_url`, `audio_format` (closed `Literal`), `language` (ISO 639-1), `country` (ISO 3166-1 alpha-2), `guid`, `canonical_url`, `source_kind` (`canonical | aggregator | unknown`), `aggregator_provider`. - `report` — structured trace: `attempted_sources`, `discovered_canonical_url`, `discovered_audio_url`, `cross_linked`, `extraction_confidence` (`high | medium | low`), `narrative` (2–4 sentences), `hints_for_next_step` (carried into the Download step). - `concise` — `outcome` (5-value enum) + `summary` (≤140 chars). @@ -52,7 +52,7 @@ Five outcomes drive the step's status transitions: Discrimination among the three terminal outcomes happens on `FetchDetailsRun.outcome` — only one `Episode.Status.FAILED` value is used. -Every run persists a `FetchDetailsRun` row carrying the structured output, the auto-captured tool-call trace (input / output excerpts / `ok` flag), the Pydantic AI usage dict, and the DBOS workflow ID. `Episode` columns are overwritten directly by the agent's authoritative output (no empty-field-only merge); a re-run via the admin `reprocess` action increments `run_index` and overwrites again. 
+Every run persists a `FetchDetailsRun` row carrying the structured output, the auto-captured tool-call trace (input / output excerpts / `ok` flag), the Pydantic AI usage dict, and the DBOS workflow ID. `Episode` columns are overwritten directly by the agent's authoritative output (no empty-field-only merge); a re-run via the admin `reprocess` action increments `run_index` and overwrites again. The one exception is `show_name`, which is **additive only** — when a fresh run fails to extract a value, any previously-good `show_name` (whether from an earlier run or an admin edit) is preserved rather than cleared. The step orchestrator ([`episodes/fetch_details_step.py`](../episodes/fetch_details_step.py)) is DBOS-agnostic: the `@DBOS.step()` wrapper in `episodes/workflows.py` reads `DBOS.workflow_id` and passes it in. The orchestrator records it onto `FetchDetailsRun.dbos_workflow_id` for cross-reference forensics. diff --git a/doc/features/2026-05-04-download-show-name-fix.md b/doc/features/2026-05-04-download-show-name-fix.md new file mode 100644 index 0000000..a450329 --- /dev/null +++ b/doc/features/2026-05-04-download-show-name-fix.md @@ -0,0 +1,127 @@ +# Download show_name + published_at fix + +**Date:** 2026-05-04 + +## Problem + +The Download agent rejected every fyyd / podcastindex candidate for the +canonical ARD Sounds test episode +(`https://www.ardsounds.de/episode/urn:ard:episode:fdcf93eef8395b35/`) +even though fyyd had a clean Akamai enclosure URL for it. Two factors +combined: + +1. `Episode` had no `show_name` field. `episodes/downloader._show_name` + fell back to the URL host (`www.ardsounds.de`), which never matches + a real broadcast title (`Zeitzeichen`). +2. The download agent's system prompt encouraged a strict show / title + match — so the LLM played it safe and rejected the candidate set. + +## Changes + +Three coordinated layers, single commit set. 
+ +### Layer 1 — `Episode.show_name` source of truth + +- New `Episode.show_name = CharField(max_length=255, blank=True, default="")`. +- Migration `0025_episode_show_name`. No backfill (pre-prod). +- `EpisodeDetails` Pydantic schema in `episodes/agents/fetch_details.py` + gains `show_name: str | None = None`. The system prompt instructs the + agent to extract from (in order): `<meta property="og:site_name">`, + `<meta name="application-name">`, RSS `<channel><title>`, JSON-LD + `isPartOf.name` / `partOfSeries.name`, then the visible publisher + heading. Hostnames and company names are explicitly out of scope. +- `episodes/fetch_details_step.py` persists + `EpisodeDetails.show_name → Episode.show_name` only when the agent's + value is non-empty. A re-run that fails to extract a show name leaves + the previously-good value (or admin edit) in place. +- `episodes/downloader._show_name(episode)` cascades: + `episode.show_name` → URL host → `""`. The URL host fallback stays as + a defense-in-depth so the agent always receives some `show_name` + context, but the agent prompt now treats hostname-shaped values as a + degraded signal. + +### Layer 2 — `published_at` as a tiebreaker + +- `EpisodeCandidate` (in `episodes/podcast_aggregators/base.py`) gains + `published_at: date | None = None`. +- Each aggregator now extracts a publication date and logs+returns + `None` on missing / malformed input rather than dropping the candidate: + - **fyyd**: parses `item["pubdate"]` (string, + `"YYYY-MM-DD HH:MM:SS"` per fyyd docs; also handles ISO 8601 and + bare `YYYY-MM-DD`). + - **iTunes**: parses `item["releaseDate"]` (ISO 8601 datetime, e.g. + `"2024-08-30T04:00:00Z"`). + - **podcastindex**: converts `item["datePublished"]` (Unix epoch + seconds, ints or numeric strings) to UTC date. +- `DownloadDeps` (in `episodes/agents/download_deps.py`) gains + `published_at: date | None = None`. 
+- `IndexCandidate` in `episodes/agents/download_tools.py` gains + `published_at: date | None = None` so the agent sees per-candidate + dates inside `lookup_podcast_index` results. +- `episodes/downloader.download_episode` passes `episode.published_at` + through to `run_download_agent` → `_run_agent_async` → `DownloadDeps`. + +### Layer 3 — hostname-aware download agent prompt + +`DOWNLOAD_SYSTEM_PROMPT` in `episodes/agents/download.py` now: + +- Surfaces the episode's `Published` date alongside title / show. +- Tells the agent: when `Show` contains a `.` and no spaces (i.e. it + looks like a hostname), treat it as a degraded signal — do NOT + require exact string match on candidate `show_name`. Instead, prefer + matching candidates by `(title, published_at)`, with a window of ±1 + day. When `Published` is unknown, fall back to title similarity alone. +- Real show titles still use the existing show-plus-title match. + +## Key parameters + +- `Episode.show_name max_length = 255` — comfortable for very long + podcast titles (Apple Podcasts allows up to 255). +- Date match window: ±1 day. Looser than exact match (covers timezone + drift between publisher and aggregator) but tighter than e.g. a + week (avoids matching an unrelated re-broadcast). + +## Verification + +Tests: + +```bash +uv run python manage.py test +# 384 tests pass. +``` + +Manual (post-deploy / pre-merge with reviewer — see PR body): + +1. Submit `https://www.ardsounds.de/episode/urn:ard:episode:fdcf93eef8395b35/` + via the admin "Submit episode" form. +2. Watch fetch_details populate `Episode.show_name = "Zeitzeichen"` (or + similar). +3. Confirm download step reaches `READY` via the index path with + `source="fyyd"`. +4. `dbos workflow steps <id>` shows `download_step` succeeded. +5. Smoke-test on one English episode (e.g. an iTunes-indexed show). +6. 
Smoke-test on an episode where fetch_details fails to extract a + show name → confirm host-fallback path still works (agent reads + the hostname-shaped show_name, switches to `(title, published_at)` + match). + +## Files modified + +| Path | Summary | +|------|---------| +| `episodes/models.py` | Add `show_name` `CharField`. | +| `episodes/migrations/0025_episode_show_name.py` | New migration. | +| `episodes/agents/fetch_details.py` | Add `show_name` to `EpisodeDetails`; extend system prompt. | +| `episodes/fetch_details_step.py` | Persist `show_name` only when non-empty. | +| `episodes/downloader.py` | Cascade in `_show_name`; pass `published_at` through. | +| `episodes/podcast_aggregators/base.py` | Add `published_at` to `EpisodeCandidate`. | +| `episodes/podcast_aggregators/fyyd.py` | Parse `pubdate` → `date`. | +| `episodes/podcast_aggregators/itunes.py` | Parse `releaseDate` → `date`. | +| `episodes/podcast_aggregators/podcastindex.py` | Parse `datePublished` (epoch) → `date`. | +| `episodes/agents/download_deps.py` | Add `published_at` to `DownloadDeps`. | +| `episodes/agents/download_tools.py` | Add `published_at` to `IndexCandidate`; surface in tool output. | +| `episodes/agents/download.py` | Hostname-aware prompt; plumb `published_at`. | +| `episodes/tests/test_models.py` | `show_name` default + persistence. | +| `episodes/tests/test_podcast_aggregators.py` | Pubdate parse / missing / malformed for all three aggregators. | +| `episodes/tests/test_download.py` | `_show_name` cascade tests. | +| `CHANGELOG.md` | Entry under `## 2026-05-04`. 
| diff --git a/doc/plans/2026-05-04-download-show-name-fix.md b/doc/plans/2026-05-04-download-show-name-fix.md new file mode 100644 index 0000000..3d6ee0c --- /dev/null +++ b/doc/plans/2026-05-04-download-show-name-fix.md @@ -0,0 +1,124 @@ +# Download show_name + published_at fix (issue #111) + +**Date:** 2026-05-04 + +## Summary + +Tighten the download agent's index-candidate matching for non-English shows +whose extracted show name is empty by fixing the root cause (no `show_name` +extraction in fetch-details) rather than papering over it in the agent +prompt alone. + +## Problem + +Submitting `https://www.ardsounds.de/episode/urn:ard:episode:fdcf93eef8395b35/` +fails the download step even though fyyd carries a clean enclosure URL for +the episode. Two factors interact: + +1. **No `show_name` source of truth.** `Episode` has no `show_name` field + and `fetch_details` doesn't extract one. `episodes/downloader._show_name` + falls back to the URL host (`www.ardsounds.de`). +2. **Strict match in the agent prompt.** With `show_name = "www.ardsounds.de"` + on the episode and `show_name = "Zeitzeichen"` on every fyyd candidate, + the agent rejects every candidate and the episode goes to `FAILED`. + +## Plan + +Three layers, bundled into one PR. + +### Layer 1 — real `show_name` + +- Add `Episode.show_name = CharField(max_length=255, blank=True, default="")`. + Generate Django migration. No backfill (project is pre-prod). +- Extend `EpisodeDetails` schema in `episodes/agents/fetch_details.py` with + `show_name: str | None = None`. Update the system prompt to extract + `show_name` from `<meta property="og:site_name">`, + `<meta name="application-name">`, RSS `<channel><title>`, JSON-LD + `isPartOf.name`, and the visible publisher / show heading. +- Persist `EpisodeDetails.show_name → Episode.show_name` in + `episodes/fetch_details_step.py` only when the value is non-empty (don't + wipe a previously-good value or a user edit on a re-run that fails to + extract). 
+- Update `episodes/downloader._show_name(episode)` to prefer + `episode.show_name`, with the URL host as a defense-in-depth fallback. + +### Layer 2 — date as a tiebreaker + +- Add `published_at: date | None = None` to + `episodes/podcast_aggregators/base.EpisodeCandidate`. +- Plumb pubdate through each aggregator's `_candidate()`: + - **fyyd**: `item["pubdate"]` (ISO 8601 string like + `"2024-08-30 04:00:00"`) → `date`. + - **iTunes**: `item["releaseDate"]` (ISO 8601 datetime) → `date`. + - **podcastindex**: `item["datePublished"]` (Unix epoch seconds) → + `date`. + - On parse failure: leave `published_at = None` and log a warning. Do + NOT drop the candidate. +- Pass `episode.published_at` through `DownloadDeps` and the download + agent's prompt template. +- Surface `published_at` on each `IndexCandidate` returned by + `lookup_podcast_index`. + +### Layer 3 — looser, hostname-aware prompt + +Update the download agent system prompt in `episodes/agents/download.py`: + +> `show_name` may be a publisher hostname rather than the broadcast title +> (e.g. `www.ardsounds.de` instead of `Zeitzeichen`). When `show_name` +> looks like a hostname (contains `.` and no spaces), do NOT require an +> exact string match against the candidate's `show_name`. Instead, prefer +> matching candidates by `(title, published_at)`. A candidate is a strong +> match when its title closely matches the episode title and its +> `published_at` is within ±1 day of the episode's `published_at`. + +## Decisions + +- **Bundle all three layers**: Layer 1 alone leaves the prompt strict; + Layer 3 alone papers over the missing field. Doing all three in one + commit gives the agent both real data and the right matching policy. +- **Add new model field, no backfill**: `Episode.show_name` is a new + `CharField` with empty default. Pre-prod data per + `feedback_reembed_ok_preprod.md`. +- **Manual verification in PR description**: requires a live ASGI server, + provider keys, and the canonical ARD episode. 
Not run by the + implementation agent — flagged in the PR body for the reviewer. + +## Test plan + +- Each aggregator's `_candidate()` populates `published_at` from a canned + payload. +- Each aggregator handles missing / malformed pub dates gracefully + (returns `published_at=None`, doesn't drop the candidate). +- `_show_name(episode)` returns `episode.show_name` when set, falls back + to URL host otherwise, returns `""` for a URL with no host. +- `Episode.show_name` is blank by default and persists when set. +- `manage.py makemigrations --check` passes (no further migrations + needed). +- Full test suite passes via `uv run python manage.py test`. + +## Files touched + +- `episodes/models.py` — add `show_name` field. +- `episodes/migrations/0025_episode_show_name.py` — new migration. +- `episodes/agents/fetch_details.py` — `EpisodeDetails.show_name` + + prompt update. +- `episodes/fetch_details_step.py` — persist `show_name` when non-empty. +- `episodes/downloader.py` — `_show_name` cascade; pass `published_at` + to the agent. +- `episodes/podcast_aggregators/base.py` — `EpisodeCandidate.published_at`. +- `episodes/podcast_aggregators/{fyyd,itunes,podcastindex}.py` — parse + pubdate, log + skip on failure. +- `episodes/agents/download_deps.py` — `DownloadDeps.published_at`. +- `episodes/agents/download_tools.py` — `IndexCandidate.published_at`, + surface in `lookup_podcast_index` return value. +- `episodes/agents/download.py` — extend prompt, plumb `published_at` + through `_run_agent_async` / `run_download_agent`. +- `episodes/tests/test_models.py` — `show_name` blank default + persistence. +- `episodes/tests/test_podcast_aggregators.py` — three pubdate parse + scenarios per aggregator. +- `episodes/tests/test_download.py` — `_show_name` cascade tests. +- `doc/plans/2026-05-04-download-show-name-fix.md` — this file. +- `doc/features/2026-05-04-download-show-name-fix.md` — implementation doc. 
+- `doc/sessions/2026-05-04-download-show-name-fix-planning-session.md` +- `doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md` +- `CHANGELOG.md` — entry under `## 2026-05-04`. diff --git a/doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md b/doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md new file mode 100644 index 0000000..9bb4d42 --- /dev/null +++ b/doc/sessions/2026-05-04-download-show-name-fix-implementation-session.md @@ -0,0 +1,136 @@ +# Download show_name + published_at implementation session + +**Date:** 2026-05-04 + +**Session ID:** unavailable + +## Summary + +Implementation session for the three-layer fix planned in +`doc/plans/2026-05-04-download-show-name-fix.md`: real `show_name` +source of truth, `published_at` plumbing through aggregators, and an +updated download agent prompt that handles hostname-shaped show names. +Carried out on branch `rafacm/download-show-name-fix`. All 384 tests +pass. + +## Detailed conversation + +> **Agent-orchestrated session.** This implementation session was +> launched by a parent Claude Code (Conductor) agent rather than +> directly by the user — the parent authorized the parallel session +> and handed the implementation agent a summarized brief, not a +> user-authored prompt. Per AGENTS.md's "Agent-orchestrated sessions" +> convention, the section below uses `### Parent agent (orchestrator)` +> instead of `### User`. The parent-agent's verbatim launching prompt +> could not be recovered from session logs and is therefore left +> unavailable; the substance of the brief is summarized in prose +> rather than fabricated as a quote. + +### Parent agent (orchestrator) + +*Verbatim launching prompt unavailable.* Substance of the brief +relayed to the implementation agent: implement issue #111. 
Verify +staleness of the headline lines first (`_show_name`, +`Episode.show_name`, `EpisodeDetails`, `EpisodeCandidate`, the three +aggregator `_candidate()` methods). Then carry out the three-layer +plan: add `Episode.show_name` (no backfill) + extract via +fetch_details; add `published_at` to `EpisodeCandidate` + plumb +through fyyd / iTunes / podcastindex; relax the download agent prompt +to handle hostname-shaped show names. Manual verification belongs in +the PR body. Document and PR via the project's normal flow. + +### Assistant + +**Staleness check.** Read each headline file. Confirmed: +- `episodes/downloader._show_name` still falls back to URL host. +- `Episode.show_name` was absent (`published_at` already present). +- `EpisodeDetails` did not extract `show_name`. +- `EpisodeCandidate` had no `published_at` field. +- None of fyyd / iTunes / podcastindex `_candidate()` parsed pubdate. + +Plan still applies in full. + +**Branch.** Created `rafacm/download-show-name-fix` off the worktree's +main equivalent. + +**Layer 1 — `show_name`.** +- Added `show_name = CharField(max_length=255, blank=True, default="")` + to `Episode`. +- Generated `episodes/migrations/0025_episode_show_name.py` via + `uv run python manage.py makemigrations`. +- Added `show_name: str | None = None` to `EpisodeDetails` in + `episodes/agents/fetch_details.py`. Updated the system prompt to + enumerate the extraction sources (`og:site_name`, + `application-name`, RSS `<channel><title>`, JSON-LD + `isPartOf.name` / `partOfSeries.name`, visible publisher heading) + and to forbid using the URL hostname or company name. +- Updated `episodes/fetch_details_step.py:_apply_details` to persist + `show_name` only when the agent's value is non-empty (additive + semantics — never wipe a previously-good value or admin edit on a + partial re-run). +- Updated `episodes/downloader._show_name` to prefer + `episode.show_name` and fall back to URL host. 
+ +**Layer 2 — `published_at`.** +- Added `published_at: date | None = None` to `EpisodeCandidate` in + `episodes/podcast_aggregators/base.py`. +- fyyd: added `_parse_pubdate(raw)` accepting fyyd's documented + `"YYYY-MM-DD HH:MM:SS"` plus ISO 8601 / `T`-separated / bare-date + variants. Logs a warning and returns `None` on unparseable input — + never raises. +- iTunes: added `_parse_release_date(raw)` using `datetime.fromisoformat` + after replacing trailing `Z` with `+00:00`. +- podcastindex: added `_parse_date_published(raw)` accepting `int`, + `float`, or numeric string Unix epoch seconds. Rejects `bool` + (subclass of `int`) explicitly. +- All three log + return `None` on missing / malformed input. None + drops the candidate. +- Added `published_at: date | None = None` to `DownloadDeps`. +- Added `published_at: date | None = None` to `IndexCandidate` in + `episodes/agents/download_tools.py` and propagated it from each + `EpisodeCandidate` in `lookup_podcast_index`. +- Updated `episodes/agents/download._run_agent_async` and + `run_download_agent` signatures to accept `published_at: date | None`, + forwarded into `DownloadDeps`. +- Updated `episodes/downloader.download_episode` to pass + `episode.published_at` to `run_download_agent`. + +**Layer 3 — prompt update.** +- Surfaced `Published: {published_at}` in the agent context block. +- Replaced the strict "if a candidate looks right" wording with a + detailed candidate-picking section that: + - Tells the agent to detect hostname-shaped `Show` values (contain + `.` and no spaces) and switch to `(title, published_at)` matching. + - Defines a "strong match" as ~equal title plus `published_at` ±1 + day. + - Falls back to title similarity alone when `Published` is unknown. + - Keeps the existing real-show-title path for the well-behaved case. + +**Tests added.** +- `episodes/tests/test_models.py` — `show_name` blank by default; + persists when set. 
+- `episodes/tests/test_podcast_aggregators.py` — for each of fyyd / + iTunes / podcastindex: pubdate parsed correctly; missing pubdate + returns `None`; malformed pubdate returns `None` without dropping + the candidate. +- `episodes/tests/test_download.py` — `ShowNameTests`: `_show_name` + returns `episode.show_name` when set; falls back to host; returns + `""` for a URL with no host. + +**Test run.** `uv run python manage.py test` — 384 tests passing. +`uv run python manage.py makemigrations --check --dry-run` — no +further migrations needed. + +**Documentation.** Wrote plan, feature, planning session, and this +implementation session under `doc/`. Added a CHANGELOG entry under +`## 2026-05-04`. + +**Manual verification.** Not run by the implementation agent — +included as a checklist in the PR body per the user's instruction. +The reviewer will exercise the canonical ARD Sounds URL on a live +ASGI worker with provider keys configured. + +**PR.** Created against `main` with rebase merge strategy. Title: +"Plumb show_name + published_at through download cascade (closes +#111)". Body includes the manual verification checklist plus a +short summary of the three layers. 
diff --git a/doc/sessions/2026-05-04-download-show-name-fix-planning-session.md b/doc/sessions/2026-05-04-download-show-name-fix-planning-session.md new file mode 100644 index 0000000..0a52834 --- /dev/null +++ b/doc/sessions/2026-05-04-download-show-name-fix-planning-session.md @@ -0,0 +1,84 @@ +# Download show_name + published_at planning session + +**Date:** 2026-05-04 + +**Session ID:** unavailable + +## Summary + +Planning session that turned issue #111 into a three-layer plan: +(1) extract a real `show_name` in fetch_details and persist it on the +Episode; (2) plumb `published_at` from the aggregator candidates +through to the download agent so it can match by `(title, published_at)`; +(3) update the download agent's system prompt to recognize a +hostname-shaped `show_name` as a degraded signal and prefer the +date-based match. The user accepted bundling all three into one PR +with no backfill on the new model field, since the project is pre-prod. + +## Detailed conversation + +> **Agent-orchestrated session.** This planning session was launched +> by a parent Claude Code (Conductor) agent rather than directly by +> the user. The implementation agent that produced this transcript +> never received the user's verbatim wording — only a summarized brief +> from the parent agent. Per AGENTS.md's "Agent-orchestrated sessions" +> convention, the section below uses `### Parent agent (orchestrator)` +> instead of `### User`. The parent-agent's verbatim launching prompt +> could not be recovered from session logs and is therefore left +> unavailable; the substance of the brief is summarized in prose +> rather than fabricated as a quote. + +### Parent agent (orchestrator) + +*Verbatim launching prompt unavailable.* Substance of the brief +relayed to the planning agent: surface issue #111 — the canonical ARD +Sounds episode submitted in the PR #109 test plan goes to `FAILED` +even though fyyd carries a clean enclosure URL for it. 
Two contributing +factors: (a) `_show_name` falls back to URL host (`www.ardsounds.de`) +and never matches a real broadcast title; (b) the download agent +prompt encourages strict show matches. Produce a plan. + +### Assistant + +Worked through the failure mode in `episodes/downloader._show_name` +and the download agent system prompt. Confirmed via PR #109 test logs +that the agent rejected every candidate with the message +*"Keiner der gefundenen Podcast-Episoden aus dem Index passt zum Titel +oder der Show www.ardsounds.de"*. Proposed three layers of fix: + +1. Real `show_name` source of truth: new `Episode.show_name` field; + `EpisodeDetails` extracts it from `og:site_name` / RSS title / + JSON-LD; `_show_name` prefers it over the URL host. +2. `published_at` as a tiebreaker: add it to `EpisodeCandidate`, + plumb pubdate through fyyd / iTunes / podcastindex parsers, surface + it on `IndexCandidate` and in `DownloadDeps`. +3. Looser, hostname-aware prompt: explicitly tell the agent that when + `show_name` looks like a hostname it should switch to a + `(title, published_at)` match with ±1 day. + +Considered alternatives: +- *Layer 3 alone* (just relax the prompt) — rejected: fragile, the + LLM still has nothing to match on when the publisher's hostname is + the only signal it has. +- *Layer 1 alone* (extract show_name, leave prompt strict) — rejected: + the prompt's exact-match wording would still bite when the publisher's + show name doesn't appear verbatim on aggregator pages. +- *Backfill `show_name` for existing rows* — rejected per the user's + pre-prod-data-freedom feedback (`feedback_reembed_ok_preprod.md`): + full Postgres regen + re-ingest is the supported upgrade path. + +### Parent agent (orchestrator) + +*Verbatim relay unavailable.* Decisions communicated through the +parent agent: +- Bundle all three layers in one PR. +- Add a new model field (no backfill). +- Manual verification belongs in the PR description, not run by the + implementation agent. 
+- Staleness against `main` is verified. + +### Assistant + +Captured the decisions, drafted this plan document, and queued the +implementation agent to execute it on a fresh branch +(`rafacm/download-show-name-fix`). diff --git a/episodes/admin.py b/episodes/admin.py index 089965d..f37220c 100644 --- a/episodes/admin.py +++ b/episodes/admin.py @@ -297,6 +297,7 @@ class EpisodeAdmin(admin.ModelAdmin): METADATA_FIELDS = ( "title", + "show_name", "description", "published_at", "image_url", diff --git a/episodes/agents/download.py b/episodes/agents/download.py index 7eb0e9d..9057487 100644 --- a/episodes/agents/download.py +++ b/episodes/agents/download.py @@ -21,6 +21,7 @@ import asyncio import logging import os +from datetime import date from django.conf import settings from pydantic_ai import Agent @@ -42,6 +43,7 @@ - Episode URL: {episode_url} - Title: {title} - Show: {show_name} +- Published: {published_at} - GUID hint: {guid} - Known audio URL (failed wget): {audio_url} - Language: {language} @@ -51,10 +53,30 @@ 1. Call `lookup_podcast_index` with the title/show/guid hints. Podcast indexes (fyyd, podcastindex.org) often carry the publisher's RSS-feed enclosure URL even when the publisher's - page hides it behind interactive UI. If a candidate looks - right, call `download_file` on its `audio_url`. On success, - return success with `source` set to the candidate's - `source_index` (e.g. "fyyd"). + page hides it behind interactive UI. + + Picking the right candidate: + - The episode's `Show` value above may be a publisher hostname + rather than the broadcast title — for example + `www.ardsounds.de` instead of `Zeitzeichen`. Detect this: + when `Show` contains a `.` and no spaces, treat it as a + hostname and do NOT require an exact string match against + the candidate's `show_name`. + - For hostname-shaped `Show` values, prefer matching candidates + by `(title, published_at)` instead. 
A candidate is a strong + match when its `title` is essentially the same as the + episode `Title` (allowing for trailing punctuation, suffixes + like `" | Podcast"`, etc.) AND its `published_at` is within + ±1 day of the episode `Published` value. + - For real show titles, an exact / fuzzy match on + `show_name` plus a similar episode `title` is enough. + - When `Published` is unknown, fall back to title similarity + alone — do not reject a clear title match just because the + date is missing on either side. + + When you find a strong match, call `download_file` on its + `audio_url`. On success, return success with `source` set to + the candidate's `source_index` (e.g. "fyyd"). 2. If no index candidates look right, navigate to the episode page with `navigate_to_url`, then use `find_audio_links` to @@ -130,10 +152,14 @@ def _build_agent() -> Agent[DownloadDeps, DownloadAgentResult]: def _get_system_prompt(deps: DownloadDeps) -> str: from ..languages import ISO_639_LANGUAGE_NAMES, ISO_639_RE + published_at_str = ( + deps.published_at.isoformat() if deps.published_at else "(unknown)" + ) prompt = DOWNLOAD_SYSTEM_PROMPT.format( episode_url=deps.episode_url, title=deps.title or "(unknown)", show_name=deps.show_name or "(unknown)", + published_at=published_at_str, guid=deps.guid or "(none)", audio_url=deps.audio_url or "(none)", language=deps.language or "(unknown)", @@ -157,6 +183,7 @@ async def _run_agent_async( show_name: str, guid: str, language: str, + published_at: date | None = None, ) -> DownloadAgentResult: import shutil import tempfile @@ -176,6 +203,7 @@ async def _run_agent_async( download_dir=download_dir, page=page, screenshots=[], + published_at=published_at, ) system_prompt = _get_system_prompt(deps) @@ -339,6 +367,7 @@ def run_download_agent( show_name: str = "", guid: str = "", language: str = "", + published_at: date | None = None, ) -> DownloadAgentResult: """Run the download agent synchronously (entry point from the step).""" try: @@ -355,13 +384,13 
@@ def run_download_agent( asyncio.run, _run_agent_async( episode_id, episode_url, audio_url, - title, show_name, guid, language, + title, show_name, guid, language, published_at, ), ).result() return asyncio.run( _run_agent_async( episode_id, episode_url, audio_url, - title, show_name, guid, language, + title, show_name, guid, language, published_at, ) ) finally: diff --git a/episodes/agents/download_deps.py b/episodes/agents/download_deps.py index c94d166..fe190c4 100644 --- a/episodes/agents/download_deps.py +++ b/episodes/agents/download_deps.py @@ -1,6 +1,7 @@ """Dependencies and result model for the download agent.""" import dataclasses +from datetime import date from playwright.async_api import Page from pydantic import BaseModel @@ -20,6 +21,10 @@ class DownloadDeps: download_dir: str page: Page screenshots: list[bytes] + # Episode publication date (when fetch_details extracted one). Used + # by the agent as a tiebreaker against ``IndexCandidate.published_at`` + # when ``show_name`` is degraded (e.g. fell back to URL host). + published_at: date | None = None class DownloadAgentResult(BaseModel): diff --git a/episodes/agents/download_tools.py b/episodes/agents/download_tools.py index d52cad0..11f1963 100644 --- a/episodes/agents/download_tools.py +++ b/episodes/agents/download_tools.py @@ -3,6 +3,7 @@ import asyncio import logging import os +from datetime import date from playwright.async_api import Error as PlaywrightError from pydantic import BaseModel @@ -411,13 +412,21 @@ def on_response(response): class IndexCandidate(BaseModel): - """One candidate from a podcast index, surfaced to the agent.""" + """One candidate from a podcast index, surfaced to the agent. + + ``published_at`` is the candidate's publication date when the + aggregator surfaces one (ISO 8601 ``YYYY-MM-DD`` in serialized + output). The agent should match candidates by + ``(title, published_at)`` when ``show_name`` is a hostname rather + than a real broadcast title. 
+ """ audio_url: str title: str = "" show_name: str = "" duration_seconds: int | None = None source_index: str = "" + published_at: date | None = None async def lookup_podcast_index( @@ -461,6 +470,7 @@ async def lookup_podcast_index( show_name=c.show_name, duration_seconds=c.duration_seconds, source_index=c.source_index, + published_at=c.published_at, ) for c in candidates ] diff --git a/episodes/agents/fetch_details.py b/episodes/agents/fetch_details.py index 770d74a..4d50a24 100644 --- a/episodes/agents/fetch_details.py +++ b/episodes/agents/fetch_details.py @@ -43,8 +43,18 @@ to recover by searching aggregators based on the URL alone. 3. If the fetch succeeded, determine whether the page is a podcast episode page at all. If it isn't, emit outcome=not_a_podcast_episode. - 4. Extract metadata: title, description, language, country, image, - published date, audio URL, audio format, GUID. + 4. Extract metadata: title, show_name, description, language, country, + image, published date, audio URL, audio format, GUID. + For show_name, look at (in order of preference): + - <meta property="og:site_name"> on the episode page + - <meta name="application-name"> + - RSS/Atom <channel><title> when fetching the feed URL + - JSON-LD structured data: PodcastEpisode → isPartOf.name (the + parent PodcastSeries) or partOfSeries.name + - The visible publisher / show title near the episode title + show_name is the broadcast / podcast title (e.g. "Zeitzeichen", + "This American Life") — NOT the publisher's company name and NOT + the URL hostname. Leave blank when you can't pinpoint it. 5. Classify source_kind (canonical | aggregator | unknown) and aggregator_provider when applicable. 6. 
Cross-link ONLY when the submitted page is itself a podcast @@ -153,6 +163,7 @@ class EpisodeDetails(BaseModel): """ title: str | None = None + show_name: str | None = None description: str | None = None published_at: date | None = None image_url: str | None = None diff --git a/episodes/downloader.py b/episodes/downloader.py index 35be8ad..3a86ae6 100644 --- a/episodes/downloader.py +++ b/episodes/downloader.py @@ -98,7 +98,18 @@ def _save_audio(episode: Episode, src_path: str) -> int: def _show_name(episode: Episode) -> str: - """Best-effort show name (no Show model — fall back to URL host).""" + """Best-effort show name. + + Prefers ``Episode.show_name`` (extracted by the fetch_details agent + from ``og:site_name`` / RSS ``<channel><title>`` / JSON-LD, etc.). + Falls back to the URL host so the download agent always has *some* + string to work with — but the download agent prompt knows that a + hostname-shaped show_name is a degraded signal and should match + candidates by ``(title, published_at)`` instead. + """ + if episode.show_name: + return episode.show_name + from urllib.parse import urlparse netloc = urlparse(episode.url).netloc @@ -168,6 +179,7 @@ def download_episode(episode_id: int) -> None: show_name=_show_name(episode), guid=episode.guid or "", language=episode.language or "", + published_at=episode.published_at, ) if agent_result.success and agent_result.downloaded_file: diff --git a/episodes/fetch_details_step.py b/episodes/fetch_details_step.py index d382864..5470bfe 100644 --- a/episodes/fetch_details_step.py +++ b/episodes/fetch_details_step.py @@ -82,6 +82,15 @@ def _apply_details(episode: Episode, details) -> list[str]: value = getattr(details, name) setattr(episode, name, value if isinstance(value, str) else "") + # show_name is "additive only" — we never want a fresh agent run + # that fails to extract a show name to wipe out a previously-good + # value (or a user edit in the admin). Write only when non-empty. 
+ touched_fields = list(string_fields) + show_name_value = getattr(details, "show_name", None) + if isinstance(show_name_value, str) and show_name_value.strip(): + episode.show_name = show_name_value.strip() + touched_fields.append("show_name") + # Nullable date — write the value or clear it. if isinstance(details.published_at, date): episode.published_at = details.published_at @@ -92,7 +101,7 @@ def _apply_details(episode: Episode, details) -> list[str]: # always reflect the agent's classification. episode.source_kind = details.source_kind or Episode.SourceKind.UNKNOWN - return [*string_fields, "published_at", "source_kind"] + return [*touched_fields, "published_at", "source_kind"] def _persist_run( diff --git a/episodes/migrations/0025_episode_show_name.py b/episodes/migrations/0025_episode_show_name.py new file mode 100644 index 0000000..07efa3d --- /dev/null +++ b/episodes/migrations/0025_episode_show_name.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.12 on 2026-05-04 08:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('episodes', '0024_fetch_details_cross_linking'), + ] + + operations = [ + migrations.AddField( + model_name='episode', + name='show_name', + field=models.CharField(blank=True, default='', max_length=255), + ), + ] diff --git a/episodes/models.py b/episodes/models.py index 09e63cc..52583bb 100644 --- a/episodes/models.py +++ b/episodes/models.py @@ -31,6 +31,7 @@ class SourceKind(models.TextChoices): # Metadata fields (populated by fetch_details step) title = models.CharField(max_length=500, blank=True, default="") + show_name = models.CharField(max_length=255, blank=True, default="") description = models.TextField(blank=True, default="") published_at = models.DateField(null=True, blank=True) image_url = models.URLField(max_length=2000, blank=True, default="") diff --git a/episodes/podcast_aggregators/base.py b/episodes/podcast_aggregators/base.py index 0a988fd..4ef6be0 100644 --- 
a/episodes/podcast_aggregators/base.py +++ b/episodes/podcast_aggregators/base.py @@ -4,6 +4,7 @@ import dataclasses from abc import ABC, abstractmethod +from datetime import date @dataclasses.dataclass(frozen=True) @@ -15,6 +16,13 @@ class EpisodeCandidate: agent (and ``DownloadResult``) can record which aggregator hit. The legacy field name is preserved to avoid churn in callers that only consume the value as an opaque label. + + ``published_at`` is the candidate's publication date (when the + aggregator surfaces one). The download agent uses it as a + tiebreaker when ``show_name`` is degraded — e.g. for non-English + publishers whose extracted ``show_name`` falls back to the URL + host. ``None`` means the aggregator did not expose a date or it + failed to parse. """ audio_url: str @@ -27,6 +35,7 @@ class EpisodeCandidate: # Fetch Details agent's cross-linking flow; ignored by the # Download cascade. Empty when the aggregator doesn't expose one. episode_page_url: str = "" + published_at: date | None = None class PodcastAggregator(ABC): diff --git a/episodes/podcast_aggregators/fyyd.py b/episodes/podcast_aggregators/fyyd.py index 72a5830..4867909 100644 --- a/episodes/podcast_aggregators/fyyd.py +++ b/episodes/podcast_aggregators/fyyd.py @@ -7,6 +7,7 @@ from __future__ import annotations import logging +from datetime import date, datetime from typing import Any import httpx @@ -82,10 +83,46 @@ def _candidate(self, item: dict) -> EpisodeCandidate | None: duration = item.get("duration") if not isinstance(duration, int): duration = None + published_at = _parse_pubdate(item.get("pubdate")) return EpisodeCandidate( audio_url=audio_url, title=item.get("title") or "", show_name=show_name, duration_seconds=duration, source_index=self.name, + published_at=published_at, ) + + +def _parse_pubdate(raw: Any) -> date | None: + """Parse fyyd's ``pubdate`` field to a ``date``. + + fyyd documents the value as ISO 8601 (e.g. ``"2024-08-30 04:00:00"``). 
+ Returns ``None`` on missing / unparseable input — never raises, so + a single broken row doesn't drop a candidate. + """ + if not raw: + return None + if isinstance(raw, date) and not isinstance(raw, datetime): + return raw + if isinstance(raw, datetime): + return raw.date() + if not isinstance(raw, str): + logger.warning("fyyd pubdate has unexpected type %s: %r", type(raw), raw) + return None + text = raw.strip() + if not text: + return None + # Try a few likely shapes: "YYYY-MM-DD HH:MM:SS", ISO 8601 with T, + # bare "YYYY-MM-DD". + for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"): + try: + return datetime.strptime(text[: len(fmt) + 4], fmt).date() + except ValueError: + continue + # Last resort: ``fromisoformat`` (handles offsets, fractional seconds, etc.) + try: + return datetime.fromisoformat(text.replace("Z", "+00:00")).date() + except ValueError: + logger.warning("fyyd pubdate could not be parsed: %r", raw) + return None diff --git a/episodes/podcast_aggregators/itunes.py b/episodes/podcast_aggregators/itunes.py index 5a16c89..bd353eb 100644 --- a/episodes/podcast_aggregators/itunes.py +++ b/episodes/podcast_aggregators/itunes.py @@ -18,6 +18,7 @@ from __future__ import annotations import logging +from datetime import date, datetime from typing import Any import httpx @@ -82,6 +83,7 @@ def _candidate(self, item: dict) -> EpisodeCandidate | None: duration_seconds: int | None = None if isinstance(duration_ms, int) and duration_ms > 0: duration_seconds = duration_ms // 1000 + published_at = _parse_release_date(item.get("releaseDate")) return EpisodeCandidate( audio_url=audio_url, title=item.get("trackName") or "", @@ -89,4 +91,30 @@ def _candidate(self, item: dict) -> EpisodeCandidate | None: duration_seconds=duration_seconds, source_index=self.name, episode_page_url=item.get("trackViewUrl") or "", + published_at=published_at, ) + + +def _parse_release_date(raw: Any) -> date | None: + """Parse iTunes' ``releaseDate`` (ISO 8601 datetime) to a 
``date``. + + Example value: ``"2024-08-30T04:00:00Z"``. Returns ``None`` on + missing / unparseable input — never raises. + """ + if not raw: + return None + if isinstance(raw, date) and not isinstance(raw, datetime): + return raw + if isinstance(raw, datetime): + return raw.date() + if not isinstance(raw, str): + logger.warning("iTunes releaseDate has unexpected type %s: %r", type(raw), raw) + return None + text = raw.strip() + if not text: + return None + try: + return datetime.fromisoformat(text.replace("Z", "+00:00")).date() + except ValueError: + logger.warning("iTunes releaseDate could not be parsed: %r", raw) + return None diff --git a/episodes/podcast_aggregators/podcastindex.py b/episodes/podcast_aggregators/podcastindex.py index 17f0076..eb72481 100644 --- a/episodes/podcast_aggregators/podcastindex.py +++ b/episodes/podcast_aggregators/podcastindex.py @@ -11,6 +11,7 @@ import hashlib import logging import time +from datetime import date, datetime, timezone from typing import Any import httpx @@ -107,6 +108,7 @@ def _candidates_from_payload(self, payload: Any, key: str) -> list[EpisodeCandid duration = item.get("duration") if not isinstance(duration, int): duration = None + published_at = _parse_date_published(item.get("datePublished")) results.append( EpisodeCandidate( audio_url=audio_url, @@ -114,6 +116,35 @@ def _candidates_from_payload(self, payload: Any, key: str) -> list[EpisodeCandid show_name=item.get("feedTitle") or item.get("title") or "", duration_seconds=duration, source_index=self.name, + published_at=published_at, ) ) return results + + +def _parse_date_published(raw: Any) -> date | None: + """Parse podcastindex's ``datePublished`` (Unix epoch seconds) to ``date``. + + Returns ``None`` on missing / unparseable input — never raises. + """ + if raw is None or raw == "": + return None + if isinstance(raw, bool): # bool is a subclass of int; reject explicitly. 
+ logger.warning("podcastindex datePublished is bool: %r", raw) + return None + if isinstance(raw, (int, float)): + try: + return datetime.fromtimestamp(raw, tz=timezone.utc).date() + except (OverflowError, OSError, ValueError): + logger.warning("podcastindex datePublished out of range: %r", raw) + return None + if isinstance(raw, str): + try: + return datetime.fromtimestamp(int(raw), tz=timezone.utc).date() + except (TypeError, ValueError): + logger.warning("podcastindex datePublished not numeric: %r", raw) + return None + logger.warning( + "podcastindex datePublished has unexpected type %s: %r", type(raw), raw, + ) + return None diff --git a/episodes/tests/test_download.py b/episodes/tests/test_download.py index b7d2cb2..4866392 100644 --- a/episodes/tests/test_download.py +++ b/episodes/tests/test_download.py @@ -175,3 +175,35 @@ def test_wrong_status_skips(self): episode.refresh_from_db() self.assertEqual(episode.status, Episode.Status.QUEUED) + + +class ShowNameTests(TestCase): + """`_show_name(episode)` cascade: episode.show_name → URL host.""" + + def _create_episode(self, **kwargs): + with patch("episodes.signals.DBOS"): + return Episode.objects.create(**kwargs) + + def test_returns_show_name_when_set(self): + from episodes.downloader import _show_name + + episode = self._create_episode( + url="https://www.ardsounds.de/ep/x", + show_name="Zeitzeichen", + ) + self.assertEqual(_show_name(episode), "Zeitzeichen") + + def test_falls_back_to_host_when_show_name_blank(self): + from episodes.downloader import _show_name + + episode = self._create_episode( + url="https://www.ardsounds.de/ep/y", + ) + self.assertEqual(_show_name(episode), "www.ardsounds.de") + + def test_returns_empty_when_url_has_no_host(self): + from episodes.downloader import _show_name + + # URL without netloc (e.g. relative path stored erroneously) — return "". 
+ episode = Episode(url="not-a-url", show_name="") + self.assertEqual(_show_name(episode), "") diff --git a/episodes/tests/test_models.py b/episodes/tests/test_models.py index 6f77e2c..201b59f 100644 --- a/episodes/tests/test_models.py +++ b/episodes/tests/test_models.py @@ -32,6 +32,7 @@ def test_new_statuses_exist(self, mock_async): def test_metadata_fields_blank_by_default(self, mock_async): episode = Episode.objects.create(url="https://example.com/ep/1") self.assertEqual(episode.title, "") + self.assertEqual(episode.show_name, "") self.assertEqual(episode.description, "") self.assertIsNone(episode.published_at) self.assertEqual(episode.image_url, "") @@ -42,3 +43,11 @@ def test_metadata_fields_blank_by_default(self, mock_async): self.assertEqual(episode.canonical_url, "") self.assertEqual(episode.source_kind, Episode.SourceKind.UNKNOWN) self.assertEqual(episode.aggregator_provider, "") + + def test_show_name_can_be_set(self, mock_async): + episode = Episode.objects.create( + url="https://example.com/ep/1", + show_name="Zeitzeichen", + ) + episode.refresh_from_db() + self.assertEqual(episode.show_name, "Zeitzeichen") diff --git a/episodes/tests/test_podcast_aggregators.py b/episodes/tests/test_podcast_aggregators.py index d47a0c8..8f7ced4 100644 --- a/episodes/tests/test_podcast_aggregators.py +++ b/episodes/tests/test_podcast_aggregators.py @@ -1,5 +1,6 @@ """Tests for the podcast aggregator provider abstraction.""" +from datetime import date from unittest.mock import patch import httpx @@ -67,6 +68,57 @@ def test_http_error_returns_empty(self): get.side_effect = httpx.ConnectError("nope") self.assertEqual(FyydAggregator().search(title="x"), []) + def test_pubdate_parsed_to_date(self): + payload = { + "data": [ + { + "title": "Django Reinhardt", + "enclosure": "https://wdr.example/episode.mp3", + "pubdate": "2024-08-30 04:00:00", + "podcast": {"title": "Zeitzeichen"}, + } + ] + } + with patch("episodes.podcast_aggregators.fyyd.httpx.get") as get: + 
get.return_value = _FakeResponse(payload) + results = FyydAggregator().search( + title="Django Reinhardt", show_name="Zeitzeichen" + ) + self.assertEqual(len(results), 1) + self.assertEqual(results[0].published_at, date(2024, 8, 30)) + + def test_missing_pubdate_returns_none(self): + payload = { + "data": [ + { + "title": "x", + "enclosure": "https://e/ep.mp3", + "podcast": {"title": "y"}, + } + ] + } + with patch("episodes.podcast_aggregators.fyyd.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = FyydAggregator().search(title="x") + self.assertIsNone(results[0].published_at) + + def test_malformed_pubdate_returns_none(self): + payload = { + "data": [ + { + "title": "x", + "enclosure": "https://e/ep.mp3", + "pubdate": "not-a-date", + "podcast": {"title": "y"}, + } + ] + } + with patch("episodes.podcast_aggregators.fyyd.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = FyydAggregator().search(title="x") + self.assertEqual(len(results), 1) + self.assertIsNone(results[0].published_at) + class ItunesAggregatorTests(SimpleTestCase): def test_search_returns_candidates(self): @@ -108,6 +160,54 @@ def test_http_error_returns_empty(self): get.side_effect = httpx.ConnectError("nope") self.assertEqual(ItunesAggregator().search(title="x"), []) + def test_release_date_parsed_to_date(self): + payload = { + "results": [ + { + "trackName": "x", + "collectionName": "Show", + "episodeUrl": "https://e/ep.mp3", + "releaseDate": "2024-08-30T04:00:00Z", + } + ] + } + with patch("episodes.podcast_aggregators.itunes.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = ItunesAggregator().search(title="x", show_name="Show") + self.assertEqual(results[0].published_at, date(2024, 8, 30)) + + def test_missing_release_date_returns_none(self): + payload = { + "results": [ + { + "trackName": "x", + "collectionName": "Show", + "episodeUrl": "https://e/ep.mp3", + } + ] + } + with 
patch("episodes.podcast_aggregators.itunes.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = ItunesAggregator().search(title="x", show_name="Show") + self.assertIsNone(results[0].published_at) + + def test_malformed_release_date_returns_none(self): + payload = { + "results": [ + { + "trackName": "x", + "collectionName": "Show", + "episodeUrl": "https://e/ep.mp3", + "releaseDate": "garbage", + } + ] + } + with patch("episodes.podcast_aggregators.itunes.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = ItunesAggregator().search(title="x", show_name="Show") + self.assertEqual(len(results), 1) + self.assertIsNone(results[0].published_at) + class PodcastIndexOrgTests(SimpleTestCase): def test_guid_lookup_first(self): @@ -137,6 +237,55 @@ def test_falls_back_to_term_search_when_guid_missing(self): called_url = get.call_args[0][0] self.assertIn("/search/byterm", called_url) + def test_date_published_epoch_parsed_to_date(self): + # 2024-08-30T04:00:00Z → 1724990400 epoch seconds. 
+ payload = { + "feeds": [ + { + "title": "x", + "feedTitle": "Show", + "enclosureUrl": "https://e/ep.mp3", + "datePublished": 1724990400, + } + ] + } + with patch("episodes.podcast_aggregators.podcastindex.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = PodcastIndexOrg("k", "s").search(title="x", show_name="Show") + self.assertEqual(results[0].published_at, date(2024, 8, 30)) + + def test_missing_date_published_returns_none(self): + payload = { + "feeds": [ + { + "title": "x", + "feedTitle": "Show", + "enclosureUrl": "https://e/ep.mp3", + } + ] + } + with patch("episodes.podcast_aggregators.podcastindex.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = PodcastIndexOrg("k", "s").search(title="x", show_name="Show") + self.assertIsNone(results[0].published_at) + + def test_malformed_date_published_returns_none(self): + payload = { + "feeds": [ + { + "title": "x", + "feedTitle": "Show", + "enclosureUrl": "https://e/ep.mp3", + "datePublished": "garbage", + } + ] + } + with patch("episodes.podcast_aggregators.podcastindex.httpx.get") as get: + get.return_value = _FakeResponse(payload) + results = PodcastIndexOrg("k", "s").search(title="x", show_name="Show") + self.assertEqual(len(results), 1) + self.assertIsNone(results[0].published_at) + class FactoryTests(SimpleTestCase): @override_settings(RAGTIME_PODCAST_AGGREGATORS="")