diff --git a/AGENTS.md b/AGENTS.md index 8a30035..942b34e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,6 +18,7 @@ deterministic mp4 render. Human edits survive AI regeneration of the base. - `pnpm reframe compile [-o out.json] [--stdin] [--code ""] [--json]` — bundle + validate eDSL source into SceneIR JSON, NO render (no ffmpeg/chromium; fast). On failure: a concise classified error (`bundle`/`eval`/`validation`), never the base64 bundle; `--json` makes it `{ok:false,error,kind,issues?}` where `issues` is the structured validation problems (each `{code,path,message}` — e.g. `code:"unknown-blend", path:"nodes.box"`). The in-process equivalent is exported as `reframe-video/compile` (`loadScene`/`loadSceneFromCode`/`checkDeterminism`, server-only); a thrown `SceneValidationError` carries `.issues` (and `.problems` for back-compat), and `SceneLoadError.issues` propagates them across the scene bundle. Entry `packages/render-cli/src/compile.ts`; loader `loadScene.ts`. - `pnpm reframe frame [--t ] [-o out.png]` — render ONE frame at time `t` to a PNG (same renderer as `render`, no ffmpeg muxing; chromium only). For an agentic render-and-look loop (feed the frame back to a model). Reuses `renderFrameAt` (`frameLoop.ts`); entry `packages/render-cli/src/frame.ts`. - `pnpm reframe assemble [-o name] [--title "…"] [--bgm ] [--hold s] [--seed N]` — the **files → scene** path: probe each image/video for its real duration (ffprobe) and scaffold an editable montage scene `.ts` wiring `photoMontage` (clip-aware holds, no freeze) + an optional `title` + a music bed. Probed numbers are baked in → the emitted scene is a normal deterministic scene. Probe `packages/render-cli/src/media/probe.ts`; entry `assemble.ts`. +- `pnpm reframe narrate [--voice ] [--max-speed n] [--script ] [--dry-run]` — **scene-fitted Kokoro voiceover**. 
Reads a sibling `-vo/script.json` of `{ at, text }` lines (imported into `audio.narration`), computes each line's slot from the compiled label clock, synthesizes it with a Kokoro python sidecar (`narrate.py`), and **auto-fits** its speech rate to the slot (bounded by `--max-speed`, default 1.3; warns if even max overruns). Bakes `file`/`voice`/`speed`/`duration` back into `script.json` (like `assemble` bakes ffprobe numbers); the scene then plays each line as a label-anchored `file` cue (survives retiming/regen) with the bed ducking under the whole utterance. `--dry-run` prints the fit table from a length *estimate* (no synthesis, no Kokoro needed). Kokoro is an **optional dep** (`pip install kokoro` + espeak-ng), preflighted like ffmpeg/chromium; the `.wav` are external assets (same-machine, not golden) — commit `script.json` + wavs together. Entry `packages/render-cli/src/narrate.ts` + sidecar `narrate.py`; the IR field is `AudioIR.narration` (`packages/core/src/ir.ts`), resolved in `resolveAudioPlan` (`audio.ts`). See `examples/scenes/narrated-demo.ts`. - `pnpm reframe manifest [--json]` — dump the scene's **addressable surface**: every node (+ its `editableProps` and `animatedProps`), state, timeline label (+ `patchable` params), beat, and behavior, each with the overlay address that reaches it. The map an AI/human editor reads to patch a scene surgically (vs regenerating). Core `sceneManifest(compiled)` (`packages/core/src/manifest.ts`, exported); entry `packages/render-cli/src/manifest.ts`. - `pnpm reframe lint [--json] [--strict]` — the **studio-readiness gate**: (a) flag un-addressable motion (a tween/to/motionPath with no `label` can't be retimed by an overlay and a regen can silently drop it) + a `motionAddressableRatio` summary, and (b) for a `.ts` source, verify the scene is a **pure function of time** (`non-deterministic-render` finding) — it bundles once and evaluates TWICE, reporting the first IR address that differs (e.g. 
a `Math.random()`/`Date` baked into a prop), since a non-pure scene silently compiles to a different IR each time. `--strict` exits non-zero on findings (CI gate). Core `lintScene(compiled)` (addressability); `checkDeterminism(path)` (purity, exported via `reframe-video/compile`, `packages/render-cli/src/determinism.ts`); entry `lint.ts`. - `pnpm reframe verify-overlay ... [--json]` — compose an overlay onto a base and report applied-vs-orphaned, NO render. The regen-survival check: run vs the original base (all applied), then vs the AI-regenerated base — any orphan is a broken stable address. Non-zero exit on orphans (CI gate). Reuses `composeScene`/`formatComposeReport`; entry `verifyOverlay.ts`. @@ -290,7 +291,11 @@ addition to the git push, so both channels carry the same skill. - Audio: `scene.audio` cues anchor to timeline labels (they survive retiming); sfx are procedurally synthesized, CC0 samples live in `assets/sfx/` (LICENSE.md records provenance). Determinism contract covers the AudioPlan - and WAV bytes, not AAC-encoded mp4 bytes. + and WAV bytes, not AAC-encoded mp4 bytes. `audio.narration` lines (spoken VO) + resolve to label-anchored `file` cues after `reframe narrate` synthesizes them; + the Kokoro `.wav` are **external assets** (same-machine, version-dependent, like + images) — NOT part of the golden contract. Synthesis is out-of-band; only the + AudioPlan (cue timing + the baked `duration`-sized duck window) is deterministic. - Golden snapshots in `packages/core/test/__snapshots__` encode the determinism contract; if they change unexpectedly, that's a regression, not noise. diff --git a/CHANGELOG.md b/CHANGELOG.md index 440b74d..589404f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,27 @@ versions may change them. ## [Unreleased] +## [0.6.44] - 2026-06-21 + +### Added + +#### Scene-fitted Kokoro narration (`reframe narrate` + `audio.narration`) + +- New IR field **`AudioIR.narration`** — spoken voiceover lines (`{ at, text, voice? 
}`) + authored as a sibling `-vo/script.json` the scene imports. Each line resolves to a + **label-anchored `file` cue** (so VO survives retiming/regen), with a baked `duration` + sizing the bed's duck window. Additive + golden-safe (no narration → byte-identical plan). +- New command **`reframe narrate [--voice] [--max-speed] [--dry-run]`** — reads the + compiled **label clock**, synthesizes each line with a Kokoro python sidecar (`narrate.py`), + and **auto-fits** its speech rate to the slot between its anchor and the next line (bounded; + warns if even max speed overruns). Bakes `file`/`voice`/`speed`/`duration` back into + `script.json` (like `assemble` bakes ffprobe numbers). `--dry-run` prints the fit table from + a length estimate with no synthesis. +- Kokoro is an **optional dependency** (`pip install kokoro` + espeak-ng), preflighted like + ffmpeg/chromium. The `.wav` are external assets (same-machine, not golden) — the determinism + contract still covers the AudioPlan, not the synthesized audio bytes. +- Example `examples/scenes/narrated-demo.ts` (+ `narrated-demo-vo/script.json`). + ## [0.6.43] - 2026-06-21 ### Added diff --git a/README.md b/README.md index e211f7f..6251291 100644 --- a/README.md +++ b/README.md @@ -280,6 +280,7 @@ your scene. | `pnpm reframe verify-overlay ... 
[--json]` | compose an overlay onto a base and report applied-vs-orphaned, no render — the regen-survival check (non-zero exit on orphans) | | `pnpm reframe labels ` | print the compiled event clock (every timeline label → exact seconds) — the timing source for audio cues | | `pnpm reframe assemble [-o name]` | probe images/videos (ffprobe) and scaffold an editable montage scene `.ts` wired with `photoMontage` | +| `pnpm reframe narrate [--voice ] [--max-speed n] [--dry-run]` | scene-fitted Kokoro voiceover: synth each `audio.narration` line and auto-fit its rate to the slot (needs python + `kokoro`) | | `pnpm reframe player [-o out.html]` | bundle a scene into one self-contained HTML that plays the motion live in any browser | | `pnpm reframe logo [--motion ]` | animate a logo (or a simple-icons brand) into a sting | | `pnpm reframe diff [scene.ts] [--t ] [--mode side\|blend\|diff\|grid]` | compare a render against a reference image | @@ -362,7 +363,7 @@ site. The [`docs/`](docs/) folder is its [Mintlify](https://mintlify.com) source |---|---| | [Introduction](docs/introduction.mdx) · [Quickstart](docs/quickstart.mdx) · [The loop](docs/the-loop.mdx) | the pitch, install, and the AI-write / human-edit / deterministic-render model | | [Gallery](docs/gallery.mdx) | a curated visual reel of scenes | -| [Examples](examples/README.md) | all 67 example scenes, by category | +| [Examples](examples/README.md) | all 68 example scenes, by category | | [Guides](docs/guides/) | the eDSL, directing, HTML/GSAP, and regeneration-contract guides (also `pnpm reframe guide`) | Curated renders live in [`docs/assets/gallery/`](docs/assets/gallery) and accumulate via `pnpm gallery` (the committed home; `out/` stays scratch). 
@@ -375,7 +376,7 @@ Curated renders live in [`docs/assets/gallery/`](docs/assets/gallery) and accumu | `packages/renderer-canvas` | DisplayList → Canvas 2D (browser + capture shared) | | `packages/render-cli` | Playwright capture + ffmpeg encode; also renders arbitrary HTML/GSAP deterministically via a virtual clock | | `packages/preview` | the Vite editor | -| `examples/` | 67 example scenes (see [`examples/README.md`](examples/README.md)), overlays, compositions, the edit-survival demo | +| `examples/` | 68 example scenes (see [`examples/README.md`](examples/README.md)), overlays, compositions, the edit-survival demo | | `labs/` | experiments and product probes (live-data → baked scene → render), kept out of `examples/` so it stays purely demonstrative | | `docs/` | the [Mintlify](https://mintlify.com)-ready docs site + the authoring guides (also `pnpm reframe guide`) | | `benchmark/` | **measurement artifacts, not product code**: LLM generation benchmark (RESULTS/ANALYSIS.md), regeneration-contract experiment (regen/), calibrated motion profiler (harness/motion/, MOTION.md) | diff --git a/docs/cli-reference.mdx b/docs/cli-reference.mdx index 909f223..ea6a3c0 100644 --- a/docs/cli-reference.mdx +++ b/docs/cli-reference.mdx @@ -15,6 +15,7 @@ Run any command with `npx reframe-video ` (no clone needed) or `pnpm re | `player [-o out.html]` | bundle a scene into one self-contained HTML that plays the motion live in any browser | | `logo [--motion ] [--energy n] [--seed n]` | animate a logo (or a simple-icons brand) into a sting | | `assemble [-o name] [--title "…"] [--bgm ]` | probe images/videos (ffprobe) and scaffold an editable montage scene `.ts` | +| `narrate [--voice ] [--max-speed n] [--dry-run]` | scene-fitted Kokoro voiceover — synth each `audio.narration` line and auto-fit its rate to the slot (needs python + `kokoro`; `--dry-run` estimates without synthesis) | ## Inspect & validate diff --git a/docs/examples.mdx b/docs/examples.mdx index 9d880a3..ad7661b 
100644 --- a/docs/examples.mdx +++ b/docs/examples.mdx @@ -1,6 +1,6 @@ --- title: Examples -description: "All 67 example scenes, by category — each a single self-contained file you can render." +description: "All 68 example scenes, by category — each a single self-contained file you can render." --- Every scene is one `.ts` file in [`examples/scenes/`](https://github.com/kiyeonjeon21/reframe/tree/main/examples/scenes) — self-contained and dependency-free. Render any of them: @@ -35,7 +35,7 @@ The [gallery](/gallery) has the curated visual reel; the [repo README](https://g [annual-report](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/annual-report.ts) · [chart-buildup](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/chart-buildup.ts) · [data-explainer](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/data-explainer.ts) · [flow-diagram](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/flow-diagram.ts) · [github-year](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/github-year.ts) ## Audio -[audio-visualizer](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/audio-visualizer.ts) · [auto-foley-demo](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/auto-foley-demo.ts) · [sample-showcase](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sample-showcase.ts) · [sfx-compare](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sfx-compare.ts) · [sfx-showcase](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sfx-showcase.ts) +[audio-visualizer](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/audio-visualizer.ts) · [auto-foley-demo](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/auto-foley-demo.ts) · [narrated-demo](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/narrated-demo.ts) · 
[sample-showcase](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sample-showcase.ts) · [sfx-compare](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sfx-compare.ts) · [sfx-showcase](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/sfx-showcase.ts) ## Logo stings [logo-reveal](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/logo-reveal.ts) · [logo-reveal-regen](https://github.com/kiyeonjeon21/reframe/blob/main/examples/scenes/logo-reveal-regen.ts) diff --git a/docs/gallery.mdx b/docs/gallery.mdx index eabb65d..d02507f 100644 --- a/docs/gallery.mdx +++ b/docs/gallery.mdx @@ -3,7 +3,7 @@ title: Gallery description: "A reel of reframe scenes — each a few-line declaration, each a deterministic render." --- -Every clip below is a scene in [`examples/scenes/`](https://github.com/kiyeonjeon21/reframe/tree/main/examples/scenes). Render any of them yourself with `npx reframe-video render examples/scenes/.ts`. The full list — 67 scenes by category — is on the [Examples](/examples) page. +Every clip below is a scene in [`examples/scenes/`](https://github.com/kiyeonjeon21/reframe/tree/main/examples/scenes). Render any of them yourself with `npx reframe-video render examples/scenes/.ts`. The full list — 68 scenes by category — is on the [Examples](/examples) page. These gifs are the curated showcase. New renders accumulate here via `pnpm gallery` (the committed home, vs the gitignored `out/` scratch). diff --git a/examples/README.md b/examples/README.md index 2e1d3eb..2a7672a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,6 +1,6 @@ # Examples -67 curated scenes, one per `.ts` file in [`scenes/`](scenes). Each is a single, self-contained, dependency-free document — render any of them: +68 curated scenes, one per `.ts` file in [`scenes/`](scenes). 
Each is a single, self-contained, dependency-free document — render any of them: ```bash pnpm reframe render examples/scenes/.ts # in this repo @@ -90,6 +90,7 @@ Also here: [`overlays/`](overlays) (human-edit layers), [`compositions/`](compos |---|---| | `audio-visualizer` | "THE DROP": radial spectrum bars, a pulsing core, a particle burst. | | `auto-foley-demo` | `autoFoley` scoring motion — whoosh / thud / pop following the tweens. | +| `narrated-demo` | Scene-fitted Kokoro voiceover: `audio.narration` from a sibling `script.json`, each line auto-fitted to its slot by `reframe narrate`, bed ducking under it. | | `sample-showcase` | The CC0 sample library: keypress / footstep / click / confirm / UI sounds. | | `sfx-compare` | Synth vs sample A/B for the six original names. | | `sfx-showcase` | The procedural SFX palette, per-cue seeded variation as a little melody. | diff --git a/examples/scenes/narrated-demo-vo/close.wav b/examples/scenes/narrated-demo-vo/close.wav new file mode 100644 index 0000000..ebc8a70 Binary files /dev/null and b/examples/scenes/narrated-demo-vo/close.wav differ diff --git a/examples/scenes/narrated-demo-vo/intro.wav b/examples/scenes/narrated-demo-vo/intro.wav new file mode 100644 index 0000000..98df9e0 Binary files /dev/null and b/examples/scenes/narrated-demo-vo/intro.wav differ diff --git a/examples/scenes/narrated-demo-vo/point.wav b/examples/scenes/narrated-demo-vo/point.wav new file mode 100644 index 0000000..c30fdf0 Binary files /dev/null and b/examples/scenes/narrated-demo-vo/point.wav differ diff --git a/examples/scenes/narrated-demo-vo/script.json b/examples/scenes/narrated-demo-vo/script.json new file mode 100644 index 0000000..ec84705 --- /dev/null +++ b/examples/scenes/narrated-demo-vo/script.json @@ -0,0 +1,23 @@ +[ + { + "at": "intro", + "text": "This is reframe.", + "file": "narrated-demo-vo/intro.wav", + "voice": "af_heart", + "duration": 1.775 + }, + { + "at": "point", + "text": "Anchored to the timeline, it survives.", 
+ "file": "narrated-demo-vo/point.wav", + "voice": "af_heart", + "duration": 2.775 + }, + { + "at": "close", + "text": "Open source.", + "file": "narrated-demo-vo/close.wav", + "voice": "af_heart", + "duration": 1.65 + } +] diff --git a/examples/scenes/narrated-demo.ts b/examples/scenes/narrated-demo.ts new file mode 100644 index 0000000..736c133 --- /dev/null +++ b/examples/scenes/narrated-demo.ts @@ -0,0 +1,45 @@ +// Narrated demo — a pure-vector scene whose voiceover is authored as a sibling +// `narrated-demo-vo/script.json` (imported into `audio.narration`) and synthesized +// + fitted to the timeline by `reframe narrate`. Each line anchors to a timeline +// label, so the VO survives retiming/regen; `narrate` reads the label clock and +// fits each line's speech rate to its slot. +// +// reframe narrate examples/scenes/narrated-demo.ts --dry-run # fit table (no synth) +// reframe narrate examples/scenes/narrated-demo.ts # synth + fit (python+kokoro) +// reframe render examples/scenes/narrated-demo.ts # mp4, bed ducks under the VO +// +// The .wav are out-of-band assets (not bundled to npm, not golden) — commit +// script.json + the generated wavs together. Image/audio file cues don't render +// in player/artifacts; mp4 only. 
+ +import { scene, rect, text, seq, tween, wait, linearGradient } from "@reframe/core"; +import vo from "./narrated-demo-vo/script.json"; + +const W = 1920, H = 1080; + +export default scene({ + id: "narrated-demo", + size: { width: W, height: H }, + fps: 30, + background: "#06070C", + nodes: [ + rect({ id: "bg", x: 0, y: 0, width: W, height: H, fill: linearGradient(["#0A1430", "#06070C"], { angle: 90 }) }), + text({ id: "title", x: W / 2, y: H / 2 - 40, anchor: "center", content: "reframe", fontFamily: "Inter", fontSize: 200, fontWeight: 800, fill: "#FFFFFF", opacity: 0 }), + text({ id: "sub", x: W / 2, y: H / 2 + 110, anchor: "center", content: "voice that fits the scene", fontFamily: "Inter", fontSize: 46, fontWeight: 500, fill: "#7FB4FF", opacity: 0 }), + ], + // labels (intro / point / close) are the stable anchors the narration lines bind to + timeline: seq( + wait(0.4), + tween("title", { opacity: 1 }, { duration: 0.6, ease: "easeOutCubic", label: "intro" }), + wait(2.4), + tween("sub", { opacity: 1 }, { duration: 0.5, ease: "easeOutCubic", label: "point" }), + wait(3.2), + tween("title", { opacity: 0 }, { duration: 0.6, ease: "easeInCubic", label: "close" }), + tween("sub", { opacity: 0 }, { duration: 0.6, ease: "easeInCubic", label: "close-sub" }), + wait(0.6), + ), + audio: { + bgm: { synth: "ambient-pad", gain: 0.25, fadeIn: 1, fadeOut: 1.5, duck: { depth: 0.6 } }, + narration: vo, + }, +}); diff --git a/examples/tsconfig.json b/examples/tsconfig.json index 5919ed7..51bc6cf 100644 --- a/examples/tsconfig.json +++ b/examples/tsconfig.json @@ -2,7 +2,8 @@ "extends": "../tsconfig.base.json", "compilerOptions": { "lib": ["ES2022", "DOM", "DOM.Iterable"], - "types": ["node"] + "types": ["node"], + "resolveJsonModule": true }, "include": ["scenes", "scripts"] } diff --git a/packages/core/src/audio.ts b/packages/core/src/audio.ts index 907e167..7919529 100644 --- a/packages/core/src/audio.ts +++ b/packages/core/src/audio.ts @@ -115,7 +115,8 @@ export function 
resolveAudioPlan(compiled: CompiledScene): AudioPlan | null { ? autoFoley(compiled, audio.autoFoley === true ? {} : audio.autoFoley) : []; const manualCues = [...(audio?.cues ?? []), ...autoCues]; - if (!audio || (!audio.bgm && manualCues.length === 0)) { + const narrationLines = audio?.narration ?? []; + if (!audio || (!audio.bgm && manualCues.length === 0 && narrationLines.length === 0)) { // a scene with only video-clip audio still gets a plan return clipAudio.length === 0 ? null @@ -160,6 +161,46 @@ export function resolveAudioPlan(compiled: CompiledScene): AudioPlan | null { : { kind: "file", path: cue.file! }, }); } + + // Narration lines render as label-anchored file cues (after `reframe narrate` + // bakes their wav). Each carries a real `duration`, so the bed ducks under the + // whole utterance. An un-synthesized line (no `file`) warns and is skipped. + for (const [index, line] of narrationLines.entries()) { + let anchor: number; + if (typeof line.at === "number") { + anchor = line.at; + } else { + const span = compiled.labelTimes.get(line.at); + if (!span) { + warnings.push(`narration[${index}]: unknown label "${line.at}" — dropped`); + continue; + } + anchor = span.t0; + } + if (!line.file) { + warnings.push(`narration "${line.at}" not synthesized — run reframe narrate`); + continue; + } + const t = Math.max(0, anchor + (line.offset ?? 0)); + if (t >= duration) { + warnings.push(`narration "${line.at}" at ${t.toFixed(2)}s starts past the scene end (${duration.toFixed(2)}s) — dropped`); + continue; + } + const lineDuration = line.duration ?? FILE_CUE_DURATION; + if (t + lineDuration > duration) { + warnings.push(`narration "${line.at}" at ${t.toFixed(2)}s extends past the scene end — it will be truncated`); + } + cues.push({ + t, + gain: line.gain ?? 
1.15, + duration: lineDuration, + fadeIn: 0, + fadeOut: 0, + pan: 0, + source: { kind: "file", path: line.file }, + }); + } + cues.sort((a, b) => a.t - b.t); return { diff --git a/packages/core/src/ir.ts b/packages/core/src/ir.ts index b668971..b3ae22a 100644 --- a/packages/core/src/ir.ts +++ b/packages/core/src/ir.ts @@ -468,6 +468,33 @@ export interface AudioCueIR { params?: Record; } +/** + * A narration line — a spoken voiceover anchored to the timeline, fitted to the + * scene. The author writes `at` + `text` (+ optional `voice`/`gain`); the + * `reframe narrate` generator reads the label clock, synthesizes the line with a + * Kokoro TTS sidecar, fits its speech rate to the slot, and bakes `file`/`speed`/ + * `duration` back. At render it behaves as a label-anchored `file` cue (so it + * survives retiming/regen), with `duration` sizing the bed's duck window. + */ +export interface NarrationLineIR { + /** Anchor: a timeline label (the step's start) or absolute seconds. */ + at: string | number; + /** The line to speak. */ + text: string; + /** Kokoro voice (e.g. "af_heart", "am_michael"); default chosen by `narrate`. */ + voice?: string; + /** Linear gain, default ~1.15 (voiceover sits above the bed). */ + gain?: number; + /** Seconds relative to the anchor (default 0). */ + offset?: number; + /** BAKED by `reframe narrate`: scene-relative wav path (e.g. "demo-vo/intro.wav"). */ + file?: string; + /** BAKED by `reframe narrate`: the fitted speech rate (default 1). */ + speed?: number; + /** BAKED by `reframe narrate`: measured wav length (s) — sizes the duck window. */ + duration?: number; +} + export interface AudioIR { bgm?: { file?: string; @@ -480,6 +507,12 @@ export interface AudioIR { duck?: { depth?: number; attack?: number; release?: number } | false; }; cues?: AudioCueIR[]; + /** + * Spoken voiceover lines, each anchored to the timeline and fitted to the scene + * by `reframe narrate`. 
Render-equivalent to label-anchored `file` cues (after + * synthesis), so they survive retiming/regen. See {@link NarrationLineIR}. + */ + narration?: NarrationLineIR[]; /** * Auto-generate sound cues from node motion (move→whoosh, settle→impact, * scale-in→pop, panned by position). Deterministic + retime-safe (re-derived diff --git a/packages/core/src/validate.ts b/packages/core/src/validate.ts index 8377148..be907cf 100644 --- a/packages/core/src/validate.ts +++ b/packages/core/src/validate.ts @@ -332,6 +332,25 @@ export function validateScene(ir: SceneIR): void { add("audio-range", cp, `${cp}: pan must be in [-1, 1] (-1 left … +1 right)`); } } + for (const [i, line] of (ir.audio?.narration ?? []).entries()) { + const np = `audio.narration[${i}]`; + if (typeof line.at === "string" && !labels.has(line.at)) { + add("unknown-timeline-label", np, `${np}: unknown timeline label "${line.at}" — known labels: ${[...labels].join(", ") || "(none)"}`); + } + if (typeof line.at === "number" && line.at < 0) { + add("bad-duration", np, `${np}: "at" must be >= 0`); + } + if (typeof line.text !== "string" || line.text.trim() === "") { + add("narration-text", np, `${np}: "text" is required and must be non-empty`); + } + if (line.gain !== undefined && line.gain < 0) { + add("audio-range", np, `${np}: gain must be >= 0`); + } + if (line.speed !== undefined && line.speed <= 0) { + add("narration-speed", np, `${np}: speed must be > 0`); + } + } + const duck = ir.audio?.bgm?.duck; if (typeof duck === "object" && duck !== null && duck.depth !== undefined && (duck.depth < 0 || duck.depth > 1)) { add("audio-range", "audio.bgm.duck.depth", "audio.bgm.duck.depth must be in [0, 1]"); diff --git a/packages/core/test/audio.test.ts b/packages/core/test/audio.test.ts index 2b6c286..f3c58a4 100644 --- a/packages/core/test/audio.test.ts +++ b/packages/core/test/audio.test.ts @@ -103,6 +103,42 @@ describe("resolveAudioPlan", () => { }); }); +describe("narration", () => { + it("resolves a 
synthesized line to a label-anchored file cue with a real duck window", () => { + const plan = resolveAudioPlan( + base({ + narration: [{ at: "move", text: "hello", file: "t-vo/move.wav", duration: 0.8 }], + }), + )!; + expect(plan.cues).toHaveLength(1); + expect(plan.cues[0]).toMatchObject({ + t: 0.5, + gain: 1.15, + duration: 0.8, + source: { kind: "file", path: "t-vo/move.wav" }, + }); + // the bed ducks under the whole utterance, not the 0.4s file default + expect(plan.duckWindows).toEqual([{ t0: 0.5, t1: 1.3 }]); + }); + + it("warns and drops an un-synthesized line (no file yet)", () => { + const plan = resolveAudioPlan(base({ narration: [{ at: "fade", text: "not yet" }] }))!; + expect(plan).not.toBeNull(); + expect(plan.cues).toEqual([]); + expect(plan.warnings.some((w) => w.includes('narration "fade" not synthesized'))).toBe(true); + }); + + it("coexists with cues and sorts by time", () => { + const plan = resolveAudioPlan( + base({ + cues: [{ at: "tail", sfx: "pop" }], + narration: [{ at: "lead", text: "intro", file: "t-vo/lead.wav", duration: 0.4 }], + }), + )!; + expect(plan.cues.map((c) => c.t)).toEqual([0, 2.0]); + }); +}); + describe("clip audio (video nodes)", () => { const vscene = (props: Record, audio?: AudioIR) => compileScene( @@ -165,4 +201,16 @@ describe("audio validation", () => { expect(() => make({ cues: [{ at: "w", sfx: "pop", file: "x.wav" }] })).toThrowError(/exactly one/); expect(() => make({ cues: [{ at: "w", sfx: "kaboom" as never }] })).toThrowError(/unknown sfx "kaboom"/); }); + + it("validates narration: known label, non-empty text, positive speed", () => { + expect(() => make({ narration: [{ at: "nope", text: "hi" }] })).toThrowError( + /unknown timeline label "nope"/, + ); + expect(() => make({ narration: [{ at: "w", text: " " }] })).toThrowError( + /"text" is required and must be non-empty/, + ); + expect(() => make({ narration: [{ at: "w", text: "hi", speed: 0 }] })).toThrowError( + /speed must be > 0/, + ); + }); }); diff --git 
a/packages/reframe-video/package.json b/packages/reframe-video/package.json index 77e869a..01ac88e 100644 --- a/packages/reframe-video/package.json +++ b/packages/reframe-video/package.json @@ -1,6 +1,6 @@ { "name": "reframe-video", - "version": "0.6.43", + "version": "0.6.44", "description": "Declarative motion graphics that AI can write and humans can tweak — human edits survive AI regeneration. Deterministic mp4 renders from a plain-data scene format.", "keywords": [ "motion-graphics", diff --git a/packages/reframe-video/scripts/build.ts b/packages/reframe-video/scripts/build.ts index 57242d6..2263c56 100644 --- a/packages/reframe-video/scripts/build.ts +++ b/packages/reframe-video/scripts/build.ts @@ -36,6 +36,7 @@ const nodeBundles: [entry: string, out: string][] = [ ["packages/render-cli/src/labels.ts", "labels.js"], ["packages/render-cli/src/compile.ts", "compile.js"], ["packages/render-cli/src/assemble.ts", "assemble.js"], + ["packages/render-cli/src/narrate.ts", "narrate.js"], ["packages/render-cli/src/manifest.ts", "manifest.js"], ["packages/render-cli/src/lint.ts", "lint.js"], ["packages/render-cli/src/verifyOverlay.ts", "verifyOverlay.js"], @@ -175,6 +176,9 @@ if (!/from\s*["']reframe-video["']/.test(rcJs)) throw new Error("renderer-canvas // --- assets & guides ------------------------------------------------------- await cp(join(REPO, "assets", "fonts"), join(PKG, "assets", "fonts"), { recursive: true }); await cp(join(REPO, "assets", "sfx"), join(PKG, "assets", "sfx"), { recursive: true }); +// the Kokoro TTS sidecar ships beside dist/narrate.js (narrate.ts resolves it via +// `new URL("./narrate.py", import.meta.url)`). +await cp(join(REPO, "packages/render-cli/src/narrate.py"), join(PKG, "dist", "narrate.py")); await mkdir(join(PKG, "guides"), { recursive: true }); // Guides ship flat under guides/; sources are the authoring docs under docs/. // Keep this set in sync with the GUIDE map in render-cli/src/reframe.ts. 
#!/usr/bin/env python3
# Kokoro-TTS sidecar for `reframe narrate`. Reads a JSON request on stdin:
#   { "outDir": "...", "lang": "a", "lines": [{ "stem", "text", "voice", "speed" }] }
# synthesizes each line at the given speed, writes <outDir>/<stem>.wav @24kHz,
# and prints { "durations": { "<stem>": <seconds> } } on stdout.
# On failure it prints { "error": "<message>" } on stdout and exits non-zero, so
# the caller always gets one JSON object as the last stdout line.
#
# Out-of-band by design: the .wav are external assets (not part of reframe's
# golden/determinism contract), like images. Requires `kokoro` + espeak-ng.
import sys, os, json, warnings

warnings.filterwarnings("ignore")

# Sample rate (Hz) used both to write the wav and to convert samples -> seconds.
SAMPLE_RATE = 24000


def _fail(message, code):
    """Print a JSON error object on stdout and exit with the given status."""
    print(json.dumps({"error": message}))
    sys.exit(code)


def main():
    """Synthesize every requested line and report the measured durations.

    Reads the request from stdin (see the header comment for its shape), writes
    one wav per line into ``outDir`` and prints ``{"durations": {stem: secs}}``.
    Exits 3 when the Kokoro stack cannot be imported and 4 when a line yields no
    audio at all; both cases print ``{"error": ...}`` first.
    """
    req = json.load(sys.stdin)
    out_dir = req["outDir"]
    lang = req.get("lang", "a")
    os.makedirs(out_dir, exist_ok=True)

    try:
        import numpy as np
        import soundfile as sf
        from kokoro import KPipeline
    except Exception as e:  # pragma: no cover - environment dependent
        _fail(f"kokoro import failed: {e}", 3)

    pipe = KPipeline(lang_code=lang)
    durations = {}
    for line in req["lines"]:
        stem = line["stem"]
        text = line["text"]
        voice = line.get("voice", "af_heart")
        speed = float(line.get("speed", 1.0))
        # The pipeline yields (graphemes, phonemes, audio) per chunk. Skip chunks
        # that carry no audio object (defensive; presumably possible for
        # unspeakable segments -- confirm against the installed Kokoro version).
        chunks = [a for _, _, a in pipe(text, voice=voice, speed=speed) if a is not None]
        audio = np.concatenate(chunks) if chunks else None
        if audio is None or len(audio) == 0:
            # Previously this path crashed on chunks[0] with an IndexError and no
            # JSON output; report it in-band so the caller can name the line.
            _fail(f'no audio produced for line "{stem}" (is its text empty or unspeakable?)', 4)
        path = os.path.join(out_dir, f"{stem}.wav")
        sf.write(path, audio, SAMPLE_RATE)
        durations[stem] = round(len(audio) / SAMPLE_RATE, 4)

    print(json.dumps({"durations": durations}))


if __name__ == "__main__":
    main()
voiceover. + * + * Reads a narration script (a sibling `-vo/script.json` of `{ at, text }` + * lines the scene imports into `audio.narration`), computes each line's time slot + * from the compiled label clock, synthesizes it with a Kokoro python sidecar, and + * AUTO-FITS its speech rate so it fits the slot (bounded; warns if even the max + * speed overruns). Bakes `file` / `voice` / `speed` / `duration` back into the + * script.json — the scene then plays each line as a label-anchored `file` cue that + * survives retiming/regen, with the bed ducking under the whole utterance. + * + * Determinism: the .wav are external assets (same-machine, Kokoro-version + * dependent), not part of the golden contract — commit script.json + wavs together. + */ +import { spawn } from "node:child_process"; +import { existsSync } from "node:fs"; +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { basename, dirname, isAbsolute, join, relative, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { compileScene } from "@reframe/core"; +import { loadScene } from "./loadScene.js"; + +const NARRATE_PY = fileURLToPath(new URL("./narrate.py", import.meta.url)); +const CWD = process.env.INIT_CWD ?? process.cwd(); +const userPath = (p: string) => (isAbsolute(p) ? p : resolve(CWD, p)); + +interface Line { + at: string | number; + text: string; + voice?: string; + gain?: number; + offset?: number; + file?: string; + speed?: number; + duration?: number; +} + +interface Args { + scene?: string; + voice: string; + lang: string; + maxSpeed: number; + script?: string; + dryRun: boolean; +} + +function fail(msg: string): never { + console.error(`error: ${msg}`); + process.exit(1); +} + +function parseArgs(argv: string[]): Args { + const a: Args = { voice: "af_heart", lang: "a", maxSpeed: 1.3, dryRun: false }; + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]!; + const next = () => argv[++i] ?? 
/** Reduce a label to a filename-safe stem: runs of other characters become "-", edge dashes are trimmed; falls back to "line". */
const slug = (s: string) => s.replace(/[^a-zA-Z0-9_-]+/g, "-").replace(/^-+|-+$/g, "") || "line";
/**
 * File stem for a narration line: the slugged label when `at` is a string,
 * otherwise `line<i>`. NOTE: the stem of a labelled line depends only on `at`,
 * so two lines anchored to the same label share one stem.
 */
const stemOf = (line: Line, i: number) => (typeof line.at === "string" ? slug(line.at) : `line${i}`);
/** Replace backslashes with forward slashes. */
const posix = (p: string) => p.split("\\").join("/");
// ~2.6 words/sec is a typical narration pace — a rough length estimate for --dry-run.
const estimateSecs = (text: string) => Math.max(0.4, text.trim().split(/\s+/).length / 2.6);

/**
 * Run narrate.py with a JSON request on stdin and parse the JSON result.
 *
 * The sidecar's stderr is inherited; the LAST line of its stdout is taken as
 * the result, so anything printed before it is ignored.
 *
 * @param req request object, serialized with `JSON.stringify` onto the child's stdin
 * @returns the parsed result object (`durations` on success, `error` when the
 *   sidecar reported one)
 * @throws (rejects) when python3 cannot be spawned, or when the last stdout
 *   line is not a JSON object
 */
function synth(req: unknown): Promise<{ durations?: Record<string, number>; error?: string }> {
  return new Promise((res, rej) => {
    const proc = spawn("python3", [NARRATE_PY], { stdio: ["pipe", "pipe", "inherit"] });
    let stdout = "";
    // Decode as one UTF-8 stream so a multi-byte character split across two
    // chunks is not corrupted.
    proc.stdout.setEncoding("utf8");
    proc.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    proc.on("error", rej); // ENOENT — python3 missing
    // If the child never starts or exits before reading, the write below fails
    // (EPIPE / destroyed stream). That failure is already reported through
    // "error"/"close"; without a listener it would be an unhandled stream error.
    proc.stdin.on("error", () => {});
    proc.on("close", (code) => {
      const noJson = () => rej(new Error(`narrate.py produced no JSON (exit ${code})`));
      const lastLine = stdout.trim().split("\n").pop() ?? "";
      let parsed: unknown;
      try {
        parsed = JSON.parse(lastLine);
      } catch {
        noJson();
        return;
      }
      // `null`, numbers, strings and arrays parse fine but are not a result object.
      if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
        noJson();
        return;
      }
      res(parsed as { durations?: Record<string, number>; error?: string });
    });
    proc.stdin.end(JSON.stringify(req));
  });
}