From 35d3eb64b91d1f9480892ada5065188281ebeb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alby=20Hern=C3=A1ndez?= <donfumero@gmail.com>
Date: Thu, 23 Apr 2026 22:47:14 +0100
Subject: [PATCH] feat: Support multiple input formats w/ ffmpeg

---
 .agents/AGENTS.md           |  93 ++++++++++++----
 .agents/DESIGN_DECISIONS.md |  55 +++++++--
 .agents/TODO.md             |   5 +-
 Dockerfile                  |   3 +-
 README.md                   |  55 +++++----
 internal/asr/audio.go       |  10 ++
 internal/asr/audio_test.go  | 216 ++++++++++++++++++++++++++++++++++++
 internal/asr/ffmpeg.go      | 205 ++++++++++++++++++++++++++++++++++
 internal/asr/transcriber.go |  50 +++++++--
 internal/server/handlers.go |   8 ++
 internal/server/server.go   |  21 +++-
 main.go                     |   3 +
 12 files changed, 661 insertions(+), 63 deletions(-)
 create mode 100644 internal/asr/audio_test.go
 create mode 100644 internal/asr/ffmpeg.go

diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md
index d5c659d..18bfa3f 100644
--- a/.agents/AGENTS.md
+++ b/.agents/AGENTS.md
@@ -7,6 +7,7 @@ This document helps AI agents work effectively in this codebase.
 **Parakeet ASR Server** - A Go-based automatic speech recognition (ASR) server using NVIDIA's Parakeet TDT 0.6B model in ONNX format. Provides an OpenAI Whisper-compatible API for audio transcription.
 
 ### Key Technologies
+
 - **Language**: Go 1.25+
 - **ML Runtime**: ONNX Runtime 1.21.x (CPU inference)
 - **Model**: NVIDIA Parakeet TDT 0.6B (Conformer-based encoder with Token-and-Duration Transducer decoder)
@@ -65,7 +66,9 @@ parakeet/
 │   ├── asr/
 │   │   ├── transcriber.go  # ONNX inference pipeline, TDT decoding
 │   │   ├── mel.go          # Mel filterbank feature extraction (FFT, windowing)
-│   │   └── audio.go        # WAV parsing, resampling to 16kHz
+│   │   ├── audio.go        # WAV parsing, magic-byte detection, resampling to 16kHz
+│   │   ├── ffmpeg.go       # Optional ffmpeg-backed converter for non-WAV inputs
+│   │   └── audio_test.go   # Unit + concurrency tests for audio/ffmpeg logic
 │   └── server/
 │       ├── server.go       # HTTP server, route setup, lifecycle management
 │       ├── handlers.go     # API endpoint handlers, response formatting
@@ -88,26 +91,29 @@ parakeet/
 ## Code Organization
 
 ### `main.go` (Entry Point)
-- Parses CLI flags: `-port`, `-models`, `-log-level`, `-log-format`, `-workers`
+
+- Parses CLI flags: `-port`, `-models`, `-log-level`, `-log-format`, `-workers`, `-ffmpeg`, `-ffmpeg-path`, `-ffmpeg-timeout`
 - Configures `slog` global logger (text or JSON handler, four log levels)
 - Runs server in background goroutine, listens for SIGINT/SIGTERM
 - Graceful shutdown: waits up to 30s for in-flight requests via `http.Server.Shutdown`
 - Calls `srv.Close()` after shutdown to release ONNX resources
-- Default port: 5092, default models dir: `./models`, default log level: `info`, default log format: `text`, default workers: `4`
+- Default port: 5092, default models dir: `./models`, default log level: `info`, default log format: `text`, default workers: `4`, ffmpeg fallback enabled by default, ffmpeg timeout: `60s`
 
 ### `internal/server/` (HTTP Server Package)
 
 #### `server.go`
-- `Config` struct: Port, ModelsDir, LogLevel, LogFormat, Workers settings
+
+- `Config` struct: Port, ModelsDir, LogLevel, LogFormat, Workers, FFmpegEnabled, FFmpegPath, FFmpegTimeout
 - `Server` struct: wraps config, transcriber, `http.Server`, HTTP mux, and API key
-- `New()` - Initializes transcriber with worker pool, reads `PARAKEET_API_KEY` env var, and sets up routes
+- `New()` - Initializes transcriber with worker pool and optional ffmpeg converter, reads `PARAKEET_API_KEY` env var, and sets up routes
 - `Run()` - Starts HTTP listener (blocks until shutdown or error)
 - `Shutdown(ctx)` - Graceful HTTP shutdown, waits for in-flight requests to finish
 - `Close()` - Releases transcriber and ONNX resources (must be called after Shutdown)
 - `requireAuth()` - Middleware that validates `Authorization: Bearer <key>` on `/v1/*` routes
 
 #### `handlers.go`
-- `handleTranscription()` - Main endpoint, parses multipart form, returns transcription
+
+- `handleTranscription()` - Main endpoint, parses multipart form, returns transcription. Maps `asr.ErrUnsupportedAudio` to HTTP 400 `invalid_request_error`; other errors fall back to HTTP 500 `server_error`.
 - `handleTranslation()` - Delegates to transcription (Parakeet is English-focused)
 - `handleModels()` - Returns available models (parakeet-tdt-0.6b, whisper-1 alias)
 - `handleHealth()` - Health check endpoint
@@ -115,6 +121,7 @@ parakeet/
 - CORS and error response utilities
 
 #### `types.go`
+
 - `TranscriptionResponse` - Simple JSON response with text
 - `VerboseTranscriptionResponse` - Detailed response with segments, timing
 - `Segment` - Transcription segment with timing info
@@ -124,18 +131,29 @@ parakeet/
 ### `internal/asr/` (ASR Package)
 
 #### `transcriber.go`
+
 - `DebugMode` - Global flag for verbose logging
 - `Config` - Model configuration (features_size, subsampling_factor)
+- `Options` - Optional knobs passed to `NewTranscriber` (currently wraps `FFmpegConfig`)
+- `ErrUnsupportedAudio` - Sentinel error returned when input is neither WAV nor convertible. Used by the HTTP layer to map to 400.
 - `decoderWorker` - Holds a persistent decoder ONNX session with pre-allocated reusable tensors
-- `Transcriber` - Main inference struct with a pool of `decoderWorker`s
-- `NewTranscriber(modelsDir, workers)` - Loads config, vocab, initializes ONNX Runtime, creates decoder pool
+- `Transcriber` - Main inference struct with a pool of `decoderWorker`s and optional `ffmpegConverter`
+- `NewTranscriber(modelsDir, workers, opts)` - Loads config, vocab, initializes ONNX Runtime, creates decoder pool and (optionally) probes ffmpeg
 - `Transcribe()` - Main entry: audio -> mel -> encoder -> TDT decode -> text
-- `loadAudio()` - Format detection and parsing
+- `loadAudio()` - Detects WAV by magic bytes (RIFF/WAVE); falls back to ffmpeg conversion when available, otherwise returns `ErrUnsupportedAudio`
 - `runInference()` - Encoder ONNX session (per-request, variable shape), then acquires pool worker for decode
 - `tdtDecode()` - TDT greedy decoding loop reusing pooled session and tensors
 - `tokensToText()` - Token IDs to text with cleanup
 
+#### `ffmpeg.go`
+
+- `FFmpegConfig` - Public struct with `Enabled`, `BinaryPath`, `Timeout`
+- `ffmpegConverter` - Encapsulates an ffmpeg binary path and a conversion timeout; safe for concurrent use
+- `newFFmpegConverter()` - Probes the binary once with `exec.LookPath`; returns `nil` (logging a warning) when ffmpeg is disabled or missing
+- `Convert()` - Writes the input to a temp file created with `os.CreateTemp` (unique path per call), runs `ffmpeg` via `exec.CommandContext` with captured stderr, and reads back the resulting WAV. Wraps non-zero exits and timeouts in `ErrUnsupportedAudio`.
+
 #### `mel.go`
+
 - `MelFilterbank` - Mel-scale filterbank feature extractor
 - `NewMelFilterbank()` - Creates filterbank with NeMo defaults (128 mels, 512 FFT)
 - `Extract()` - Computes mel features with Hann windowing
@@ -144,20 +162,23 @@ parakeet/
 - Mel/Hz conversion helpers
 
 #### `audio.go`
+
+- `isWAV()` - Magic-byte check (RIFF/WAVE) used for content-based format detection
 - `parseWAV()` - WAV parser supporting multiple chunk layouts
 - `convertToFloat32()` - Supports 8/16/24/32-bit PCM and 32-bit float
 - `resample()` - Linear interpolation resampling to 16kHz
 
 ## API Endpoints
 
-| Method | Path | Description |
-|--------|------|-------------|
-| POST | `/v1/audio/transcriptions` | Transcribe audio (OpenAI-compatible) |
-| POST | `/v1/audio/translations` | Translate audio (delegates to transcription) |
-| GET | `/v1/models` | List available models |
-| GET | `/health` | Health check |
+| Method | Path                       | Description                                  |
+| ------ | -------------------------- | -------------------------------------------- |
+| POST   | `/v1/audio/transcriptions` | Transcribe audio (OpenAI-compatible)         |
+| POST   | `/v1/audio/translations`   | Translate audio (delegates to transcription) |
+| GET    | `/v1/models`               | List available models                        |
+| GET    | `/health`                  | Health check                                 |
 
 ### Transcription Parameters
+
 - `file` (required) - Audio file (multipart form, max 25MB)
 - `model` - Accepted but ignored (only one model)
 - `language` - ISO-639-1 code (default: "en")
@@ -167,35 +188,40 @@ parakeet/
 ## Code Patterns & Conventions
 
 ### Naming
+
 - Go standard naming (camelCase for private, PascalCase for exported)
 - Descriptive function names: `parseWAV`, `convertToFloat32`, `tdtDecode`
 - Type suffixes for ONNX tensors: `inputTensor`, `outputTensor`, `lengthTensor`
 
 ### Error Handling
+
 - Wrap errors with `fmt.Errorf("context: %w", err)`
 - Return early on error
 - Cleanup resources with `defer` (tensor.Destroy(), file.Close())
 
 ### ONNX Runtime Usage
+
 - Create tensors with `ort.NewTensor(shape, data)`
 - Use `ort.NewAdvancedSession()` for named inputs/outputs
 - Always call `.Destroy()` on tensors and sessions after use
 - Memory-conscious: tensors created and destroyed per inference step in decode loop
 
 ### Response Formats
+
 - JSON structs use tags: `json:"field_name"` with `omitempty` where appropriate
 - OpenAI-compatible response structures
 
 ## Environment Variables
 
-| Variable | Description | Default |
-|----------|-------------|---------|
-| `ONNXRUNTIME_LIB` | Path to libonnxruntime.so | Auto-detect |
+| Variable           | Description                                 | Default               |
+| ------------------ | ------------------------------------------- | --------------------- |
+| `ONNXRUNTIME_LIB`  | Path to libonnxruntime.so                   | Auto-detect           |
 | `PARAKEET_API_KEY` | API key for `/v1/*` endpoint authentication | Empty (auth disabled) |
 
 ## Dependencies
 
 From `go.mod`:
+
 ```
 go 1.25.5
 github.com/yalue/onnxruntime_go v1.19.0
@@ -206,17 +232,20 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p
 ## CI/CD
 
 ### CI Pipeline (`.github/workflows/ci.yaml`)
+
 - Runs on push/PR to main/master
 - Jobs: lint (Go 1.22), test (Go 1.25), build (Go 1.25)
 - Lint checks: go vet, gofmt
 
 ### Release Pipeline (`.github/workflows/release.yaml`)
-- Triggers on version tags (v*)
+
+- Triggers on version tags (v\*)
 - Builds binaries for linux/darwin/windows (amd64/arm64)
 - Creates GitHub release with checksums
 - Pushes Docker images to ghcr.io (int8 and fp32 variants)
 
 ### Docker Build
+
 - Multi-stage build with golang:1.25-bookworm builder
 - Runtime: debian:bookworm-slim with ONNX Runtime 1.21.0
 - Models embedded in image during build
@@ -226,32 +255,40 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p
 ## Common Tasks for Agents
 
 ### Adding a New Audio Format
-1. Add case in `internal/asr/transcriber.go:loadAudio()`
-2. Implement parser in `internal/asr/audio.go`
-3. Ensure output is `[]float32` normalized to [-1, 1] at 16kHz
+
+- If ffmpeg supports it (most common cases), no code change is needed: `loadAudio` automatically delegates any non-WAV input to the `ffmpegConverter`. Install `ffmpeg` on the target system and keep `-ffmpeg=true` (default).
+- To add a first-class (no-ffmpeg) parser:
+  1. Extend `isWAV`-style detection in `internal/asr/audio.go` with a new magic-byte helper.
+  2. Implement a parser returning `[]float32` normalized to `[-1, 1]` at 16kHz mono.
+  3. Plug it into `Transcriber.loadAudio` before the ffmpeg fallback.
 
 ### Modifying API Response
+
 1. Add/modify structs in `internal/server/types.go`
 2. Update relevant handler in `internal/server/handlers.go`
 3. Follow OpenAI response format conventions
 
 ### Adding a New Endpoint
+
 1. Add handler method to `internal/server/handlers.go`
 2. Register route in `internal/server/server.go:setupRoutes()` — wrap with `s.requireAuth()` for authenticated endpoints
 3. Add types to `internal/server/types.go` if needed
 
 ### Changing Inference Parameters
+
 - Encoder dim: `internal/asr/transcriber.go:247` (`encoderDim := int64(1024)`)
 - LSTM state: `internal/asr/transcriber.go:314-315` (`stateDim`, `numLayers`)
 - Max tokens per step: `internal/asr/transcriber.go:39` (`maxTokensPerStep: 10`)
 - Mel features: `internal/asr/mel.go:25-27` (nFFT, hopLength, winLength)
 
 ### Adding a New Makefile Target
+
 1. Add target with `## Description` comment for help
 2. Use `@` prefix for silent commands
 3. Add to `.PHONY` if not a file target
 
 ### Creating a Release
+
 1. Tag with semver: `git tag v1.0.0`
 2. Push tag: `git push origin v1.0.0`
 3. Release pipeline builds and publishes automatically
@@ -259,6 +296,7 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p
 ## Important Gotchas
 
 ### ONNX Runtime Library
+
 - Must be installed separately (not vendored)
 - Set `ONNXRUNTIME_LIB` env var if not in standard paths
 - Auto-detection checks common paths in Makefile and transcriber.go
@@ -266,12 +304,23 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p
 - Compatible version: 1.21.x for onnxruntime_go v1.19.0
 
 ### Model Files Required
+
 - `encoder-model.int8.onnx` (~652MB) or `encoder-model.onnx` (~2.5GB)
 - `decoder_joint-model.int8.onnx` (~18MB) or `decoder_joint-model.onnx` (~72MB)
 - `config.json`, `vocab.txt`, `nemo128.onnx`
 - Download via `make models` or manually from HuggingFace
 
 ### Tensor Memory Management
+
 - Tensors must be destroyed manually (no GC)
 - The TDT decode loop creates/destroys tensors each iteration
 - Memory usage: ~2GB RAM for int8 models, ~6GB for fp32
+
+### ffmpeg Conversion
+
+- `ffmpeg` is an optional system dependency. When present (and `-ffmpeg=true`, the default), non-WAV inputs are transcoded to 16 kHz mono PCM WAV on the fly.
+- Format detection inspects the magic bytes of the uploaded payload, not the filename extension, so clients can upload files without a valid extension.
+- The converter is safe for concurrent use: each conversion allocates its own input/output via `os.CreateTemp`, so two simultaneous requests never collide on disk.
+- Conversions run under `exec.CommandContext` with a timeout (`-ffmpeg-timeout`, default 60s). Timeouts and non-zero exits are wrapped in `ErrUnsupportedAudio` and surface as HTTP 400.
+- When ffmpeg is missing or disabled, only WAV input is accepted; other formats return HTTP 400 with a clear message. The server never crashes because of a missing ffmpeg.
+- The official Docker image installs ffmpeg by default; binary releases rely on the host system having it available.
diff --git a/.agents/DESIGN_DECISIONS.md b/.agents/DESIGN_DECISIONS.md
index 90c0953..ba03f22 100644
--- a/.agents/DESIGN_DECISIONS.md
+++ b/.agents/DESIGN_DECISIONS.md
@@ -11,6 +11,7 @@ Architectural and design decisions made in this project.
 **Rationale**: Allows drop-in replacement for applications already using the OpenAI Whisper API. Reduces integration effort for adopters.
 
 **Consequences**:
+
 - `model`, `prompt`, and `temperature` parameters are accepted but ignored (single model, no prompt conditioning, deterministic greedy decoding)
 - Translation endpoint delegates to transcription since Parakeet is English-focused
 - Error responses follow OpenAI's format (`ErrorResponse`/`ErrorDetail` structs)
@@ -24,6 +25,7 @@ Architectural and design decisions made in this project.
 **Rationale**: Compact model (~670MB int8) with strong English transcription accuracy. TDT decoder predicts both tokens and durations, enabling efficient greedy decoding without beam search.
 
 **Architecture details**:
+
 - Encoder: Conformer, 1024-dim output, 8x subsampling factor
 - Decoder: TDT with 8193 vocab (8192 tokens + blank), 5 duration classes
 - LSTM state: 2 layers × 640 dim
@@ -39,25 +41,29 @@ Architectural and design decisions made in this project.
 **Rationale**: ONNX is a portable, vendor-neutral format. CPU inference avoids GPU dependency, simplifying deployment. The Go bindings (`onnxruntime_go v1.19.0`) provide direct integration without CGo complexity.
 
 **Consequences**:
+
 - ONNX Runtime library (1.21.x) must be installed separately on the host
 - `ONNXRUNTIME_LIB` env var needed if not in standard paths
 - Tensors must be destroyed manually (no GC integration)
 - Memory: ~2GB RAM for int8, ~6GB for fp32
 
-## DD-004: WAV-Only Audio Input
+## DD-004: WAV-First Audio Input (with ffmpeg fallback)
+
+> **Note**: originally "WAV-Only Audio Input". Superseded in scope by **DD-012** when ffmpeg-backed conversion was introduced; this entry is kept for historical context.
 
 **Context**: The server needs to accept audio files for transcription.
 
-**Decision**: Support only WAV format natively. Other formats (WebM, OGG, MP3, M4A) return an error suggesting ffmpeg conversion.
+**Decision**: Natively support WAV in pure Go. Delegate any other format to an optional external `ffmpeg` binary (see DD-012).
 
-**Rationale**: WAV parsing is straightforward with no external dependencies. Adding format support via ffmpeg would introduce a heavy system dependency. Keeps the binary self-contained.
+**Rationale**: WAV parsing is straightforward with no external dependencies. Keeping the fast path in-process preserves the "no external dependencies at runtime" value for the common case while still enabling broad format support when ffmpeg is available.
 
 **Consequences**:
-- Clients must convert non-WAV audio before sending
-- `loadAudio()` in `transcriber.go:207` returns explicit "not yet implemented" for unsupported formats
-- Supports 8/16/24/32-bit PCM and 32-bit float WAV
-- All audio resampled to 16kHz mono internally
-- Minimum audio length: 100ms (1600 samples at 16kHz)
+
+- `loadAudio()` in `transcriber.go` detects WAV by magic bytes (RIFF/WAVE) and parses it in-process.
+- Non-WAV input is routed to the ffmpeg converter; when ffmpeg is unavailable the request returns HTTP 400 with `ErrUnsupportedAudio`.
+- Supports 8/16/24/32-bit PCM and 32-bit float WAV natively.
+- All audio resampled to 16kHz mono internally.
+- Minimum audio length: 100ms (1600 samples at 16kHz).
 
 ## DD-005: Pure Go Audio Processing
 
@@ -68,6 +74,7 @@ Architectural and design decisions made in this project.
 **Rationale**: Zero external dependencies for audio processing. The NeMo-compatible defaults (128 mels, 512-point FFT, Hann window) ensure model compatibility.
 
 **Consequences**:
+
 - Radix-2 Cooley-Tukey FFT implementation in `mel.go`
 - Linear interpolation resampling (simple but sufficient for speech)
 - Per-utterance mean/variance normalization matches NeMo pipeline
@@ -81,6 +88,7 @@ Architectural and design decisions made in this project.
 **Rationale**: ~4x smaller model size (~670MB vs ~2.5GB) with minimal accuracy loss. Significantly reduces download time, disk usage, and memory footprint.
 
 **Consequences**:
+
 - Docker images tagged `latest` use int8
 - fp32 available via `make models-fp32` for maximum accuracy
 - Both variants use the same code paths
@@ -94,6 +102,7 @@ Architectural and design decisions made in this project.
 **Rationale**: Minimal runtime image size. Embedding models avoids runtime downloads and volume mounts for simpler deployment.
 
 **Consequences**:
+
 - Image includes ONNX Runtime 1.21.0
 - Separate images for int8 and fp32 model variants
 - Health check endpoint (`/health`) included for orchestration
@@ -108,6 +117,7 @@ Architectural and design decisions made in this project.
 **Rationale**: Simplest possible auth that covers the common case (single deployment, one key). No database, no user management, no token rotation. Matches how most self-hosted AI APIs work (e.g., Ollama, LocalAI). The OpenAI client libraries already send `Authorization: Bearer` headers, so compatibility is automatic.
 
 **Consequences**:
+
 - `/health` endpoint remains unauthenticated (needed for orchestration probes)
 - Implemented as `requireAuth()` middleware wrapping `/v1/*` route handlers in `server.go`
 - Returns OpenAI-compatible 401 error (`authentication_error`) on invalid/missing key
@@ -124,6 +134,7 @@ Architectural and design decisions made in this project.
 **Encoder**: Kept per-request because input shape varies with audio length (dynamic T dimension). The encoder runs once per request — not per timestep — so the overhead is acceptable. The model file is OS page-cached after first load.
 
 **Consequences**:
+
 - `-workers` flag added (default 4); each worker holds ~18MB for decoder + session overhead
 - Memory is predictable: `workers × ~670MB` (int8) instead of unbounded concurrent loads
 - Throughput: up to `workers` requests processed in parallel
@@ -139,6 +150,7 @@ Architectural and design decisions made in this project.
 **Rationale**: `slog` is stdlib (no new dependencies), provides structured key-value logging, native log levels, and switchable handlers. JSON output is essential for log aggregation in production (ELK, Loki, CloudWatch). Text output stays human-readable for development.
 
 **Consequences**:
+
 - `-log-format` flag added (`text` default, `json` for structured output)
 - `-log-level` flag added (`debug`, `info`, `warn`, `error`; default `info`)
 - `asr.DebugMode` global derived from `log-level == "debug"`, gates expensive debug logs to avoid unnecessary allocations
@@ -152,3 +164,30 @@ Architectural and design decisions made in this project.
 **Decision**: Only one external Go dependency (`onnxruntime_go`). Everything else uses the standard library.
 
 **Rationale**: Reduces supply chain risk, simplifies builds, and minimizes binary size. Go's stdlib is sufficient for HTTP server, JSON handling, audio processing, and math operations.
+
+## DD-012: ffmpeg-Backed Conversion for Non-WAV Audio
+
+**Context**: The original implementation (DD-004) accepted only WAV input and returned a hard error for anything else. That worked, but forced every client to preprocess audio, which is awkward for a Whisper-compatible API (OpenAI clients upload MP3/WebM/M4A routinely). A previous community attempt (PR #5) added ffmpeg but had three problems: it shared temp-file paths across concurrent requests (breaking DD-011's worker pool guarantees), had no timeout/stderr capture, and mapped any failure to HTTP 500.
+
+**Decision**: Introduce an optional ffmpeg-backed converter encapsulated in `internal/asr/ffmpeg.go`.
+
+Key properties:
+
+1. **Detection by content, not extension.** `loadAudio` inspects the first 12 bytes of the payload. If it is a `RIFF ... WAVE` header, parse in-process (zero-deps fast path). Otherwise, hand the bytes to the converter.
+2. **Startup probe.** The ffmpeg binary is resolved once via `exec.LookPath` when the transcriber is built. If it is missing, the converter is simply `nil`: the server starts normally, logs a warning, and rejects non-WAV uploads with a clear HTTP 400 (`ErrUnsupportedAudio`). No crash, no surprise runtime failure.
+3. **Per-request unique temp files.** Each `Convert()` call uses `os.CreateTemp` for both input and output. This is required because DD-011's worker pool allows up to `-workers` concurrent inferences, and each of them may be preceded by a conversion.
+4. **Bounded execution.** `exec.CommandContext` with a configurable timeout (`-ffmpeg-timeout`, default 60s). `stderr` is captured and trimmed into the error message so operators can diagnose bad input.
+5. **Typed errors.** Conversion failures (bad input, timeout, binary missing) are wrapped in `ErrUnsupportedAudio`. The HTTP handler checks with `errors.Is` and returns `400 invalid_request_error`. Everything else stays as `500 server_error`.
+
+**Configuration surface**:
+
+- `-ffmpeg` (bool, default `true`) — toggles the fallback.
+- `-ffmpeg-path` (string, default empty → resolve via `PATH`).
+- `-ffmpeg-timeout` (duration, default `60s`).
+
+**Consequences**:
+
+- Binary releases remain self-contained but optionally leverage a system-installed ffmpeg. The Docker image ships with ffmpeg by default.
+- `DD-004` is superseded in scope: we still support WAV in pure Go as the fast path, but we no longer reject every other format outright.
+- Concurrency semantics of DD-011 are preserved: conversions are independent per request, so `-workers N` continues to bound both decoding _and_ converter parallelism naturally.
+- OpenAI-compatibility improves: clients can upload MP3/WebM/M4A directly, matching the behavior of the real Whisper API.
diff --git a/.agents/TODO.md b/.agents/TODO.md
index 913c5eb..09495ec 100644
--- a/.agents/TODO.md
+++ b/.agents/TODO.md
@@ -4,10 +4,7 @@ Pending tasks and improvements for the project.
 
 ## Audio Format Support
 
-- [ ] **Add ffmpeg-based audio conversion** — Support WebM, OGG, MP3, M4A formats via ffmpeg. Currently `internal/asr/transcriber.go:207` returns "not yet implemented" for these formats. Options:
-  - Shell out to ffmpeg binary (adds system dependency)
-  - Use a pure Go decoder library per format
-  - Accept only WAV and document client-side conversion (current approach)
+- [x] **ffmpeg-based audio conversion** — Implemented in `internal/asr/ffmpeg.go`. WAV is parsed in-process (magic-byte detection); any other format is transcoded via an external `ffmpeg` binary. Configurable with `-ffmpeg`, `-ffmpeg-path`, `-ffmpeg-timeout`. See DD-012 for rationale.
 
 ## API Completeness
 
diff --git a/Dockerfile b/Dockerfile
index 1c575df..da08ab8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,12 +35,13 @@ FROM debian:bookworm-slim
 # Model precision: "int8" (default, ~670MB) or "fp32" (~2.5GB)
 ARG MODEL_PRECISION=int8
 
-# Install ONNX Runtime
+# Install ONNX Runtime and ffmpeg (used for non-WAV audio conversion)
 ARG ONNXRUNTIME_VERSION=1.21.0
 ARG TARGETARCH
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     curl \
+    ffmpeg \
     && ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") \
     && curl -L -o /tmp/onnxruntime.tgz \
     "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
diff --git a/README.md b/README.md
index b094814..b002fe1 100644
--- a/README.md
+++ b/README.md
@@ -37,11 +37,12 @@ Key features:
 - OpenAI Whisper-compatible REST API
 - API key authentication (optional, via environment variable)
 - ONNX Runtime inference (CPU)
-- No Python or external dependencies at runtime
+- No Python dependency at runtime (ffmpeg is an optional system dependency for non-WAV audio)
 - Structured logging with `slog` (text and JSON formats, configurable log level)
 - Support for multiple response formats (JSON, text, SRT, VTT)
 - Multilingual support (English and 25+ languages)
 - Quantized model support for reduced memory footprint
+- Automatic audio conversion for non-WAV formats (MP3, OGG, WebM, FLAC, M4A, AAC, Opus, ...) when ffmpeg is installed
 
 ## Model Architecture
 
@@ -73,6 +74,7 @@ Parakeet TDT uses Token-and-Duration Transducer decoding, which predicts both th
 
 - **ONNX Runtime 1.17.0 or later** (required at runtime)
 - Parakeet TDT ONNX models (downloaded separately)
+- **ffmpeg** (optional) — enables automatic conversion of MP3, OGG, WebM, FLAC, M4A, AAC, Opus and any other ffmpeg-supported format. When ffmpeg is not present, only WAV input is accepted and non-WAV uploads return a 400 error. The official Docker image already ships with ffmpeg.
 
 For building from source:
 
@@ -248,13 +250,16 @@ services:
 
 ### Command Line Flags
 
-| Flag          | Description                                             | Default    | Example                        |
-| ------------- | ------------------------------------------------------- | ---------- | ------------------------------ |
-| `-port`       | HTTP server port                                        | `5092`     | `-port 8080`                   |
-| `-models`     | Path to models directory                                | `./models` | `-models /opt/parakeet/models` |
-| `-log-level`  | Log level: debug, info, warn, error                     | `info`     | `-log-level debug`             |
-| `-log-format` | Log output format: text or json                         | `text`     | `-log-format json`             |
-| `-workers`    | Concurrent inference workers (each ~670MB RAM for int8) | `4`        | `-workers 2`                   |
+| Flag              | Description                                             | Default    | Example                        |
+| ----------------- | ------------------------------------------------------- | ---------- | ------------------------------ |
+| `-port`           | HTTP server port                                        | `5092`     | `-port 8080`                   |
+| `-models`         | Path to models directory                                | `./models` | `-models /opt/parakeet/models` |
+| `-log-level`      | Log level: debug, info, warn, error                     | `info`     | `-log-level debug`             |
+| `-log-format`     | Log output format: text or json                         | `text`     | `-log-format json`             |
+| `-workers`        | Concurrent inference workers (each ~670MB RAM for int8) | `4`        | `-workers 2`                   |
+| `-ffmpeg`         | Enable ffmpeg fallback for non-WAV audio                | `true`     | `-ffmpeg=false`                |
+| `-ffmpeg-path`    | Path to the ffmpeg binary (empty = resolve from `PATH`) | `""`       | `-ffmpeg-path /usr/bin/ffmpeg` |
+| `-ffmpeg-timeout` | Maximum wall-clock time for a single ffmpeg conversion  | `60s`      | `-ffmpeg-timeout 30s`          |
 
 **Examples:**
 
@@ -322,14 +327,14 @@ Transcribes audio into text. Compatible with OpenAI's Whisper API.
 
 Content-Type: `multipart/form-data`
 
-| Parameter         | Type   | Required | Description                                       |
-| ----------------- | ------ | -------- | ------------------------------------------------- |
-| `file`            | file   | Yes      | Audio file (WAV format, max 25MB)                 |
-| `model`           | string | No       | Model name (accepted but ignored)                 |
-| `language`        | string | No       | ISO-639-1 language code (default: en)             |
-| `response_format` | string | No       | Output format: json, text, srt, vtt, verbose_json |
-| `prompt`          | string | No       | Accepted but ignored                              |
-| `temperature`     | float  | No       | Accepted but ignored                              |
+| Parameter         | Type   | Required | Description                                                                            |
+| ----------------- | ------ | -------- | -------------------------------------------------------------------------------------- |
+| `file`            | file   | Yes      | Audio file, max 25MB (WAV always supported; MP3/OGG/WebM/FLAC/M4A/AAC/Opus via ffmpeg) |
+| `model`           | string | No       | Model name (accepted but ignored)                                                      |
+| `language`        | string | No       | ISO-639-1 language code (default: en)                                                  |
+| `response_format` | string | No       | Output format: json, text, srt, vtt, verbose_json                                      |
+| `prompt`          | string | No       | Accepted but ignored                                                                   |
+| `temperature`     | float  | No       | Accepted but ignored                                                                   |
 
 **Response**
 
@@ -487,11 +492,21 @@ Use the int8 quantized models (default) instead of fp32. The int8 models require
 
 ### Unsupported audio format
 
-Currently only WAV format is supported. Convert other formats using ffmpeg:
+WAV is always supported natively. Any other format (MP3, OGG, WebM, FLAC, M4A, AAC, Opus, ...) is transcoded on the fly to 16 kHz mono WAV using a local `ffmpeg` binary.
 
-```bash
-ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav
-```
+If the server responds with `400 Unsupported or malformed audio`:
+
+1. Install `ffmpeg` and make sure it is in `PATH` (or pass `-ffmpeg-path /absolute/path/to/ffmpeg`). The official Docker image already includes ffmpeg.
+2. Check the server logs. On startup you will see one of:
+   - `ffmpeg conversion enabled binary=/usr/bin/ffmpeg timeout=1m0s` — ready.
+   - `ffmpeg not found, non-WAV inputs will be rejected` — install it or disable conversion with `-ffmpeg=false` if you only need WAV.
+3. As a manual alternative, convert client-side before uploading:
+
+   ```bash
+   ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav
+   ```
+
+Audio is detected by content (magic bytes), not by filename extension, so clients that upload files without an extension still work.
 
 ## License
 
diff --git a/internal/asr/audio.go b/internal/asr/audio.go
index b3a4506..4f30633 100644
--- a/internal/asr/audio.go
+++ b/internal/asr/audio.go
@@ -7,6 +7,16 @@ import (
 	"math"
 )
 
+// isWAV returns true when data starts with a RIFF/WAVE header. It inspects
+// the first 12 bytes, which is enough to distinguish a WAV container from
+// any other audio format without parsing it.
+func isWAV(data []byte) bool {
+	if len(data) < 12 {
+		return false
+	}
+	return string(data[0:4]) == "RIFF" && string(data[8:12]) == "WAVE"
+}
+
 // parseWAV parses a WAV file and returns float32 samples normalized to [-1, 1]
 func parseWAV(data []byte) ([]float32, error) {
 	if len(data) < 44 {
diff --git a/internal/asr/audio_test.go b/internal/asr/audio_test.go
new file mode 100644
index 0000000..f5f805d
--- /dev/null
+++ b/internal/asr/audio_test.go
@@ -0,0 +1,216 @@
+package asr
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"os/exec"
+	"sync"
+	"testing"
+	"time"
+)
+
+// buildMinimalWAV produces a tiny but valid 16-bit PCM WAV blob suitable
+// for exercising the magic-byte detection and parsing path without any
+// external dependency.
+func buildMinimalWAV(t *testing.T, sampleRate uint32, samples int) []byte {
+	t.Helper()
+	var buf bytes.Buffer
+
+	bitsPerSample := uint16(16)
+	numChannels := uint16(1)
+	byteRate := sampleRate * uint32(numChannels) * uint32(bitsPerSample) / 8
+	blockAlign := numChannels * bitsPerSample / 8
+	dataSize := uint32(samples) * uint32(blockAlign)
+
+	buf.WriteString("RIFF")
+	_ = binary.Write(&buf, binary.LittleEndian, uint32(36+dataSize))
+	buf.WriteString("WAVE")
+
+	buf.WriteString("fmt ")
+	_ = binary.Write(&buf, binary.LittleEndian, uint32(16))
+	_ = binary.Write(&buf, binary.LittleEndian, uint16(1)) // PCM
+	_ = binary.Write(&buf, binary.LittleEndian, numChannels)
+	_ = binary.Write(&buf, binary.LittleEndian, sampleRate)
+	_ = binary.Write(&buf, binary.LittleEndian, byteRate)
+	_ = binary.Write(&buf, binary.LittleEndian, blockAlign)
+	_ = binary.Write(&buf, binary.LittleEndian, bitsPerSample)
+
+	buf.WriteString("data")
+	_ = binary.Write(&buf, binary.LittleEndian, dataSize)
+	for i := 0; i < samples; i++ {
+		_ = binary.Write(&buf, binary.LittleEndian, int16(i%32000))
+	}
+	return buf.Bytes()
+}
+
+func TestIsWAV(t *testing.T) {
+	cases := []struct {
+		name string
+		in   []byte
+		want bool
+	}{
+		{"valid WAV header", buildMinimalWAV(t, 16000, 4), true},
+		{"too short", []byte{0x01, 0x02}, false},
+		{"wrong RIFF", append([]byte("XXXX\x00\x00\x00\x00WAVE"), make([]byte, 100)...), false},
+		{"wrong WAVE", append([]byte("RIFF\x00\x00\x00\x00XXXX"), make([]byte, 100)...), false},
+		{"ogg magic", []byte("OggS\x00\x02\x00\x00\x00\x00\x00\x00foo"), false},
+		{"id3 mp3", []byte("ID3\x03\x00\x00\x00\x00\x00\x00\x00\x00foo"), false},
+		{"empty", nil, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isWAV(tc.in); got != tc.want {
+				t.Fatalf("isWAV(%q) = %v, want %v", tc.name, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestLoadAudioAcceptsWAV(t *testing.T) {
+	tr := &Transcriber{}
+	wav := buildMinimalWAV(t, 16000, 100)
+
+	samples, err := tr.loadAudio(wav, "")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(samples) == 0 {
+		t.Fatalf("expected decoded samples, got 0")
+	}
+}
+
+func TestLoadAudioRejectsNonWAVWhenFFmpegDisabled(t *testing.T) {
+	tr := &Transcriber{ffmpeg: nil}
+
+	// Clearly non-WAV payload. Without ffmpeg this must surface
+	// ErrUnsupportedAudio so the HTTP handler can map it to 400.
+	_, err := tr.loadAudio([]byte("OggS\x00\x02\x00\x00\x00\x00\x00\x00this is not wav"), ".ogg")
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	if !errors.Is(err, ErrUnsupportedAudio) {
+		t.Fatalf("expected ErrUnsupportedAudio, got %v", err)
+	}
+}
+
+// TestLoadAudioConcurrentWAV ensures that the WAV fast path is safe to call
+// from many goroutines at once. This matches what the worker pool does in
+// practice (up to `-workers` concurrent inferences, each preceded by
+// loadAudio). It is the regression test for the PR #5 tempfile collision
+// bug: we run it many times in parallel and expect no data races or
+// spurious failures.
+func TestLoadAudioConcurrentWAV(t *testing.T) {
+	tr := &Transcriber{}
+	wav := buildMinimalWAV(t, 16000, 1000)
+
+	const goroutines = 32
+	const iterations = 16
+
+	var wg sync.WaitGroup
+	errs := make(chan error, goroutines*iterations)
+
+	for g := 0; g < goroutines; g++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for i := 0; i < iterations; i++ {
+				samples, err := tr.loadAudio(wav, "")
+				if err != nil {
+					errs <- err
+					return
+				}
+				if len(samples) == 0 {
+					errs <- errors.New("empty samples")
+					return
+				}
+			}
+		}()
+	}
+
+	wg.Wait()
+	close(errs)
+	for err := range errs {
+		t.Fatalf("concurrent loadAudio failed: %v", err)
+	}
+}
+
+// TestFFmpegConverterConcurrentTempFiles runs concurrent conversions through
+// the real os.CreateTemp code path to check they never share temporary files.
+// ffmpeg need not succeed; any error must wrap ErrUnsupportedAudio. The test is
+// skipped when ffmpeg is not in PATH to keep CI green on runners without it.
+func TestFFmpegConverterConcurrentTempFiles(t *testing.T) {
+	if _, err := exec.LookPath("ffmpeg"); err != nil {
+		t.Skip("ffmpeg not available in PATH, skipping")
+	}
+
+	conv := newFFmpegConverter(FFmpegConfig{
+		Enabled: true,
+		Timeout: 10 * time.Second,
+	})
+	if conv == nil {
+		t.Skip("converter did not initialize, skipping")
+	}
+
+	// Feed garbage so ffmpeg errors out quickly, but through the real code
+	// path that creates temp files. The payload is intentionally invalid
+	// to keep the test fast. We only care that each invocation errors
+	// with ErrUnsupportedAudio and never panics or races.
+	payload := []byte("not a real audio file, just checking concurrency safety")
+
+	const goroutines = 16
+	var wg sync.WaitGroup
+	for i := 0; i < goroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			_, err := conv.Convert(payload)
+			if err != nil && !errors.Is(err, ErrUnsupportedAudio) {
+				t.Errorf("unexpected error class: %v", err)
+			}
+		}()
+	}
+	wg.Wait()
+}
+
+func TestNewFFmpegConverterReturnsNilWhenDisabled(t *testing.T) {
+	if c := newFFmpegConverter(FFmpegConfig{Enabled: false}); c != nil {
+		t.Fatalf("expected nil converter when disabled, got %#v", c)
+	}
+}
+
+func TestNewFFmpegConverterReturnsNilWhenMissing(t *testing.T) {
+	// Use a name that is overwhelmingly unlikely to resolve in PATH.
+	c := newFFmpegConverter(FFmpegConfig{
+		Enabled:    true,
+		BinaryPath: "__definitely_not_a_real_binary_parakeet_test__",
+	})
+	if c != nil {
+		t.Fatalf("expected nil converter when binary missing, got %#v", c)
+	}
+}
+
+func TestTrimStderr(t *testing.T) {
+	cases := []struct {
+		in   string
+		want string
+	}{
+		{"", "conversion failed"},
+		{"short error", "short error"},
+		{"line1\nline2\nline3", "line1 line2 line3"},
+		{"   \n  \r\n", ""},
+	}
+	for _, tc := range cases {
+		got := trimStderr(tc.in)
+		// Accept "conversion failed" when the normalized result is empty.
+		if got == "" || (tc.want == "" && got != "conversion failed") {
+			if tc.want != "" || got != "conversion failed" {
+				t.Errorf("trimStderr(%q) = %q, want %q", tc.in, got, tc.want)
+			}
+			continue
+		}
+		if tc.want != "" && got != tc.want {
+			t.Errorf("trimStderr(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
diff --git a/internal/asr/ffmpeg.go b/internal/asr/ffmpeg.go
new file mode 100644
index 0000000..8702ca0
--- /dev/null
+++ b/internal/asr/ffmpeg.go
@@ -0,0 +1,205 @@
+package asr
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"os"
+	"os/exec"
+	"time"
+)
+
+// ErrUnsupportedAudio is returned when the input is neither a parsable WAV
+// nor convertible via ffmpeg (either because ffmpeg is disabled/missing or
+// because ffmpeg itself rejected the input). Callers can use errors.Is to
+// detect this condition and map it to HTTP 400.
+var ErrUnsupportedAudio = errors.New("unsupported audio")
+
+// FFmpegConfig controls optional ffmpeg-backed conversion of non-WAV inputs.
+//
+// When Enabled is true and the binary can be resolved, loadAudio transcodes
+// non-WAV inputs to 16 kHz mono PCM WAV via an external ffmpeg process. When
+// Enabled is false (the zero value) or the binary is not found, only WAV
+// input is accepted.
+type FFmpegConfig struct {
+	// Enabled toggles ffmpeg-backed conversion.
+	Enabled bool
+
+	// BinaryPath is the name or path of the ffmpeg executable; "" means "ffmpeg".
+	BinaryPath string
+
+	// Timeout bounds the wall-clock time of a single conversion; <= 0 means 60s.
+	Timeout time.Duration
+}
+
+// ffmpegConverter performs audio transcoding using an external ffmpeg binary.
+//
+// It is concurrency-safe: each call to Convert writes to its own temporary
+// input and output files created via os.CreateTemp, so simultaneous requests
+// never share paths. This matters because the decoder worker pool allows up
+// to `-workers` inferences in parallel, and each of them may be preceded by
+// a conversion.
+type ffmpegConverter struct {
+	binaryPath string
+	timeout    time.Duration
+}
+
+// newFFmpegConverter returns a ready-to-use converter or nil when ffmpeg is
+// disabled or unavailable. A nil converter is not an error; it means non-WAV
+// inputs will be rejected with ErrUnsupportedAudio. The binary is probed once
+// here so a missing ffmpeg surfaces as a clear warning log line instead of
+// being discovered on the first request.
+func newFFmpegConverter(cfg FFmpegConfig) *ffmpegConverter {
+	if !cfg.Enabled {
+		return nil
+	}
+
+	bin := cfg.BinaryPath
+	if bin == "" {
+		bin = "ffmpeg"
+	}
+
+	resolved, err := exec.LookPath(bin)
+	if err != nil {
+		slog.Warn("ffmpeg not found, non-WAV inputs will be rejected",
+			"requested", bin,
+			"error", err,
+		)
+		return nil
+	}
+
+	timeout := cfg.Timeout
+	if timeout <= 0 {
+		timeout = 60 * time.Second
+	}
+
+	slog.Info("ffmpeg conversion enabled",
+		"binary", resolved,
+		"timeout", timeout,
+	)
+
+	return &ffmpegConverter{
+		binaryPath: resolved,
+		timeout:    timeout,
+	}
+}
+
+// Convert transcodes arbitrary audio bytes into 16 kHz mono PCM WAV bytes
+// by shelling out to ffmpeg. It returns the raw WAV payload so the caller
+// can feed it into parseWAV and reuse the existing decode path.
+//
+// The function is safe for concurrent use: it allocates unique temporary
+// files for each invocation and cleans them up on return.
+func (c *ffmpegConverter) Convert(data []byte) ([]byte, error) {
+	if c == nil {
+		return nil, ErrUnsupportedAudio
+	}
+
+	// Unique temp files per call. os.CreateTemp randomizes the suffix so
+	// concurrent workers never collide on disk.
+	in, err := os.CreateTemp("", "parakeet-in-*.bin")
+	if err != nil {
+		return nil, fmt.Errorf("ffmpeg: create temp input: %w", err)
+	}
+	inputPath := in.Name()
+	defer os.Remove(inputPath)
+
+	if _, err := in.Write(data); err != nil {
+		in.Close()
+		return nil, fmt.Errorf("ffmpeg: write temp input: %w", err)
+	}
+	if err := in.Close(); err != nil {
+		return nil, fmt.Errorf("ffmpeg: close temp input: %w", err)
+	}
+
+	out, err := os.CreateTemp("", "parakeet-out-*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("ffmpeg: create temp output: %w", err)
+	}
+	outputPath := out.Name()
+	// Close the file handle immediately; ffmpeg will rewrite it.
+	out.Close()
+	defer os.Remove(outputPath)
+
+	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
+	defer cancel()
+
+	// -nostdin: never read from stdin (defensive, avoids hangs).
+	// -y: overwrite output without prompting.
+	// -hide_banner -loglevel error: keep stderr focused on real errors.
+	// -ac 1 -ar 16000 -acodec pcm_s16le: match the pipeline expectation.
+	// -f wav: force WAV container regardless of output filename.
+	cmd := exec.CommandContext(ctx, c.binaryPath,
+		"-nostdin",
+		"-hide_banner",
+		"-loglevel", "error",
+		"-y",
+		"-i", inputPath,
+		"-ac", "1",
+		"-ar", "16000",
+		"-acodec", "pcm_s16le",
+		"-f", "wav",
+		outputPath,
+	)
+
+	var stderr bytes.Buffer
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		if ctx.Err() == context.DeadlineExceeded {
+			return nil, fmt.Errorf("ffmpeg: conversion timed out after %s: %w", c.timeout, ErrUnsupportedAudio)
+		}
+		// Usually a non-zero exit: corrupted or undecodable input. NOTE(review):
+		// a failure to start ffmpeg also lands here and is reported as client error.
+		return nil, fmt.Errorf("ffmpeg: %s: %w", trimStderr(stderr.String()), ErrUnsupportedAudio)
+	}
+
+	wavData, err := os.ReadFile(outputPath)
+	if err != nil {
+		return nil, fmt.Errorf("ffmpeg: read converted output: %w", err)
+	}
+
+	if DebugMode {
+		slog.Debug("ffmpeg conversion succeeded",
+			"inputBytes", len(data),
+			"outputBytes", len(wavData),
+		)
+	}
+
+	return wavData, nil
+}
+
+// trimStderr shortens ffmpeg stderr to a single line with a sensible cap so
+// it fits in an HTTP error response without leaking a wall of text.
+func trimStderr(s string) string {
+	s = stripNewlines(s)
+	const maxLen = 200
+	if len(s) > maxLen {
+		return s[:maxLen] + "..."
+	}
+	if s == "" {
+		return "conversion failed"
+	}
+	return s
+}
+
+func stripNewlines(s string) string {
+	out := make([]byte, 0, len(s))
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if c == '\n' || c == '\r' {
+			if len(out) > 0 && out[len(out)-1] != ' ' {
+				out = append(out, ' ')
+			}
+			continue
+		}
+		out = append(out, c)
+	}
+	// Trim trailing space.
+	for len(out) > 0 && out[len(out)-1] == ' ' {
+		out = out[:len(out)-1]
+	}
+	return string(out)
+}
diff --git a/internal/asr/transcriber.go b/internal/asr/transcriber.go
index f1b0ed6..6c7e14c 100644
--- a/internal/asr/transcriber.go
+++ b/internal/asr/transcriber.go
@@ -157,12 +157,24 @@ type Transcriber struct {
 	mel              *MelFilterbank
 	encoderPath      string
 	decoderPool      chan *decoderWorker
+	ffmpeg           *ffmpegConverter
 }
 
-func NewTranscriber(modelsDir string, workers int) (*Transcriber, error) {
+// Options groups optional knobs passed to NewTranscriber. Zero values keep
+// the previous behavior: WAV-only input, no ffmpeg conversion.
+type Options struct {
+	FFmpeg FFmpegConfig
+}
+
+// NewTranscriber loads models and initializes the decoder worker pool.
+// When opts.FFmpeg.Enabled is true and the ffmpeg binary is resolvable,
+// non-WAV inputs will be transcoded on the fly. Otherwise, only WAV is
+// accepted and non-WAV inputs return ErrUnsupportedAudio.
+func NewTranscriber(modelsDir string, workers int, opts Options) (*Transcriber, error) {
 	t := &Transcriber{
 		maxTokensPerStep: 10,
 		blankIdx:         8192,
+		ffmpeg:           newFFmpegConverter(opts.FFmpeg),
 	}
 
 	// Load config
@@ -345,15 +357,39 @@ func (t *Transcriber) Transcribe(audioData []byte, format, language string) (str
 	return t.tokensToText(tokens), nil
 }
 
+// loadAudio decodes raw request bytes into mono 16 kHz float32 samples.
+//
+// Detection is done by content, not by filename extension: an OpenAI client
+// is free to upload a file without an extension or with a misleading one,
+// and the transcription endpoint only ever sees bytes. WAV inputs are
+// parsed in-process with zero external dependencies. Anything else is
+// delegated to the optional ffmpeg converter; when ffmpeg is unavailable
+// the call fails with ErrUnsupportedAudio so the HTTP layer can surface a
+// 400 response instead of a generic 500.
+//
+// The `format` parameter is kept for logging and future heuristics, but it
+// is intentionally not used to pick the decoder.
 func (t *Transcriber) loadAudio(data []byte, format string) ([]float32, error) {
-	switch format {
-	case ".wav":
-		return parseWAV(data)
-	case ".webm", ".ogg", ".mp3", ".m4a":
-		return nil, fmt.Errorf("format %s requires ffmpeg conversion - not yet implemented", format)
-	default:
+	if isWAV(data) {
 		return parseWAV(data)
 	}
+
+	if t.ffmpeg == nil {
+		return nil, fmt.Errorf("input is not WAV and ffmpeg conversion is disabled: %w", ErrUnsupportedAudio)
+	}
+
+	if DebugMode {
+		slog.Debug("converting audio via ffmpeg",
+			"format", format,
+			"bytes", len(data),
+		)
+	}
+
+	wavData, err := t.ffmpeg.Convert(data)
+	if err != nil {
+		return nil, err
+	}
+	return parseWAV(wavData)
 }
 
 func (t *Transcriber) runInference(features [][]float32) ([]int, error) {
diff --git a/internal/server/handlers.go b/internal/server/handlers.go
index 9e90948..7d9a100 100644
--- a/internal/server/handlers.go
+++ b/internal/server/handlers.go
@@ -2,6 +2,7 @@ package server
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -122,6 +123,13 @@ func (s *Server) handleTranscription(w http.ResponseWriter, r *http.Request) {
 	// Transcribe
 	text, err := s.transcriber.Transcribe(audioData, ext, language)
 	if err != nil {
+		// Unsupported or malformed audio is a client error: the request
+		// body we received cannot be decoded. Everything else is treated
+		// as an internal failure.
+		if errors.Is(err, asr.ErrUnsupportedAudio) {
+			sendError(w, "Unsupported or malformed audio: "+err.Error(), "invalid_request_error", http.StatusBadRequest)
+			return
+		}
 		sendError(w, "Transcription failed: "+err.Error(), "server_error", http.StatusInternalServerError)
 		return
 	}
diff --git a/internal/server/server.go b/internal/server/server.go
index 6a9573b..a44a90a 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -7,6 +7,7 @@ import (
 	"net/http"
 	"os"
 	"strings"
+	"time"
 
 	"parakeet/internal/asr"
 )
@@ -20,6 +21,18 @@ type Config struct {
 	LogLevel  string
 	LogFormat string
 	Workers   int
+
+	// FFmpegEnabled toggles the ffmpeg-backed fallback for non-WAV audio.
+	// When true, unknown input formats are transcoded to 16 kHz mono WAV
+	// before transcription. When false, only WAV input is accepted.
+	FFmpegEnabled bool
+
+	// FFmpegPath is the name or absolute path of the ffmpeg binary.
+	// Empty means "ffmpeg", resolved against PATH.
+	FFmpegPath string
+
+	// FFmpegTimeout bounds the duration of a single conversion.
+	FFmpegTimeout time.Duration
 }
 
 // Server represents the HTTP server for the ASR service
@@ -37,7 +50,13 @@ func New(cfg Config) (*Server, error) {
 	asr.DebugMode = cfg.LogLevel == "debug"
 
 	// Initialize transcriber
-	transcriber, err := asr.NewTranscriber(cfg.ModelsDir, cfg.Workers)
+	transcriber, err := asr.NewTranscriber(cfg.ModelsDir, cfg.Workers, asr.Options{
+		FFmpeg: asr.FFmpegConfig{
+			Enabled:    cfg.FFmpegEnabled,
+			BinaryPath: cfg.FFmpegPath,
+			Timeout:    cfg.FFmpegTimeout,
+		},
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to initialize transcriber: %w", err)
 	}
diff --git a/main.go b/main.go
index 2fb58f9..443efd1 100644
--- a/main.go
+++ b/main.go
@@ -21,6 +21,9 @@ func main() {
 	flag.StringVar(&cfg.LogLevel, "log-level", "info", "Log level: debug, info, warn, error")
 	flag.StringVar(&cfg.LogFormat, "log-format", "text", "Log format: text or json")
 	flag.IntVar(&cfg.Workers, "workers", 4, "Number of concurrent inference workers (each uses ~670MB RAM for int8 models)")
+	flag.BoolVar(&cfg.FFmpegEnabled, "ffmpeg", true, "Enable ffmpeg fallback for non-WAV audio (requires ffmpeg in PATH)")
+	flag.StringVar(&cfg.FFmpegPath, "ffmpeg-path", "", "Path to the ffmpeg binary (default: resolved from PATH)")
+	flag.DurationVar(&cfg.FFmpegTimeout, "ffmpeg-timeout", 60*time.Second, "Maximum wall-clock time for a single ffmpeg conversion")
 	flag.Parse()
 
 	setupLogger(cfg.LogFormat, cfg.LogLevel)