From 7f732459f649fdef5e2143bce1b499750c23ef99 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Sun, 3 May 2026 02:29:22 +0530
Subject: [PATCH] docs(docs-next): port guides/observability (phase 4e)
---
.../guides/observability/dashboard-api.mdx | 234 +++++++++++++++
.../docs/guides/observability/dashboard.mdx | 265 +++++++++++++++++
.../docs/guides/observability/index.mdx | 13 +-
.../docs/guides/observability/logging.mdx | 131 +++++++++
.../docs/guides/observability/meta.json | 8 +-
.../docs/guides/observability/monitoring.mdx | 266 ++++++++++++++++++
6 files changed, 911 insertions(+), 6 deletions(-)
create mode 100644 docs-next/content/docs/guides/observability/dashboard-api.mdx
create mode 100644 docs-next/content/docs/guides/observability/dashboard.mdx
create mode 100644 docs-next/content/docs/guides/observability/logging.mdx
create mode 100644 docs-next/content/docs/guides/observability/monitoring.mdx
diff --git a/docs-next/content/docs/guides/observability/dashboard-api.mdx b/docs-next/content/docs/guides/observability/dashboard-api.mdx
new file mode 100644
index 0000000..2e3028e
--- /dev/null
+++ b/docs-next/content/docs/guides/observability/dashboard-api.mdx
@@ -0,0 +1,234 @@
+---
+title: Dashboard REST API
+description: "JSON endpoints for stats, jobs, dead letters, metrics, logs, infrastructure, and observability."
+---
+
+The dashboard exposes a JSON API you can use independently of the UI. All
+endpoints return `application/json` with `Access-Control-Allow-Origin: *`.
+
+## Stats
+
+### `GET /api/stats`
+
+Queue statistics snapshot.
+
+```json
+{
+ "pending": 12,
+ "running": 3,
+ "completed": 450,
+ "failed": 2,
+ "dead": 1,
+ "cancelled": 0
+}
+```
+
+### `GET /api/stats/queues`
+
+Per-queue statistics. Pass `?queue=name` for a single queue, or omit for all
+queues.
+
+```bash
+curl http://localhost:8080/api/stats/queues
+curl http://localhost:8080/api/stats/queues?queue=emails
+```
+
+## Jobs
+
+### `GET /api/jobs`
+
+Paginated list of jobs with filtering.
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `status` | `string` | all | Filter by status |
+| `queue` | `string` | all | Filter by queue name |
+| `task` | `string` | all | Filter by task name |
+| `metadata` | `string` | — | Search metadata (LIKE) |
+| `error` | `string` | — | Search error text (LIKE) |
+| `created_after` | `int` | — | Unix ms timestamp |
+| `created_before` | `int` | — | Unix ms timestamp |
+| `limit` | `int` | `20` | Page size |
+| `offset` | `int` | `0` | Pagination offset |
+
+```bash
+curl "http://localhost:8080/api/jobs?status=running&limit=10"
+```
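+
+Pagination is plain `limit`/`offset`. As a sketch (assuming the endpoint
+returns a JSON array, as the dead-letter examples below do — adapt if your
+response is wrapped in an object), every page can be walked with a short loop:
+
+```python
+def iter_all(fetch_page, page_size=20):
+    """Yield every item by fetching limit/offset pages until a short page."""
+    offset = 0
+    while True:
+        page = fetch_page(offset=offset, limit=page_size)
+        yield from page
+        if len(page) < page_size:
+            return
+        offset += page_size
+```
+
+Here `fetch_page` could wrap
+`requests.get(".../api/jobs", params={"offset": offset, "limit": limit}).json()`.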
+
+### `GET /api/jobs/{id}`
+
+Full detail for a single job.
+
+### `GET /api/jobs/{id}/errors`
+
+Error history for a job (one entry per failed attempt).
+
+### `GET /api/jobs/{id}/logs`
+
+Task execution logs for a specific job.
+
+### `GET /api/jobs/{id}/replay-history`
+
+Replay history for a job that has been replayed.
+
+### `GET /api/jobs/{id}/dag`
+
+Dependency graph for a job (nodes and edges).
+
+### `POST /api/jobs/{id}/cancel`
+
+Cancel a pending job.
+
+```json
+{ "cancelled": true }
+```
+
+### `POST /api/jobs/{id}/replay`
+
+Replay a completed or failed job with the same payload.
+
+```json
+{ "replay_job_id": "01H5K7Y..." }
+```
+
+## Dead letters
+
+### `GET /api/dead-letters`
+
+Paginated list of dead letter entries. Supports `limit` and `offset`
+parameters.
+
+### `POST /api/dead-letters/{id}/retry`
+
+Re-enqueue a dead letter job.
+
+```json
+{ "new_job_id": "01H5K7Y..." }
+```
+
+### `POST /api/dead-letters/purge`
+
+Purge all dead letters.
+
+```json
+{ "purged": 42 }
+```
+
+## Metrics
+
+### `GET /api/metrics`
+
+Per-task execution metrics.
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `task` | `string` | all | Filter by task name |
+| `since` | `int` | `3600` | Lookback window in seconds |
+
+### `GET /api/metrics/timeseries`
+
+Time-bucketed metrics for charts.
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `task` | `string` | all | Filter by task name |
+| `since` | `int` | `3600` | Lookback window in seconds |
+| `bucket` | `int` | `60` | Bucket size in seconds |
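+
+To make `bucket` concrete: each event lands in the bucket whose start time is
+its timestamp rounded down to a multiple of the bucket size. A client-side
+sketch of the same grouping (the server-side implementation may differ):
+
+```python
+from collections import Counter
+
+def bucket_counts(timestamps, bucket=60):
+    """Count events per fixed-size time bucket, keyed by bucket start."""
+    return Counter((ts // bucket) * bucket for ts in timestamps)
+```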
+
+## Logs
+
+### `GET /api/logs`
+
+Query task execution logs across all jobs.
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `task` | `string` | all | Filter by task name |
+| `level` | `string` | all | Filter by log level |
+| `since` | `int` | `3600` | Lookback window in seconds |
+| `limit` | `int` | `100` | Max entries |
+
+## Infrastructure
+
+### `GET /api/workers`
+
+List registered workers with heartbeat status.
+
+### `GET /api/circuit-breakers`
+
+Current state of all circuit breakers.
+
+### `GET /api/resources`
+
+Worker resource health and pool status.
+
+### `GET /api/queues/paused`
+
+List paused queue names.
+
+### `POST /api/queues/{name}/pause`
+
+Pause a queue (jobs stop being dequeued).
+
+### `POST /api/queues/{name}/resume`
+
+Resume a paused queue.
+
+## Observability
+
+### `GET /api/proxy-stats`
+
+Per-handler proxy reconstruction metrics.
+
+### `GET /api/interception-stats`
+
+Interception strategy performance metrics.
+
+### `GET /api/scaler`
+
+KEDA-compatible autoscaler payload. Pass `?queue=name` for a specific queue.
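+
+As an illustration (not taskito's documented schema — the `valueLocation`
+field and the target value here are assumptions for this sketch), the payload
+could be consumed by KEDA's `metrics-api` scaler:
+
+```yaml
+# Hypothetical ScaledObject sketch — verify field names against your payload
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: taskito-worker
+spec:
+  scaleTargetRef:
+    name: taskito-worker
+  triggers:
+    - type: metrics-api
+      metadata:
+        url: "http://dashboard:8080/api/scaler?queue=default"
+        valueLocation: "pending"   # assumed field in the scaler payload
+        targetValue: "10"
+```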
+
+### `GET /health`
+
+Liveness check. Always returns `{"status": "ok"}`.
+
+### `GET /readiness`
+
+Readiness check with storage, worker, and resource health.
+
+### `GET /metrics`
+
+Prometheus metrics endpoint (requires `prometheus-client` package).
+
+## Using the API programmatically
+
+```python
+import requests
+
+# Health check script
+stats = requests.get("http://localhost:8080/api/stats").json()
+
+if stats["dead"] > 0:
+ print(f"WARNING: {stats['dead']} dead letter(s)")
+
+if stats["running"] > 100:
+ print(f"WARNING: {stats['running']} jobs running, possible backlog")
+```
+
+```python
+# Pause a queue during deployment
+requests.post("http://localhost:8080/api/queues/default/pause")
+
+# ... deploy ...
+
+# Resume after deployment
+requests.post("http://localhost:8080/api/queues/default/resume")
+```
+
+```python
+# Retry all dead letters
+dead = requests.get("http://localhost:8080/api/dead-letters?limit=100").json()
+for entry in dead:
+ requests.post(f"http://localhost:8080/api/dead-letters/{entry['id']}/retry")
+ print(f"Retried {entry['task_name']}")
+```
diff --git a/docs-next/content/docs/guides/observability/dashboard.mdx b/docs-next/content/docs/guides/observability/dashboard.mdx
new file mode 100644
index 0000000..dd8e905
--- /dev/null
+++ b/docs-next/content/docs/guides/observability/dashboard.mdx
@@ -0,0 +1,265 @@
+---
+title: Web Dashboard
+description: "Zero-dependency built-in web UI for browsing jobs, metrics, workers, and managing the queue."
+---
+
+import { Callout } from "fumadocs-ui/components/callout";
+import { Tab, Tabs } from "fumadocs-ui/components/tabs";
+
+taskito ships with a built-in web dashboard for monitoring jobs, inspecting
+dead letters, and managing your task queue in real time. The dashboard is a
+single-page application served directly from the Python package — **zero
+extra dependencies required**.
+
+## Launching the dashboard
+
+<Tabs items={["CLI", "Python"]}>
+<Tab value="CLI">
+
+```bash
+taskito dashboard --app myapp:queue
+```
+
+The `--app` argument uses the same `module:attribute` format as the worker.
+
+</Tab>
+<Tab value="Python">
+
+```python
+from taskito.dashboard import serve_dashboard
+from myapp import queue
+
+serve_dashboard(queue, host="0.0.0.0", port=8000)
+```
+
+</Tab>
+</Tabs>
+
+By default the dashboard starts on `http://localhost:8080`.
+
+### CLI options
+
+| Flag | Default | Description |
+|---|---|---|
+| `--app` | *required* | Module path to your `Queue` instance, e.g. `myapp:queue` |
+| `--host` | `127.0.0.1` | Bind address |
+| `--port` | `8080` | Bind port |
+
+```bash
+# Bind to all interfaces on port 9000
+taskito dashboard --app myapp:queue --host 0.0.0.0 --port 9000
+```
+
+<Callout>
+ The dashboard reads directly from the same SQLite database as the worker.
+ You can run them side by side without any coordination:
+
+ ```bash
+ # Terminal 1
+ taskito worker --app myapp:queue
+
+ # Terminal 2
+ taskito dashboard --app myapp:queue
+ ```
+</Callout>
+
+## Dashboard features
+
+The dashboard is a React + Vite + TypeScript SPA routed via TanStack Router,
+styled with Tailwind v4 and shadcn/ui, and shipped as hash-busted multi-file
+assets under `py_src/taskito/static/dashboard/`.
+
+### Design
+
+- **Dark and light mode** — Toggle between themes via the sun/moon button in the header. Preference is stored in `localStorage` and follows the system scheme by default.
+- **Auto-refresh** — Configurable refresh interval (2s, 5s, 10s, or off) via the header dropdown. All pages auto-refresh at the selected interval; TanStack Query handles caching and background revalidation.
+- **Command palette** — `⌘K` / `Ctrl+K` opens a cmdk palette for route navigation and common actions.
+- **Icons** — Lucide icons throughout for visual clarity.
+- **Toast notifications** — Every action shows a success or error toast via sonner. Optimistic mutations update the UI immediately and roll back on error.
+- **Destructive confirms** — Irreversible actions (purge, retry all) use a type-to-confirm dialog.
+- **Loading states** — Skeleton screens for tables and cards, error boundaries with retry.
+- **Responsive layout** — Sidebar navigation with grouped sections (Monitoring, Infrastructure, Advanced). The main content area scrolls independently.
+
+### Pages
+
+| Page | Description |
+|---|---|
+| **Overview** | Stats cards with status icons, throughput sparkline chart, recent jobs table |
+| **Jobs** | Filterable job listing (status, queue, task, metadata, error, date range) with pagination |
+| **Job Detail** | Full job info, error history, task logs, replay history, dependency DAG visualization |
+| **Metrics** | Per-task performance table (avg, P50, P95, P99) with timeseries chart and time range selector |
+| **Logs** | Structured task execution logs with task/level filters |
+| **Workers** | Worker cards with heartbeat status, queue assignments, and tags |
+| **Queues** | Per-queue stats (pending/running), pause and resume controls |
+| **Resources** | Worker DI runtime status — health, scope, init duration, pool stats, dependencies |
+| **Circuit Breakers** | Automatic failure protection state (closed/open/half_open), thresholds, cooldowns |
+| **Dead Letters** | Failed jobs that exhausted retries — retry individual entries or purge all |
+| **System** | Proxy reconstruction and interception strategy metrics |
+
+<Callout>
+ The built SPA ships inside the Python wheel under
+ `py_src/taskito/static/dashboard/` and is served by the Python dashboard
+ process. No Node.js, no pnpm, no CDN at runtime — just `pip install
+ taskito`. Node.js and pnpm are only needed by contributors rebuilding the
+ dashboard source in `dashboard/`.
+</Callout>
+
+## Tutorial
+
+This walkthrough covers every dashboard page and how to use it.
+
+### Step 1: start the dashboard
+
+Start a worker and the dashboard in two terminals:
+
+```bash
+# Terminal 1 — start the worker
+taskito worker --app myapp:queue
+
+# Terminal 2 — start the dashboard
+taskito dashboard --app myapp:queue
+```
+
+You should see:
+
+```
+taskito dashboard → http://127.0.0.1:8080
+Press Ctrl+C to stop
+```
+
+Open `http://localhost:8080` in your browser.
+
+### Step 2: Overview page
+
+The first page you see is the **Overview**. It shows:
+
+- **Stats cards** — Six cards at the top showing pending, running, completed, failed, dead, and cancelled job counts.
+- **Throughput chart** — A green sparkline showing jobs processed per second over the last 60 refresh intervals.
+- **Recent jobs table** — The 10 most recent jobs. Click any row to open its detail view.
+
+The stats update automatically based on the refresh interval you select in
+the header (default: 5 seconds).
+
+### Step 3: browsing and filtering jobs
+
+Click **Jobs** in the sidebar. This page shows:
+
+- **Stats grid** — Same six stat cards as the overview.
+- **Filter panel** — Status dropdown, queue, task, metadata, error text, created-after/before pickers.
+- **Results table** — Paginated list showing ID, task, queue, status, priority, progress, retries, and created time.
+
+Use the **Prev / Next** buttons at the bottom to paginate.
+
+### Step 4: inspecting a job
+
+Click any job row to open the **Job Detail** page. The detail card shows:
+
+- A colored top border matching the job status (green for completed, red for failed, etc.)
+- Full job ID, status badge, task name, queue, priority, progress bar, retries, timestamps
+- **Error** field (if the job failed) displayed in a red-highlighted box
+- Unique key and metadata (if set)
+
+**Actions:**
+
+- **Cancel Job** — Visible only for pending jobs. Sends a cancel request and shows a toast.
+- **Replay** — Re-enqueue the job with the same payload. Navigates to the new job's detail page.
+
+**Sections below the detail card:** Error History, Task Logs, Replay
+History, and a Dependency Graph visualization for jobs with dependencies.
+
+### Step 5: monitoring metrics
+
+Click **Metrics** in the sidebar. This page shows a time-range selector (1h
+/ 6h / 24h), a stacked bar chart of success/failure counts per time bucket,
+and a per-task table with avg / P50 / P95 / P99 / min / max latency.
+
+### Step 6: viewing logs
+
+Click **Logs** in the sidebar. Filter by task name or level. Each log entry
+shows time, level badge, task name, job ID, message, and structured extra
+data.
+
+### Step 7: workers
+
+Click **Workers**. Each active worker is displayed as a card showing the
+green dot for liveness, worker ID, queues consumed, last heartbeat,
+registration time, and tags.
+
+### Step 8: managing queues
+
+Click **Queues**. Per-queue table with pending/running counts, pause/resume
+buttons, and status badges.
+
+<Callout>
+ Pausing a queue prevents the scheduler from dequeuing new jobs from it.
+ Jobs already running will complete normally. Enqueuing new jobs still
+ works — they'll be picked up when the queue is resumed.
+</Callout>
+
+### Step 9: resources
+
+Click **Resources**. Shows registered worker DI runtime entries (name,
+scope, health, init duration, recreations, dependencies, pool stats).
+
+### Step 10: circuit breakers
+
+Click **Circuit Breakers**. State badge (closed/open/half_open), failure
+count, threshold, window, cooldown.
+
+### Step 11: dead letter queue
+
+Click **Dead Letters**. Retry individual entries with the **Retry** button,
+or purge all with the type-to-confirm **Purge All** in the header.
+
+### Step 12: system internals
+
+Click **System**. Two tables: Proxy Reconstruction (per-handler metrics)
+and Interception (per-strategy metrics).
+
+### Step 13: switching themes
+
+Click the sun/moon icon in the top-right of the header.
+
+### Step 14: changing refresh rate
+
+Use the **Refresh** dropdown in the header — 2s, 5s, 10s, or off.
+
+<Callout>
+ The dashboard also exposes a full JSON API. See the
+ [Dashboard REST API](/docs/guides/observability/dashboard-api) reference
+ for all endpoints.
+</Callout>
+
+## Development
+
+Contributors who want to modify the dashboard source:
+
+```bash
+# Install dependencies (pnpm is pinned via the `packageManager` field)
+cd dashboard && pnpm install
+
+# Start Vite dev server (proxies /api/* to localhost:8080)
+pnpm run dev
+
+# In another terminal, start the backend
+taskito dashboard --app myapp:queue
+
+# Build and copy to Python package
+pnpm run build
+```
+
+<Callout>
+ Run `corepack enable` once (Node 16+) and pnpm will be provisioned
+ automatically from the version pinned in `dashboard/package.json`.
+</Callout>
+
+The build produces a static `index.html` plus hashed JS/CSS chunks under
+`py_src/taskito/static/dashboard/`. The built assets aren't committed —
+release tooling runs `pnpm -C dashboard build` before packaging so the
+wheel ships them.
+
+<Callout type="warn">
+ The dashboard does not include authentication. If you expose it beyond
+ `localhost`, place it behind a reverse proxy with authentication (e.g.
+ nginx with basic auth, or an OAuth2 proxy).
+</Callout>
diff --git a/docs-next/content/docs/guides/observability/index.mdx b/docs-next/content/docs/guides/observability/index.mdx
index e808a43..2a42e90 100644
--- a/docs-next/content/docs/guides/observability/index.mdx
+++ b/docs-next/content/docs/guides/observability/index.mdx
@@ -1,10 +1,13 @@
---
title: Observability
-description: "Events, metrics, logs, OTel/Sentry/Prometheus."
+description: "Monitor, log, and inspect your task queue in real time."
---
-import { Callout } from 'fumadocs-ui/components/callout';
+Monitor, log, and inspect your task queue in real time.
-<Callout>
- Content port pending. See the [Zensical source](https://github.com/ByteVeda/taskito/tree/master/docs) for current text.
-</Callout>
+| Guide | Description |
+|---|---|
+| [Monitoring & Hooks](/docs/guides/observability/monitoring) | Queue stats, progress tracking, worker heartbeat, and alerting hooks |
+| [Structured Logging](/docs/guides/observability/logging) | Per-task structured logs with automatic context |
+| [Web Dashboard](/docs/guides/observability/dashboard) | Built-in web UI for browsing jobs, metrics, and worker status |
+| [Dashboard REST API](/docs/guides/observability/dashboard-api) | Programmatic access to all dashboard data via REST endpoints |
diff --git a/docs-next/content/docs/guides/observability/logging.mdx b/docs-next/content/docs/guides/observability/logging.mdx
new file mode 100644
index 0000000..183bc7c
--- /dev/null
+++ b/docs-next/content/docs/guides/observability/logging.mdx
@@ -0,0 +1,131 @@
+---
+title: Structured Task Logging
+description: "Per-task structured logs with current_job.log() — queryable and visible in the dashboard."
+---
+
+taskito provides structured logging from within tasks via
+`current_job.log()`. Logs are stored in the database alongside job data,
+making them queryable and visible in the dashboard.
+
+## Writing logs
+
+Use `current_job.log()` inside any task:
+
+```python
+from taskito import current_job
+
+@queue.task()
+def process_order(order_id: int):
+ current_job.log("Starting order processing", extra={"order_id": order_id})
+
+ items = fetch_items(order_id)
+ current_job.log(f"Found {len(items)} items", level="debug")
+
+ for item in items:
+ try:
+ process_item(item)
+ except ValueError as e:
+ current_job.log(f"Skipping invalid item: {e}", level="warning", extra={"item": item})
+
+ current_job.log("Order processing complete")
+```
+
+### Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `message` | `str` | *required* | The log message |
+| `level` | `str` | `"info"` | Log level: `"debug"`, `"info"`, `"warning"`, `"error"` |
+| `extra` | `dict \| None` | `None` | Structured data to attach as JSON |
+
+## Querying logs
+
+### Per-job logs
+
+```python
+logs = queue.task_logs(job_id)
+for log in logs:
+ print(f"[{log['level']}] {log['message']}")
+```
+
+### Cross-job log query
+
+```python
+logs = queue.query_logs(
+ task_name="myapp.tasks.process_order",
+ level="error",
+ since=1700000000,
+ limit=50,
+)
+```
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `task_name` | `str \| None` | Filter by task name |
+| `level` | `str \| None` | Filter by log level |
+| `since` | `int \| None` | Unix timestamp — only logs after this time |
+| `limit` | `int` | Maximum number of logs to return |
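+
+The query results compose well with standard tooling. For example, a quick
+per-level tally (a sketch; it only assumes each entry has a `level` key, as
+shown above):
+
+```python
+from collections import Counter
+
+def level_summary(logs):
+    """Tally log entries by level, e.g. for a quick health report."""
+    return Counter(log["level"] for log in logs)
+```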
+
+## Dashboard
+
+Logs are accessible via the dashboard REST API:
+
+- **`GET /api/jobs/{id}/logs`** — logs for a specific job
+- **`GET /api/logs`** — query logs across all jobs (supports `limit` and `offset` parameters)
+
+```bash
+# Logs for a specific job
+curl http://localhost:8080/api/jobs/01H5K6X.../logs
+
+# Recent logs across all jobs
+curl http://localhost:8080/api/logs?limit=20
+```
+
+## Examples
+
+### ETL pipeline with progress logging
+
+```python
+from taskito import current_job
+
+@queue.task()
+def etl_pipeline(source: str, destination: str):
+ current_job.log("Starting extraction", extra={"source": source})
+
+ records = extract(source)
+ current_job.log(f"Extracted {len(records)} records", level="info")
+ current_job.update_progress(33)
+
+ transformed = []
+ for i, record in enumerate(records):
+ try:
+ transformed.append(transform(record))
+ except Exception as e:
+ current_job.log(
+ f"Transform failed for record {i}",
+ level="warning",
+ extra={"record_id": record.get("id"), "error": str(e)},
+ )
+ current_job.update_progress(66)
+
+ loaded = load(destination, transformed)
+ current_job.log(
+ "Pipeline complete",
+ extra={"extracted": len(records), "loaded": loaded, "skipped": len(records) - loaded},
+ )
+ current_job.update_progress(100)
+```
+
+### Debugging failed jobs
+
+```python
+# After a job fails, inspect its logs to understand what happened
+job = queue.get_job(failed_job_id)
+logs = queue.task_logs(failed_job_id)
+
+print(f"Job {job.id} ({job.task_name}): {job.status}")
+for log in logs:
+ print(f" [{log['level'].upper()}] {log['message']}")
+ if log.get("extra"):
+ print(f" {log['extra']}")
+```
diff --git a/docs-next/content/docs/guides/observability/meta.json b/docs-next/content/docs/guides/observability/meta.json
index a2f7e51..6eb266f 100644
--- a/docs-next/content/docs/guides/observability/meta.json
+++ b/docs-next/content/docs/guides/observability/meta.json
@@ -1,4 +1,10 @@
{
"title": "Observability",
- "pages": ["index"]
+ "pages": [
+ "index",
+ "monitoring",
+ "logging",
+ "dashboard",
+ "dashboard-api"
+ ]
}
diff --git a/docs-next/content/docs/guides/observability/monitoring.mdx b/docs-next/content/docs/guides/observability/monitoring.mdx
new file mode 100644
index 0000000..c9c0eb5
--- /dev/null
+++ b/docs-next/content/docs/guides/observability/monitoring.mdx
@@ -0,0 +1,266 @@
+---
+title: Monitoring & Hooks
+description: "Queue stats, progress tracking, worker heartbeat, hooks, and Prometheus/Grafana setup."
+---
+
+import { Callout } from "fumadocs-ui/components/callout";
+
+## Queue statistics
+
+Get a snapshot of job counts by status:
+
+```python
+stats = queue.stats()
+# {'pending': 12, 'running': 3, 'completed': 450, 'failed': 2, 'dead': 1, 'cancelled': 0}
+```
+
+Async variant:
+
+```python
+stats = await queue.astats()
+```
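+
+The snapshot is a plain dict, so derived metrics are one-liners. For example,
+a failure-rate helper (a sketch over the keys shown above):
+
+```python
+def failure_rate(stats):
+    """Fraction of finished jobs that failed or went dead."""
+    finished = stats["completed"] + stats["failed"] + stats["dead"]
+    return (stats["failed"] + stats["dead"]) / finished if finished else 0.0
+```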
+
+## CLI monitoring
+
+### One-shot stats
+
+```bash
+taskito info --app myapp:queue
+```
+
+```
+taskito queue statistics
+------------------------------
+ pending 12
+ running 3
+ completed 450
+ failed 2
+ dead 1
+ cancelled 0
+------------------------------
+ total 468
+```
+
+### Live dashboard
+
+```bash
+taskito info --app myapp:queue --watch
+```
+
+Refreshes every 2 seconds with throughput calculation (completed jobs per second).
+
+## Progress tracking
+
+Report progress from inside tasks using `current_job`:
+
+```python
+from taskito import current_job
+
+@queue.task()
+def process_batch(items):
+ total = len(items)
+ for i, item in enumerate(items):
+ process(item)
+ current_job.update_progress(int((i + 1) / total * 100))
+ return f"Processed {total} items"
+```
+
+Read progress from outside:
+
+```python
+job = process_batch.delay(items)
+
+# Poll progress
+fetched = queue.get_job(job.id)
+print(fetched.progress) # 0-100 or None
+```
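+
+Polling in a loop is the usual pattern; here is a small helper (a sketch —
+the timeout and interval defaults are illustrative, and `fetch_job` is any
+callable returning an object with a `.progress` attribute, such as
+`queue.get_job`):
+
+```python
+import time
+
+def wait_for_progress(fetch_job, job_id, target=100, timeout=30.0, interval=0.5):
+    """Poll a job until its progress reaches `target` percent, or time out."""
+    deadline = time.monotonic() + timeout
+    while True:
+        job = fetch_job(job_id)
+        if (job.progress or 0) >= target:
+            return job
+        if time.monotonic() >= deadline:
+            raise TimeoutError(f"job {job_id} stuck at {job.progress}% after {timeout}s")
+        time.sleep(interval)
+```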
+
+### Job context
+
+Inside a running task, `current_job` provides:
+
+| Property | Type | Description |
+|---|---|---|
+| `current_job.id` | `str` | The current job ID |
+| `current_job.task_name` | `str` | The registered task name |
+| `current_job.retry_count` | `int` | Current retry attempt (0 = first run) |
+| `current_job.queue_name` | `str` | The queue this job is running on |
+
+```python
+from taskito import current_job
+
+@queue.task()
+def my_task():
+ print(f"Running job {current_job.id}")
+ print(f"Task: {current_job.task_name}")
+ print(f"Attempt: {current_job.retry_count}")
+ print(f"Queue: {current_job.queue_name}")
+```
+
+<Callout>
+ `current_job` properties raise `RuntimeError` when accessed outside of a
+ running task.
+</Callout>
+
+## Worker heartbeat
+
+Monitor active workers and their health:
+
+```python
+workers = queue.workers()
+for w in workers:
+ print(f"Worker {w['worker_id']}: {w['status']} (last seen: {w['last_heartbeat']})")
+```
+
+Async variant:
+
+```python
+workers = await queue.aworkers()
+```
+
+The worker heartbeat is also available via the dashboard REST API at
+`GET /api/workers`. See the
+[Dashboard](/docs/guides/observability/dashboard) guide for details.
+
+## Events system
+
+taskito includes an in-process event bus for reacting to job lifecycle
+events (`JOB_ENQUEUED`, `JOB_COMPLETED`, `JOB_FAILED`, `JOB_RETRYING`,
+`JOB_DEAD`, `JOB_CANCELLED`). Events can also be delivered as signed HTTP
+webhooks to external systems.
+
+[Events & Webhooks guide →](/docs/guides/extensibility)
+
+## Prometheus metrics
+
+For production monitoring, the optional Prometheus integration provides
+counters, histograms, and gauges for task execution:
+
+```bash
+pip install taskito[prometheus]
+```
+
+[Prometheus integration →](/docs/guides/integrations)
+
+## Hooks
+
+Run code before/after every task, or on success/failure.
+
+### `@queue.before_task`
+
+Called before each task executes:
+
+```python
+@queue.before_task
+def log_start(task_name, args, kwargs):
+ print(f"[START] {task_name}")
+```
+
+### `@queue.after_task`
+
+Called after each task, regardless of success or failure:
+
+```python
+@queue.after_task
+def log_end(task_name, args, kwargs, result, error):
+ status = "OK" if error is None else f"FAILED: {error}"
+ print(f"[END] {task_name} - {status}")
+```
+
+### `@queue.on_success`
+
+Called only when a task succeeds:
+
+```python
+@queue.on_success
+def track_metrics(task_name, args, kwargs, result):
+ metrics.increment(f"task.{task_name}.success")
+```
+
+### `@queue.on_failure`
+
+Called only when a task raises an exception:
+
+```python
+@queue.on_failure
+def alert_on_error(task_name, args, kwargs, error):
+ sentry_sdk.capture_exception(error)
+```
+
+### Hook signatures
+
+| Hook | Signature |
+|---|---|
+| `before_task` | `fn(task_name, args, kwargs)` |
+| `after_task` | `fn(task_name, args, kwargs, result, error)` |
+| `on_success` | `fn(task_name, args, kwargs, result)` |
+| `on_failure` | `fn(task_name, args, kwargs, error)` |
+
+<Callout>
+ You can register multiple hooks of the same type. They execute in
+ registration order.
+</Callout>
+
+## Grafana setup
+
+A minimal Prometheus + Grafana stack for monitoring taskito:
+
+```yaml
+# docker-compose.monitoring.yml
+services:
+ prometheus:
+ image: prom/prometheus
+ volumes:
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml
+ ports:
+ - "9090:9090"
+
+ grafana:
+ image: grafana/grafana
+ ports:
+ - "3000:3000"
+ environment:
+ - GF_SECURITY_ADMIN_PASSWORD=admin
+```
+
+```yaml
+# prometheus.yml
+scrape_configs:
+ - job_name: taskito
+ static_configs:
+ - targets: ["host.docker.internal:8080"]
+ metrics_path: /metrics
+```
+
+### Essential Grafana panels
+
+**Queue depth** (gauge):
+
+```text
+taskito_queue_depth{queue="default"}
+```
+
+**Job processing rate** (rate):
+
+```text
+rate(taskito_jobs_completed_total[5m])
+```
+
+**Job duration p99** (histogram):
+
+```text
+histogram_quantile(0.99, rate(taskito_job_duration_seconds_bucket[5m]))
+```
+
+### Alert rules
+
+```yaml
+# Alert if queue depth stays above 1000 for 5 minutes
+- alert: TaskitoQueueBacklog
+ expr: taskito_queue_depth > 1000
+ for: 5m
+
+# Alert if p99 latency exceeds 5 seconds
+- alert: TaskitoHighLatency
+ expr: histogram_quantile(0.99, rate(taskito_job_duration_seconds_bucket[5m])) > 5
+```