From ea9fac3ce7fa6275bf94deb6acbb0dc13a949500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Mon, 18 May 2026 22:51:59 +0200 Subject: [PATCH 01/26] docs: add base food catalog crawler design spec --- ...-05-18-base-food-catalog-crawler-design.md | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md diff --git a/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md b/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md new file mode 100644 index 00000000..838e0aaa --- /dev/null +++ b/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md @@ -0,0 +1,199 @@ +# Base Food Catalog (Crawled) — Design + +**Date:** 2026-05-18 +**Status:** Design — pending implementation plan +**Scope:** Build a pre-populated, access-gated food catalog so users stop hand-entering Swiss products that Open Food Facts lacks. + +## 1. Goal + +Today every food is user-scoped and Open Food Facts coverage of Swiss retail products is thin, so daily logging requires tedious manual food creation. This feature ships a large pre-built **catalog** of Swiss products (Migros, Coop) plus Open Food Facts, so that searching or barcode-scanning surfaces a ready-made hit the user can log in one tap. + +The catalog is built **offline on the maintainer's machine** by a crawler kept in this repo, exported to a file, and uploaded to production via an admin CLI. Access is granted per `(user, dataset)` by the maintainer — it is not visible to all users automatically. + +## 2. Non-goals + +- Committing any crawled data to the repository (public repo → that would be public redistribution). Crawler _code_ ships; crawled _data_ never does. +- Rehosting retailer images (store the source URL only, exactly as the current OFF flow does). +- Automatic/scheduled re-crawl or live retailer calls from the app runtime. Refresh is a manual CLI re-import. 
+- Cross-dataset canonical product merge, fuzzy non-barcode dedup, or a catalog admin web UI. +- Changing how personal foods work. `foods.userId` stays `NOT NULL`; the catalog is a separate, read-only structure. +- MCP changes beyond catalog results transparently appearing in existing tools. MCP stays web-only. + +## 3. Scope decisions (ratified with user) + +| Dimension | Decision | +| --------------- | ----------------------------------------------------------------------------------------------------- | +| Data posture | Crawler code in repo; dataset built locally and uploaded; never committed | +| Sources (v1) | **Migros + Coop + Open Food Facts** | +| Catalog size | **Full food catalog** (maximize barcode hit-rate; non-food categories excluded) | +| Storage model | **Separate tables + copy-on-use** — multi-dataset, with an M:N user↔dataset access map | +| Upload + access | **Admin CLI** against prod `DATABASE_URL`; access = a `(user, dataset)` grant row | +| Crawler stack | **TypeScript/Bun, no framework** — Migros via `migros-api-wrapper`, OFF via dump, Coop via Playwright | +| Spec/plan shape | One spec, phased implementation plan | + +## 4. 
Architecture + +``` +[ Offline — maintainer machine ] [ Production app ] + +crawler/ src/lib/server/catalog/ + adapters/ ├─ catalogSearch() ┐ + migros (migros-api-wrapper, JSON API) ─┐ ├─ catalogByBarcode()├─ access-gated + coop (Playwright, internal XHR/DOM) ──┼─► normalize ├─ copyCatalogFood() ┘ by catalog_access + off (ODbL bulk dump, streamed) ─────┘ (reuse └─ surfaced through existing + lib/ throttle · retry · resumable cache · src/lib/ foods.ts search / barcode + checkpoint · jsonl writer nutrients.ts) + MCP + food picker UI + │ + ▼ + data/catalog/-.jsonl ──► bun run catalog:import ──► DB: + (gitignored, Zod-validated) (prod DATABASE_URL) catalog_datasets + ▲ catalog_foods + the dataset file is the seam catalog_access +``` + +The **normalized JSONL dataset file** is the contract between the offline and online halves. It is specified first (section 6) so adapters and the importer evolve independently. The crawler reuses `src/lib/nutrients.ts` and the dataset Zod schema directly — no cross-language reimplementation. + +## 5. Data model + +New Drizzle tables in `src/lib/server/schema.ts`, migration `0037` (last existing is `0036`). Generate with `bun run db:generate` (never `db:push`); verify SQL; let `runMigrations()` apply on dev start. + +### 5.1 `catalog_datasets` + +One row per importable bundle. Identified by a stable `key` so re-imports preserve access grants. + +| Column | Type | Notes | +| -------------- | ------------- | ----------------------------------------- | +| `id` | uuid pk | `defaultRandom()` | +| `key` | text unique | Stable, e.g. `migros`, `coop`, `off-ch` | +| `name` | text not null | Display name, e.g. 
"Migros (Switzerland)" | +| `source` | text not null | `migros` \| `coop` \| `off` | +| `description` | text | Optional | +| `productCount` | integer | Set at import | +| `version` | text | Crawler version / build tag | +| `snapshotAt` | timestamptz | When the crawl was taken | +| `createdAt` | timestamptz | `defaultNow()` | +| `updatedAt` | timestamptz | Bumped on re-import | + +### 5.2 `catalog_foods` + +Read-only product rows. FK → `catalog_datasets` `ON DELETE CASCADE`. Re-import replaces all rows for a dataset in one transaction. + +- `id uuid pk`, `datasetId uuid → catalog_datasets` +- `name text not null`, `brand text`, `language text` (`de` \| `fr` \| `it` \| `en`, metadata only — no name translation) +- `servingSize real not null`, `servingUnit` (same enum as `foods`) +- 5 core macros (`calories`, `protein`, `carbs`, `fat`, `fiber`) — `real not null` +- All **43 extended nutrients** — `real`, nullable — column names generated from `ALL_NUTRIENTS[].dbColumn` in `src/lib/nutrients.ts` (single source of truth; identical to `foods`) +- `barcode text` (GTIN, nullable) +- OFF-quality fields: `nutriScore text`, `novaGroup integer`, `additives text[]`, `ingredientsText text`, `imageUrl text` (source URL only — not rehosted) +- Provenance: `sourceUrl text`, `sourceRef text` (retailer product id), `crawledAt timestamptz` + +Indexes: + +- `(datasetId)` +- `(datasetId, barcode)` — barcode lookup within granted datasets +- **`pg_trgm` GIN index on `name`** — full-catalog `ILIKE` substring search needs it. (Per-user `foods` `ILIKE` is fine at small scale; tens of thousands of shared rows is not.) The migration must `CREATE EXTENSION IF NOT EXISTS pg_trgm`. + +`catalog_foods` is never mutated by the app and never logged directly. + +### 5.3 `catalog_access` + +M:N grant of datasets to users. 
+ +- `userId uuid → users` (`ON DELETE CASCADE`) +- `datasetId uuid → catalog_datasets` (`ON DELETE CASCADE`) +- `grantedAt timestamptz default now()` +- Composite primary key `(userId, datasetId)` + +A user sees catalog rows only from datasets they have a grant for. Grants survive re-import because `catalog_datasets.id` is stable per `key`. + +## 6. Normalized dataset file format + +JSONL (streamable for tens of thousands of rows). Defined once as a Zod schema in `src/lib/server/catalog/dataset-schema.ts`, **shared by crawler output and import validation** (single source of truth for the contract). + +- Line 1: header record `{ "_dataset": { "key", "name", "source", "version", "snapshotAt" } }` +- Lines 2..n: one product per line, fields = `catalog_foods` columns minus ids/timestamps, nutrients per 100 g (same normalization the existing OFF mapper uses). + +The importer rejects the whole file if any line fails schema validation (fail-closed; a partial bad dataset is worse than none). + +## 7. Crawler pipeline + +Location: `crawler/` (top-level; not part of the SvelteKit app or its build). TypeScript run with Bun. Output: `data/catalog/-.jsonl`. + +### 7.1 Adapter contract + +```ts +interface SourceAdapter { + key: string; // dataset key + crawl(opts): AsyncIterable<RawProduct>; // source-specific +} +``` + +A shared normalizer maps `RawProduct → DatasetProduct` (the Zod schema), reusing `ALL_NUTRIENTS` keys/units and `offConversion` from `src/lib/nutrients.ts`. Each adapter is independently testable against recorded fixtures. + +### 7.2 Source adapters + +- **Migros** — depend on the `migros-api-wrapper` npm package (TS, maintained: `onesearch` v5 search → `product-display /v2/product-detail`, guest-token handling). Page through the food category tree; map nutrition table → normalized nutrients. Inherits upstream maintenance against Migros's changing endpoints. 
+- **Open Food Facts** — download the ODbL bulk dump (JSONL), stream-parse, filter to Swiss/relevant + food categories, map via the existing OFF logic in `src/lib/server/openfoodfacts.ts` (refactor its mapper into a reusable function shared by the live API route and the crawler). No live API hammering. +- **Coop** — no public API. Playwright (already a repo dependency): walk the food category tree, prefer the page's **internal XHR JSON** (network response) over DOM parsing; DOM (the "Nutrition information" tab) is the fallback. Most fragile → built last, isolated. + +### 7.3 Crawl guardrails (`crawler/lib/`) + +Polite fixed-delay + concurrency cap per host; exponential-backoff retry; on-disk response cache keyed by request (makes re-runs cheap and resumable); checkpoint of last completed page/category; descriptive `User-Agent`; structured progress logging. ~A few hundred LOC shared across adapters. + +## 8. Admin CLIs + +`bun run` scripts under `scripts/` (consistent with existing `scripts/*.ts`). Connect via prod `DATABASE_URL` env. No new HTTP surface. + +- `catalog:import [--replace]` — validate every line with the dataset Zod schema; upsert `catalog_datasets` by `key`; bulk-replace that dataset's `catalog_foods` in one transaction; set `productCount`/`snapshotAt`. +- `catalog:grant ` / `catalog:revoke ` — manage `catalog_access`. +- `catalog:list` — datasets, product counts, and who has access. + +## 9. App integration + +Mirrors the existing Open Food Facts pattern; minimal, targeted changes. + +- **Search** — extend `listFoods()` in `src/lib/server/foods.ts`: also query `catalog_foods` restricted to the requesting user's granted datasets (`catalog_access`), tag results `source: ''`, dedup against personal foods by barcode, order personal-first. Refactor the search path into a small union helper rather than rewriting it. 
+- **Barcode** — extend `findFoodByBarcode()`: lookup order personal → granted catalog → OFF fallback (insert catalog ahead of the existing OFF step in `src/routes/api/openfoodfacts/[barcode]/+server.ts` chain). +- **Copy-on-use** — selecting/logging a catalog result inserts a personal `foods` row via the existing create path (same behavior as picking an OFF result today). Catalog rows are never logged or mutated directly. +- **MCP** — `search_foods` / `find_food_by_barcode` transparently include granted catalog; access enforced server-side by `userId`. No new MCP tools. +- **UI** — catalog results get a small source badge (e.g. "Migros") in the existing food picker (`FoodPicker`). No new screens. + +## 10. Legal & operational guardrails + +- `.gitignore` `data/catalog/`; a `repo: local` prek hook rejects committing `*.jsonl` under it (belt-and-braces against accidental data commit). +- `crawler/README.md` states: data is for this app's private, authenticated user base only; not for redistribution; crawling is rate-limited and cached. +- Provenance (`source`, `sourceUrl`) stored per row and surfaced via the badge for attribution. +- Migros access uses the same guest-token endpoints the website itself uses (via `migros-api-wrapper`); Coop uses a real browser (Playwright). Polite throttling on both. + +## 11. Build sequence (phased plan) + +1. **Schema + dataset format + import/grant CLIs** — proven end-to-end with a tiny hand-written JSONL fixture (no crawler yet). Migration `0037`, `pg_trgm`. +2. **App integration** — search/barcode union, copy-on-use, UI badge, MCP, tests. Full user-facing value against fixture data. +3. **OFF adapter** — easiest (static dump); refactor the OFF mapper for reuse. +4. **Migros adapter** — spike `migros-api-wrapper` endpoints/coverage, then adapter. +5. **Coop adapter** — Playwright, internal-XHR-first, isolated, last. 
+ +Phases 1–2 deliver the entire capability against fixtures; 3–5 are independent source plug-ins that can land in any order. + +## 12. Testing strategy + +- Dataset Zod schema: unit tests (valid/invalid lines, header record). +- Normalizer: per-source recorded-fixture → expected `DatasetProduct` tests (covers nutrient unit conversions). +- Import CLI: integration test against the test DB (`vitest.integration.config.ts` / Testcontainers) — upsert-by-key, replace semantics, grant survival across re-import, fail-closed on a bad line. +- Search/barcode union: integration tests proving access gating (granted vs non-granted user) and personal-first dedup. +- Copy-on-use: a catalog pick creates a personal `foods` row and never mutates `catalog_foods`. +- Adapters: fixture-driven unit tests only (no live network in CI). + +## 13. Open implementation spikes (resolve during planning/Phase 4–5) + +- Exact Migros food-category traversal + nutrition field coverage via `migros-api-wrapper` (does it expose all needed nutrients, or only core macros?). +- OFF bulk-dump format/size and the Swiss/food filter predicate. +- Coop internal XHR availability vs. DOM-only; per-100g normalization quirks. + +## 14. Risks + +- **Retailer endpoint drift** (Migros especially) — mitigated by depending on the maintained `migros-api-wrapper` and keeping adapters fixture-tested and isolated. +- **Coop fragility / anti-bot** — mitigated by real-browser Playwright + caching; accepted that Coop may lag the other sources. +- **Catalog search performance at scale** — mitigated by the `pg_trgm` GIN index; revisit if substring search is still slow (consider FTS). +- **Accidental data commit** — mitigated by `.gitignore` + prek guard hook. +- **Nutrient coverage gaps from sources** — extended nutrients are nullable; core macros required; rows missing required macros are dropped at normalization with a logged count. 
From 3d82f18ca6f5fb1069304e238e36fc1e4b5c6bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Mon, 18 May 2026 23:08:38 +0200 Subject: [PATCH 02/26] docs: revise base food catalog spec after design review --- ...-05-18-base-food-catalog-crawler-design.md | 229 ++++++++++-------- 1 file changed, 134 insertions(+), 95 deletions(-) diff --git a/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md b/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md index 838e0aaa..58bce2b4 100644 --- a/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md +++ b/docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md @@ -1,103 +1,118 @@ # Base Food Catalog (Crawled) — Design **Date:** 2026-05-18 -**Status:** Design — pending implementation plan +**Status:** Design — revised after pre-implementation review; pending implementation plan **Scope:** Build a pre-populated, access-gated food catalog so users stop hand-entering Swiss products that Open Food Facts lacks. +> **Revision note (post-review).** This spec was rewritten after a codebase-grounded design review. Key corrections: the app is an offline-first PWA (catalog is an online, on-demand server path — **never** synced into Dexie); copy-on-use is a **net-new** instantiate action (the current OFF flow is form-prefill, not pick-to-log); `catalog_foods` **hand-lists** the 43 nutrient columns (Drizzle can't generate them); Coop is **deferred to v1.1**; the import CLI **runs on the server host**. + ## 1. Goal -Today every food is user-scoped and Open Food Facts coverage of Swiss retail products is thin, so daily logging requires tedious manual food creation. This feature ships a large pre-built **catalog** of Swiss products (Migros, Coop) plus Open Food Facts, so that searching or barcode-scanning surfaces a ready-made hit the user can log in one tap. 
+Every food is currently user-scoped and Open Food Facts coverage of Swiss retail products is thin, so daily logging requires tedious manual food creation. This feature ships a large pre-built **catalog** of Swiss products so that searching or barcode-scanning surfaces a ready-made product the user can **instantiate into their personal foods in one action** and then log. -The catalog is built **offline on the maintainer's machine** by a crawler kept in this repo, exported to a file, and uploaded to production via an admin CLI. Access is granted per `(user, dataset)` by the maintainer — it is not visible to all users automatically. +The catalog is built **offline on the maintainer's machine** by a crawler kept in this repo, uploaded to the server, and imported via a CLI run **on the server host**. Access is granted per `(user, dataset)` by the maintainer — it is not visible to all users automatically. ## 2. Non-goals - Committing any crawled data to the repository (public repo → that would be public redistribution). Crawler _code_ ships; crawled _data_ never does. -- Rehosting retailer images (store the source URL only, exactly as the current OFF flow does). +- Rehosting retailer images (store the source URL only, as the current OFF flow does). - Automatic/scheduled re-crawl or live retailer calls from the app runtime. Refresh is a manual CLI re-import. -- Cross-dataset canonical product merge, fuzzy non-barcode dedup, or a catalog admin web UI. -- Changing how personal foods work. `foods.userId` stays `NOT NULL`; the catalog is a separate, read-only structure. -- MCP changes beyond catalog results transparently appearing in existing tools. MCP stays web-only. +- **Coop in v1** — deferred to v1.1 (most fragile source; see §11). +- **Live link between a copied food and its catalog origin.** A personal food instantiated from the catalog is a point-in-time **snapshot**; later catalog re-imports do not update it (same semantics as the current OFF prefill). 
Explicit non-goal. +- **In-app admin role or admin UI.** There is no `users.role` concept and none is added. The only "maintainer" is whoever can SSH the server host and run the CLI; access grants are CLI-only. +- Cross-dataset canonical product merge, fuzzy non-barcode dedup. +- MCP changes beyond catalog results appearing in existing tools (server-side, `userId`-gated). MCP stays web-only. +- Changing how personal foods work. `foods.userId` stays `NOT NULL`; catalog is a separate, read-only structure. ## 3. Scope decisions (ratified with user) -| Dimension | Decision | -| --------------- | ----------------------------------------------------------------------------------------------------- | -| Data posture | Crawler code in repo; dataset built locally and uploaded; never committed | -| Sources (v1) | **Migros + Coop + Open Food Facts** | -| Catalog size | **Full food catalog** (maximize barcode hit-rate; non-food categories excluded) | -| Storage model | **Separate tables + copy-on-use** — multi-dataset, with an M:N user↔dataset access map | -| Upload + access | **Admin CLI** against prod `DATABASE_URL`; access = a `(user, dataset)` grant row | -| Crawler stack | **TypeScript/Bun, no framework** — Migros via `migros-api-wrapper`, OFF via dump, Coop via Playwright | -| Spec/plan shape | One spec, phased implementation plan | +| Dimension | Decision | +| --------------- | ------------------------------------------------------------------------------------------------------------ | +| Data posture | Crawler code in repo; dataset built locally, uploaded to server, never committed | +| Sources (v1) | **Migros + Open Food Facts** (Coop → v1.1) | +| Catalog size | **Full food catalog** (maximize barcode hit-rate; non-food categories excluded) | +| Storage model | **Separate tables + copy-on-use** — multi-dataset, with an M:N user↔dataset access map (explicit ask) | +| Search index | **`pg_trgm` GIN on `catalog_foods.name`** — kept; deployment is a self-hosted standard 
Postgres container | +| Upload + access | **CLI run on the server host** (Docker-internal Postgres); access = a `(user, dataset)` grant row | +| Crawler stack | **TypeScript/Bun, no framework** — Migros via `migros-api-wrapper`, OFF via dump (Coop via Playwright, v1.1) | +| Spec/plan shape | One spec, phased implementation plan | ## 4. Architecture ``` -[ Offline — maintainer machine ] [ Production app ] - -crawler/ src/lib/server/catalog/ - adapters/ ├─ catalogSearch() ┐ - migros (migros-api-wrapper, JSON API) ─┐ ├─ catalogByBarcode()├─ access-gated - coop (Playwright, internal XHR/DOM) ──┼─► normalize ├─ copyCatalogFood() ┘ by catalog_access - off (ODbL bulk dump, streamed) ─────┘ (reuse └─ surfaced through existing - lib/ throttle · retry · resumable cache · src/lib/ foods.ts search / barcode - checkpoint · jsonl writer nutrients.ts) + MCP + food picker UI - │ - ▼ - data/catalog/-.jsonl ──► bun run catalog:import ──► DB: - (gitignored, Zod-validated) (prod DATABASE_URL) catalog_datasets - ▲ catalog_foods - the dataset file is the seam catalog_access +[ Offline — maintainer machine ] [ Server host (SSH) ] [ App runtime ] + +crawler/ src/lib/server/catalog/ + adapters/ catalogSearch() ┐ online, + migros (migros-api-wrapper, JSON API)─┐ catalogByBarcode()│ on-demand, + off (ODbL bulk dump, streamed) ────┼─►normalize instantiateFood() ┘ access-gated + (coop — v1.1, Playwright) │ (reuse src/lib │ (catalog_access) + lib/ throttle·retry·cache·checkpoint │ /nutrients.ts + ▼ + ·jsonl writer │ shared OFF new endpoints, NOT in /api/foods, + │ │ nutrient core) NOT synced into Dexie: + ▼ GET /api/catalog/search + data/catalog/-.jsonl ──scp──► dataset on host GET /api/catalog/barcode/:code + (gitignored, Zod-validated) │ POST /api/catalog/:id/save + ▲ ▼ │ → existing createFood() + the dataset file is the seam bun run catalog:import ───────┘ → personal foods row + (on host → Docker-internal → syncs to Dexie normally + Postgres DATABASE_URL) + ▼ + DB: catalog_datasets / 
catalog_foods / catalog_access ``` -The **normalized JSONL dataset file** is the contract between the offline and online halves. It is specified first (section 6) so adapters and the importer evolve independently. The crawler reuses `src/lib/nutrients.ts` and the dataset Zod schema directly — no cross-language reimplementation. +The **normalized JSONL dataset file** is the contract between offline and online halves; it is specified first (§6) so adapters and importer evolve independently. The crawler reuses `src/lib/nutrients.ts` and the dataset Zod schema directly — no cross-language reimplementation. + +**Offline-first invariant (load-bearing):** the app mirrors the user's personal foods into Dexie/IndexedDB via a full-table `/api/foods` sync; `FoodPicker` filters that local array client-side. The catalog must therefore be reached through **dedicated online endpoints** (analogous to the existing online-only `/api/openfoodfacts/[barcode]`) and must **never** enter `/api/foods` or the Dexie mirror. Only a personal food _instantiated from_ a catalog row syncs to Dexie, through the normal foods path. A regression test asserts `/api/foods` never returns catalog rows. ## 5. Data model -New Drizzle tables in `src/lib/server/schema.ts`, migration `0037` (last existing is `0036`). Generate with `bun run db:generate` (never `db:push`); verify SQL; let `runMigrations()` apply on dev start. +New Drizzle tables in `src/lib/server/schema.ts`, migration `0037` (last existing is `0036`). Workflow: edit schema → `bun run db:generate` → **hand-append** the `pg_trgm` extension + GIN index SQL (drizzle-kit does not emit `CREATE EXTENSION` or `gin_trgm_ops`) → verify → let `runMigrations()` apply on dev start. This manual completion is journal-safe because `0037` is a brand-new, never-applied file (the migration-safety rule forbids editing _applied_ files / `db:push`, not completing a fresh generated migration before first apply). 
`0037` must never be regenerated after the manual edit. ### 5.1 `catalog_datasets` One row per importable bundle. Identified by a stable `key` so re-imports preserve access grants. -| Column | Type | Notes | -| -------------- | ------------- | ----------------------------------------- | -| `id` | uuid pk | `defaultRandom()` | -| `key` | text unique | Stable, e.g. `migros`, `coop`, `off-ch` | -| `name` | text not null | Display name, e.g. "Migros (Switzerland)" | -| `source` | text not null | `migros` \| `coop` \| `off` | -| `description` | text | Optional | -| `productCount` | integer | Set at import | -| `version` | text | Crawler version / build tag | -| `snapshotAt` | timestamptz | When the crawl was taken | -| `createdAt` | timestamptz | `defaultNow()` | -| `updatedAt` | timestamptz | Bumped on re-import | +| Column | Type | Notes | +| -------------- | ---------------- | --------------------------------------------- | +| `id` | uuid pk | `defaultRandom()` | +| `key` | text unique | Stable, e.g. `migros`, `off-ch` | +| `name` | text not null | Display name, e.g. "Migros (Switzerland)" | +| `source` | text not null | `migros` \| `off` (`coop` reserved for v1.1) | +| `priority` | integer not null | Lower wins on cross-dataset barcode tie-break | +| `description` | text | Optional | +| `productCount` | integer | Set at import | +| `version` | text | Crawler build tag | +| `snapshotAt` | timestamptz | When the crawl was taken | +| `createdAt` | timestamptz | `defaultNow()` | +| `updatedAt` | timestamptz | Bumped on re-import | ### 5.2 `catalog_foods` -Read-only product rows. FK → `catalog_datasets` `ON DELETE CASCADE`. Re-import replaces all rows for a dataset in one transaction. +Read-only product rows. FK → `catalog_datasets` `ON DELETE CASCADE`. Re-import replaces all rows for a dataset (batched; see §8). 
- `id uuid pk`, `datasetId uuid → catalog_datasets` -- `name text not null`, `brand text`, `language text` (`de` \| `fr` \| `it` \| `en`, metadata only — no name translation) +- `name text not null` (the **German** name for CH retail sources — see locale note), `brand text`, `language text` (`de`\|`fr`\|`it`\|`en`, metadata only — no name translation) - `servingSize real not null`, `servingUnit` (same enum as `foods`) - 5 core macros (`calories`, `protein`, `carbs`, `fat`, `fiber`) — `real not null` -- All **43 extended nutrients** — `real`, nullable — column names generated from `ALL_NUTRIENTS[].dbColumn` in `src/lib/nutrients.ts` (single source of truth; identical to `foods`) +- The **43 extended nutrients** — `real`, nullable. **Hand-listed identically to `foods`** (`schema.ts` already hand-lists them literally because Drizzle's static type inference requires object literals — generating from `ALL_NUTRIENTS` would lose all column typing). A unit test asserts the `catalog_foods` nutrient column set is exactly `ALL_NUTRIENTS.map(n => n.dbColumn)` and matches `foods`, preventing drift. - `barcode text` (GTIN, nullable) - OFF-quality fields: `nutriScore text`, `novaGroup integer`, `additives text[]`, `ingredientsText text`, `imageUrl text` (source URL only — not rehosted) - Provenance: `sourceUrl text`, `sourceRef text` (retailer product id), `crawledAt timestamptz` +**Locale note:** sources expose de/fr/it names; v1 stores the **German** name in `name` and records `language='de'` for CH retail (German is the primary locale of the user base; matches search expectations like "Gipfeli"). OFF entries keep their dump `product_name` with detected `language`. + Indexes: - `(datasetId)` - `(datasetId, barcode)` — barcode lookup within granted datasets -- **`pg_trgm` GIN index on `name`** — full-catalog `ILIKE` substring search needs it. (Per-user `foods` `ILIKE` is fine at small scale; tens of thousands of shared rows is not.) 
The migration must `CREATE EXTENSION IF NOT EXISTS pg_trgm`. +- **`pg_trgm` GIN on `name`** (`USING gin (name gin_trgm_ops)`), plus `CREATE EXTENSION IF NOT EXISTS pg_trgm;` — both **hand-appended** to `0037`. The deployment is a self-hosted standard Postgres Debian container, so `pg_trgm` is available in `contrib` and the app's DB role can create it. First-migration verification is a §13 spike. `catalog_foods` is never mutated by the app and never logged directly. ### 5.3 `catalog_access` -M:N grant of datasets to users. +M:N grant of datasets to users (explicit user requirement; kept over the reviewer's YAGNI suggestion so the maintainer can grant specific datasets to specific people). - `userId uuid → users` (`ON DELETE CASCADE`) - `datasetId uuid → catalog_datasets` (`ON DELETE CASCADE`) @@ -108,16 +123,16 @@ A user sees catalog rows only from datasets they have a grant for. Grants surviv ## 6. Normalized dataset file format -JSONL (streamable for tens of thousands of rows). Defined once as a Zod schema in `src/lib/server/catalog/dataset-schema.ts`, **shared by crawler output and import validation** (single source of truth for the contract). +JSONL (streamable for tens of thousands of rows). Defined once as a Zod schema in `src/lib/server/catalog/dataset-schema.ts`, **shared by crawler output and import validation**. -- Line 1: header record `{ "_dataset": { "key", "name", "source", "version", "snapshotAt" } }` -- Lines 2..n: one product per line, fields = `catalog_foods` columns minus ids/timestamps, nutrients per 100 g (same normalization the existing OFF mapper uses). +- Line 1: header record `{ "_dataset": { "key", "name", "source", "priority", "version", "snapshotAt" } }` +- Lines 2..n: one product per line, fields = `catalog_foods` columns minus ids/timestamps, nutrients per 100 g. -The importer rejects the whole file if any line fails schema validation (fail-closed; a partial bad dataset is worse than none). 
+The importer is **fail-closed**: any line failing schema validation aborts the whole import (a partial bad dataset is worse than none). Rows missing required core macros are dropped at normalization time with a logged count (not an abort). ## 7. Crawler pipeline -Location: `crawler/` (top-level; not part of the SvelteKit app or its build). TypeScript run with Bun. Output: `data/catalog/-.jsonl`. +Location: `crawler/` (top-level; **not** part of the SvelteKit app, its build, or `bun run security` scope — see §10). TypeScript run with Bun. Output: `data/catalog/-.jsonl`. ### 7.1 Adapter contract @@ -128,72 +143,96 @@ interface SourceAdapter { } ``` -A shared normalizer maps `RawProduct → DatasetProduct` (the Zod schema), reusing `ALL_NUTRIENTS` keys/units and `offConversion` from `src/lib/nutrients.ts`. Each adapter is independently testable against recorded fixtures. +A shared normalizer maps `RawProduct → DatasetProduct` (the Zod schema), reusing `ALL_NUTRIENTS` keys/units and `offConversion` from `src/lib/nutrients.ts`. Each adapter is independently testable against recorded fixtures (no live network in CI). ### 7.2 Source adapters -- **Migros** — depend on the `migros-api-wrapper` npm package (TS, maintained: `onesearch` v5 search → `product-display /v2/product-detail`, guest-token handling). Page through the food category tree; map nutrition table → normalized nutrients. Inherits upstream maintenance against Migros's changing endpoints. -- **Open Food Facts** — download the ODbL bulk dump (JSONL), stream-parse, filter to Swiss/relevant + food categories, map via the existing OFF logic in `src/lib/server/openfoodfacts.ts` (refactor its mapper into a reusable function shared by the live API route and the crawler). No live API hammering. -- **Coop** — no public API. Playwright (already a repo dependency): walk the food category tree, prefer the page's **internal XHR JSON** (network response) over DOM parsing; DOM (the "Nutrition information" tab) is the fallback. 
Most fragile → built last, isolated. +- **Migros (v1)** — depend on the `migros-api-wrapper` npm package (TS, MIT, maintained: `onesearch` v5 search → `product-display /v2/product-detail`, guest-token handling). Page through the food category tree; map the nutrition table → normalized nutrients. Whether it exposes all 43 nutrients or only core macros is a §13 spike. Pin the version exactly. +- **Open Food Facts (v1)** — download the ODbL **bulk dump**, stream-parse, filter to Swiss/relevant + food categories. **Reuse scope (corrected):** extract only the pure nutrient core from `src/lib/server/openfoodfacts.ts` — the `extractNutrient` + `ALL_NUTRIENTS` loop (dependency-free apart from `zod` + `$lib/nutrients`) — into a shared helper used by both the live API route and the crawler. The existing `mapSearchProduct` is **unexported and coupled to the live v2 API field shape**; the bulk dump has a different structure and needs its own product-shape adapter. Do not "reuse the mapper" wholesale. +- **Coop (v1.1, deferred)** — no public API; Playwright (already a repo dependency): walk the food category tree, prefer the page's internal XHR JSON over DOM. Most fragile → separate follow-up effort, not v1. ### 7.3 Crawl guardrails (`crawler/lib/`) -Polite fixed-delay + concurrency cap per host; exponential-backoff retry; on-disk response cache keyed by request (makes re-runs cheap and resumable); checkpoint of last completed page/category; descriptive `User-Agent`; structured progress logging. ~A few hundred LOC shared across adapters. +Polite fixed-delay + per-host concurrency cap; exponential-backoff retry; on-disk response cache keyed by request (cheap, resumable re-runs); checkpoint of last completed page/category; descriptive `User-Agent`; structured progress logging. + +## 8. CLI & operations -## 8. Admin CLIs +`bun run` scripts under `scripts/` (consistent with existing `scripts/*.ts`, which already use `DATABASE_URL` + `postgres()` + `drizzle()`). 
-`bun run` scripts under `scripts/` (consistent with existing `scripts/*.ts`). Connect via prod `DATABASE_URL` env. No new HTTP surface. +**Operational model (decided): run on the server host.** Production Postgres is Docker-internal. Workflow: -- `catalog:import [--replace]` — validate every line with the dataset Zod schema; upsert `catalog_datasets` by `key`; bulk-replace that dataset's `catalog_foods` in one transaction; set `productCount`/`snapshotAt`. +1. Build the dataset locally (crawler). +2. `scp` the `*.jsonl` file to the server host. +3. SSH into the host; run the CLI inside the app container against the Docker-internal `DATABASE_URL` (e.g. `docker compose exec app bun run catalog:import /data/.jsonl`, dataset file mounted/copied in). Exact container invocation confirmed during planning against the `docker-server` compose setup. + +CLIs: + +- `catalog:import ` — validate every line with the dataset Zod schema (fail-closed); upsert `catalog_datasets` by the header `key`; **fully replace** that dataset's `catalog_foods` (a dataset import is always a complete snapshot of that dataset — there is no partial/append mode; other datasets are untouched). **Bulk strategy:** within a transaction, `DELETE` the dataset's rows, then **chunked multi-row INSERT (e.g. 1–5k rows/statement)**; drop the `pg_trgm` GIN index before the bulk load and recreate it after (avoids per-row GIN maintenance at tens-of-thousands scale). Set `productCount`/`snapshotAt`. - `catalog:grant ` / `catalog:revoke ` — manage `catalog_access`. -- `catalog:list` — datasets, product counts, and who has access. +- `catalog:list` — datasets, product counts, who has access. + +No new HTTP surface. Admin identity = SSH + DB access on the host (no in-app role). ## 9. App integration -Mirrors the existing Open Food Facts pattern; minimal, targeted changes. +The app is an offline-first PWA (see §4 invariant). 
Catalog access is therefore **online, on-demand, server-side**, mirroring the existing OFF online path — **not** an extension of `listFoods()` and **not** part of the Dexie-synced `/api/foods` payload. The user-facing "pick a result and log it" is **net-new** (the current OFF flow is barcode→navigate to `/foods`→prefill the create form→manual save; there is no pick-to-log today — so this is built, not inherited). -- **Search** — extend `listFoods()` in `src/lib/server/foods.ts`: also query `catalog_foods` restricted to the requesting user's granted datasets (`catalog_access`), tag results `source: ''`, dedup against personal foods by barcode, order personal-first. Refactor the search path into a small union helper rather than rewriting it. -- **Barcode** — extend `findFoodByBarcode()`: lookup order personal → granted catalog → OFF fallback (insert catalog ahead of the existing OFF step in `src/routes/api/openfoodfacts/[barcode]/+server.ts` chain). -- **Copy-on-use** — selecting/logging a catalog result inserts a personal `foods` row via the existing create path (same behavior as picking an OFF result today). Catalog rows are never logged or mutated directly. -- **MCP** — `search_foods` / `find_food_by_barcode` transparently include granted catalog; access enforced server-side by `userId`. No new MCP tools. -- **UI** — catalog results get a small source badge (e.g. "Migros") in the existing food picker (`FoodPicker`). No new screens. +New module `src/lib/server/catalog/` + endpoints: + +- `GET /api/catalog/search?q=&limit=` — trigram/`ILIKE` search over `catalog_foods` restricted to the requesting user's granted datasets; returns rows tagged with `datasetKey`/`source`. Own limit/offset; **does not** alter `listFoods()` `{ items, total }` semantics. +- `GET /api/catalog/barcode/:code` — barcode lookup across the user's granted datasets. 
Cross-dataset tie-break: if multiple granted datasets contain the barcode, return the row from the **lowest `catalog_datasets.priority`** (e.g. Migros < OFF). +- `POST /api/catalog/:id/save` — the explicit copy-on-use action: instantiate a personal `foods` row from the catalog row via the existing `createFood`/insert path (server-side, `userId`-gated by the row's dataset grant), return the new food. It then syncs to Dexie normally and is logged through the existing food path. Catalog rows are never mutated or logged directly. + +UI (minimal): + +- `FoodPicker` search tab: local Dexie filter unchanged; when the user has any catalog grant, additionally issue a **debounced online** `GET /api/catalog/search` and render those results below personal results with a small **source badge** (e.g. "Migros"). Selecting a catalog result calls `POST /api/catalog/:id/save`, then proceeds to log the returned personal food. Client-side merge is personal-first; no server union, so the foods hot path is untouched. +- Barcode scan miss path: query `GET /api/catalog/barcode/:code` **before** the existing OFF fallback; on hit, offer save+log. + +MCP: `search_foods` / `find_food_by_barcode` handlers additionally consult the catalog (server-side, `userId`-gated via `catalog_access`); results carry `source`. Logging a catalog result through MCP performs the same server-side instantiate first. No new MCP tools; MCP stays web-only. ## 10. Legal & operational guardrails -- `.gitignore` `data/catalog/`; a `repo: local` prek hook rejects committing `*.jsonl` under it (belt-and-braces against accidental data commit). -- `crawler/README.md` states: data is for this app's private, authenticated user base only; not for redistribution; crawling is rate-limited and cached. -- Provenance (`source`, `sourceUrl`) stored per row and surfaced via the badge for attribution. 
-- Migros access uses the same guest-token endpoints the website itself uses (via `migros-api-wrapper`); Coop uses a real browser (Playwright). Polite throttling on both. +- `.gitignore` `data/catalog/`; a `repo: local` prek hook (matching the project's all-`repo: local` `.pre-commit-config.yaml`) rejects committing `*.jsonl` under it. +- The legal posture is **private use by this app's authenticated user base + no redistribution** (no data committed; access-gated). The crawler hitting Migros's guest-token endpoints is _not_ presented as sanctioned — the defense is private use and non-redistribution, stated in `crawler/README.md`. Polite throttling + caching. +- The crawler lives in `crawler/` outside `src/` and outside the app build; nothing in `src/` imports it. Confirm `bun run security` / `bun audit` scoping does not fail CI on crawler-only deps (e.g. `migros-api-wrapper`'s axios/cheerio/pino transitive tree); pin the dependency exactly. If the security suite scans the whole tree, add the documented-exception pattern used for the existing accepted `minimatch` exception. +- Provenance (`source`, `sourceUrl`) stored per row and surfaced via the badge. ## 11. Build sequence (phased plan) -1. **Schema + dataset format + import/grant CLIs** — proven end-to-end with a tiny hand-written JSONL fixture (no crawler yet). Migration `0037`, `pg_trgm`. -2. **App integration** — search/barcode union, copy-on-use, UI badge, MCP, tests. Full user-facing value against fixture data. -3. **OFF adapter** — easiest (static dump); refactor the OFF mapper for reuse. -4. **Migros adapter** — spike `migros-api-wrapper` endpoints/coverage, then adapter. -5. **Coop adapter** — Playwright, internal-XHR-first, isolated, last. +**v1:** + +1. **Schema + dataset format + CLIs** — migration `0037` (with hand-appended `pg_trgm`), drift-guard test, `catalog:import/grant/revoke/list`; proven end-to-end with a tiny hand-written JSONL fixture (no crawler yet). +2. 
**App integration** — `src/lib/server/catalog/` + the three endpoints, `FoodPicker`/barcode UI, MCP, and tests; full user-facing value against fixture data (no crawler yet). +3. **OFF adapter** — bulk-dump download/filter + shared nutrient-core extraction. +4. **Migros adapter** — spike `migros-api-wrapper` coverage, then adapter. + +**v1.1:** 5. **Coop adapter** — Playwright, internal-XHR-first, isolated. -Phases 1–2 deliver the entire capability against fixtures; 3–5 are independent source plug-ins that can land in any order. +Phases 1–2 deliver the entire UX against fixtures; 3–4 are independent source plug-ins landing in any order. ## 12. Testing strategy -- Dataset Zod schema: unit tests (valid/invalid lines, header record). -- Normalizer: per-source recorded-fixture → expected `DatasetProduct` tests (covers nutrient unit conversions). -- Import CLI: integration test against the test DB (`vitest.integration.config.ts` / Testcontainers) — upsert-by-key, replace semantics, grant survival across re-import, fail-closed on a bad line. -- Search/barcode union: integration tests proving access gating (granted vs non-granted user) and personal-first dedup. -- Copy-on-use: a catalog pick creates a personal `foods` row and never mutates `catalog_foods`. -- Adapters: fixture-driven unit tests only (no live network in CI). +- Dataset Zod schema: valid/invalid lines, header record. +- Drift guard: `catalog_foods` nutrient columns ≡ `ALL_NUTRIENTS[].dbColumn` ≡ `foods` nutrient columns. +- Nutrient normalizer: per-source recorded-fixture → expected `DatasetProduct` (unit conversions). +- Import CLI: integration test (Testcontainers, `vitest.integration.config.ts`) — upsert-by-key, batched replace, GIN drop/recreate, grant survival across re-import, fail-closed on a bad line. 
+- Endpoints: access gating (granted vs non-granted user) on `/api/catalog/search` & `/barcode`; cross-dataset `priority` tie-break; `POST /api/catalog/:id/save` creates a personal food and never mutates `catalog_foods`. +- **Offline invariant:** `/api/foods` never returns catalog rows; the Dexie mirror is unaffected by catalog presence. +- Adapters: fixture-driven unit tests only. -## 13. Open implementation spikes (resolve during planning/Phase 4–5) +## 13. Open implementation spikes (resolve during planning) -- Exact Migros food-category traversal + nutrition field coverage via `migros-api-wrapper` (does it expose all needed nutrients, or only core macros?). -- OFF bulk-dump format/size and the Swiss/food filter predicate. -- Coop internal XHR availability vs. DOM-only; per-100g normalization quirks. +- Migros food-category traversal + nutrition field coverage via `migros-api-wrapper` (all 43 nutrients, or only core macros?). +- OFF **bulk-dump** field shape/size and the Swiss/food filter predicate (distinct from the live v2 API shape). +- Exact server-host container invocation for the CLI against the `docker-server` compose setup. +- First-migration verification that `CREATE EXTENSION pg_trgm` succeeds as the app's DB role in the deployment container (expected: yes — standard Debian Postgres image). ## 14. Risks -- **Retailer endpoint drift** (Migros especially) — mitigated by depending on the maintained `migros-api-wrapper` and keeping adapters fixture-tested and isolated. -- **Coop fragility / anti-bot** — mitigated by real-browser Playwright + caching; accepted that Coop may lag the other sources. -- **Catalog search performance at scale** — mitigated by the `pg_trgm` GIN index; revisit if substring search is still slow (consider FTS). -- **Accidental data commit** — mitigated by `.gitignore` + prek guard hook. 
-- **Nutrient coverage gaps from sources** — extended nutrients are nullable; core macros required; rows missing required macros are dropped at normalization with a logged count. +- **Migration `0037` is hand-completed** (extension + raw GIN). Mitigation: it's a fresh never-applied file (journal-safe); documented "do not regenerate"; first-apply verified in the §13 spike + dev-start check per the project's migration-safety rule. +- **Catalog leaking into the offline mirror** would bloat every user's IndexedDB and the user-switch wipe. Mitigation: catalog is online-only by construction (separate endpoints, never in `/api/foods`) + an explicit regression test (§12). +- **Retailer endpoint drift** (Migros). Mitigation: maintained `migros-api-wrapper`, pinned; adapters fixture-tested and isolated; Migros is one isolated phase. +- **`migros-api-wrapper` transitive dependency surface** (axios/cheerio/pino) + single-maintainer auto-publish. Mitigation: exact pin; crawler isolated from app build & security scope; reviewed on bump. +- **Bulk import performance** at tens of thousands of rows. Mitigation: chunked inserts + GIN drop/recreate in a transaction (§8). +- **Nutrient coverage gaps from sources.** Extended nutrients nullable; core macros required; rows missing required macros dropped with a logged count. +- **Coop fragility** — removed from v1 by deferring to v1.1. From 7cfa2386845ce8fb0617f93aaf512d152f7aaeaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Thu, 21 May 2026 20:10:36 +0200 Subject: [PATCH 03/26] docs: add base food catalog foundation + integration plan 13-task plan covering Phase 1 (schema, dataset JSONL Zod, import/grant CLIs, pg_trgm GIN, drift guard, fixture) and Phase 2 (online search/barcode/save endpoints, picker source-badge, AddFoodModal pick-to-log, DayLog barcode flow). Proves end-to-end against a JSONL fixture; OFF/Migros adapters are follow-on plans, Coop is v1.1. 
--- ...food-catalog-foundation-and-integration.md | 1859 +++++++++++++++++ 1 file changed, 1859 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md diff --git a/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md b/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md new file mode 100644 index 00000000..4bc6fa50 --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md @@ -0,0 +1,1859 @@ +# Base Food Catalog — Foundation & App Integration Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Ship the access-gated base food catalog end-to-end (DB tables, dataset file format, admin CLIs, online search/barcode/save endpoints, picker + barcode UI) proven against JSONL fixtures — no crawler yet. + +**Architecture:** Three new read-only tables (`catalog_datasets`, `catalog_foods`, `catalog_access`, M:N grants) populated by a `bun` CLI run on the server host from a Zod-validated JSONL file. The catalog is reached only through new online endpoints (`/api/catalog/*`), never synced into Dexie. Picking a catalog result calls a server-side instantiate endpoint that creates a normal personal `foods` row via the existing `createFood` path, which then syncs to Dexie and is logged normally. + +**Tech Stack:** SvelteKit 2 / Svelte 5 runes, Bun, Drizzle ORM + Postgres (postgres-js), Zod, Vitest (+ Testcontainers integration), Paraglide i18n, zod-openapi. + +**Spec:** `docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md` (this plan covers v1 Phases 1–2 only; OFF/Migros adapters = follow-on plans; Coop = v1.1). 
+ +--- + +## File Structure + +**Phase 1 — Foundation** + +- Modify `src/lib/server/schema.ts` — add `catalogDatasets`, `catalogFoods`, `catalogAccess`. +- Generate+hand-edit `drizzle/0037_*.sql` — `CREATE EXTENSION pg_trgm` + GIN index appended. +- Create `src/lib/server/catalog/dataset-schema.ts` — Zod schema for the JSONL contract (header + product), shared by importer and (future) crawler. +- Create `src/lib/server/catalog/dataset-schema.test.ts` — unit tests. +- Create `scripts/catalog.ts` — one CLI with `import` / `grant` / `revoke` / `list` subcommands (shared DB-connect boilerplate, DRY). +- Modify `package.json` — `catalog:*` scripts. +- Modify `.gitignore` — ignore `data/catalog/`. +- Modify `.pre-commit-config.yaml` — local hook rejecting committed `data/catalog/*.jsonl`. +- Create `tests/integration-db/catalog-schema.test.ts` — drift guard + extension/index existence. +- Create `tests/integration-db/catalog-import.test.ts` — import/grant/revoke/list behavior. +- Create `tests/fixtures/catalog/mini.jsonl` — tiny valid dataset fixture. + +**Phase 2 — App integration** + +- Create `src/lib/server/nutrient-extract.ts` — pure nutrient-extraction helper (extracted from `openfoodfacts.ts`, Gap 8). +- Modify `src/lib/server/openfoodfacts.ts` — use the shared helper (behavior-preserving). +- Create `src/lib/server/catalog/queries.ts` — `catalogSearch`, `catalogByBarcode`, `instantiateCatalogFood`. +- Create `src/routes/api/catalog/search/+server.ts`, `src/routes/api/catalog/barcode/[code]/+server.ts`, `src/routes/api/catalog/[id]/save/+server.ts`. +- Modify `src/lib/server/openapi.ts` — declare the 3 catalog routes; regenerate `docs/openapi.json` + `src/lib/api/generated/schema.d.ts`. +- Modify `messages/en.json`, `messages/de.json` — catalog UI strings. +- Modify `src/lib/components/entries/FoodPicker.svelte` — online catalog search + source badge. +- Modify `src/lib/components/entries/AddFoodModal.svelte` — catalog pick → save → log. 
+- Modify `src/lib/services/food-service.svelte.ts` — `saveFromCatalog`. +- Modify `src/lib/components/entries/DayLog.svelte` — barcode miss → catalog before OFF. +- Create `tests/integration-db/catalog-endpoints.test.ts` — access gating, priority tie-break, instantiate, `/api/foods` isolation. +- Create `src/lib/server/nutrient-extract.test.ts` — unit test. + +--- + +# PHASE 1 — Catalog Foundation + +### Task 1: Catalog schema + migration (with hand-appended pg_trgm) + drift-guard test + +**Files:** +- Test: `tests/integration-db/catalog-schema.test.ts` (create) +- Modify: `src/lib/server/schema.ts` (append after the `foods` table block) +- Generate: `drizzle/0037_*.sql` + `drizzle/meta/_journal.json` (drizzle-kit), then hand-edit the `.sql` + +- [ ] **Step 1: Write the failing drift-guard + extension integration test** + +Create `tests/integration-db/catalog-schema.test.ts`: + +```typescript +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { sql } from 'drizzle-orm'; +import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers'; +import { ALL_NUTRIENTS } from '$lib/nutrients'; + +const DB_NAME = 'test_catalog_schema'; +let dbUrl: string; + +beforeAll(async () => { + dbUrl = await createTestDatabase(DB_NAME); + await runTestMigrations(dbUrl); +}); + +afterAll(async () => { + await closeTestDB(dbUrl); + await dropTestDatabase(DB_NAME); +}); + +async function columns(db: ReturnType<typeof getTestDB>, table: string): Promise<Set<string>> { + const rows = await db.execute( + sql`SELECT column_name FROM information_schema.columns WHERE table_name = ${table}` + ); + return new Set((rows as unknown as { column_name: string }[]).map((r) => r.column_name)); +} + +describe('catalog schema', () => { + it('catalog_foods nutrient columns match foods and ALL_NUTRIENTS exactly', async () => { + const db = getTestDB(dbUrl); + const catalogCols = await columns(db, 'catalog_foods'); + const foodCols = await columns(db, 'foods'); + for
(const n of ALL_NUTRIENTS) { + expect(catalogCols.has(n.dbColumn), `catalog_foods missing ${n.dbColumn}`).toBe(true); + expect(foodCols.has(n.dbColumn), `foods missing ${n.dbColumn}`).toBe(true); + } + }); + + it('pg_trgm extension and GIN name index exist', async () => { + const db = getTestDB(dbUrl); + const ext = await db.execute(sql`SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm'`); + expect((ext as unknown as unknown[]).length).toBe(1); + const idx = await db.execute( + sql`SELECT 1 FROM pg_indexes WHERE indexname = 'idx_catalog_foods_name_trgm'` + ); + expect((idx as unknown as unknown[]).length).toBe(1); + }); + + it('catalog_datasets and catalog_access have the expected key columns', async () => { + const db = getTestDB(dbUrl); + const ds = await columns(db, 'catalog_datasets'); + expect(ds.has('key')).toBe(true); + expect(ds.has('priority')).toBe(true); + const acc = await columns(db, 'catalog_access'); + expect(acc.has('user_id')).toBe(true); + expect(acc.has('dataset_id')).toBe(true); + }); +}); +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-schema.test.ts` +Expected: FAIL — migration has no `catalog_foods` table (`relation "catalog_foods" does not exist` or column assertions fail).
+ +- [ ] **Step 3: Add the three tables to `src/lib/server/schema.ts`** + +Append immediately after the closing `);` of the `foods` table definition (the import block at the top already includes `pgTable, uuid, text, timestamp, real, boolean, integer, index, uniqueIndex, primaryKey, check` and `sql` — no import changes needed): + +```typescript +export const catalogDatasets = pgTable('catalog_datasets', { + id: uuid('id').primaryKey().defaultRandom(), + key: text('key').notNull().unique(), + name: text('name').notNull(), + source: text('source').notNull(), + priority: integer('priority').notNull().default(100), + description: text('description'), + productCount: integer('product_count'), + version: text('version'), + snapshotAt: timestamp('snapshot_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow(), + updatedAt: timestamp('updated_at', { withTimezone: true }).defaultNow() +}); + +export const catalogFoods = pgTable( + 'catalog_foods', + { + id: uuid('id').primaryKey().defaultRandom(), + datasetId: uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + name: text('name').notNull(), + brand: text('brand'), + language: text('language'), + servingSize: real('serving_size').notNull(), + servingUnit: servingUnitEnum('serving_unit').notNull(), + calories: real('calories').notNull(), + protein: real('protein').notNull(), + carbs: real('carbs').notNull(), + fat: real('fat').notNull(), + fiber: real('fiber').notNull(), + // Advanced nutrients — fat breakdown + saturatedFat: real('saturated_fat'), + monounsaturatedFat: real('monounsaturated_fat'), + polyunsaturatedFat: real('polyunsaturated_fat'), + transFat: real('trans_fat'), + cholesterol: real('cholesterol'), + omega3: real('omega3'), + omega6: real('omega6'), + // Sugar & carb details + sugar: real('sugar'), + addedSugars: real('added_sugars'), + sugarAlcohols: real('sugar_alcohols'), + starch: real('starch'), + // Minerals 
+ sodium: real('sodium'), + potassium: real('potassium'), + calcium: real('calcium'), + iron: real('iron'), + magnesium: real('magnesium'), + phosphorus: real('phosphorus'), + zinc: real('zinc'), + copper: real('copper'), + manganese: real('manganese'), + selenium: real('selenium'), + iodine: real('iodine'), + fluoride: real('fluoride'), + chromium: real('chromium'), + molybdenum: real('molybdenum'), + chloride: real('chloride'), + // Vitamins + vitaminA: real('vitamin_a'), + vitaminC: real('vitamin_c'), + vitaminD: real('vitamin_d'), + vitaminE: real('vitamin_e'), + vitaminK: real('vitamin_k'), + vitaminB1: real('vitamin_b1'), + vitaminB2: real('vitamin_b2'), + vitaminB3: real('vitamin_b3'), + vitaminB5: real('vitamin_b5'), + vitaminB6: real('vitamin_b6'), + vitaminB7: real('vitamin_b7'), + vitaminB9: real('vitamin_b9'), + vitaminB12: real('vitamin_b12'), + // Other + caffeine: real('caffeine'), + alcohol: real('alcohol'), + water: real('water'), + salt: real('salt'), + barcode: text('barcode'), + nutriScore: text('nutri_score'), + novaGroup: integer('nova_group'), + additives: text('additives').array(), + ingredientsText: text('ingredients_text'), + imageUrl: text('image_url'), + sourceUrl: text('source_url'), + sourceRef: text('source_ref'), + crawledAt: timestamp('crawled_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow() + }, + (table) => [ + index('idx_catalog_foods_dataset').on(table.datasetId), + index('idx_catalog_foods_dataset_barcode').on(table.datasetId, table.barcode), + check('catalog_foods_serving_positive', sql`${table.servingSize} > 0`), + check( + 'catalog_foods_nutrition_nonnegative', + sql`${table.calories} >= 0 AND ${table.protein} >= 0 AND ${table.carbs} >= 0 AND ${table.fat} >= 0 AND ${table.fiber} >= 0` + ) + ] +); + +export const catalogAccess = pgTable( + 'catalog_access', + { + userId: uuid('user_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + datasetId: 
uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + grantedAt: timestamp('granted_at', { withTimezone: true }).defaultNow() + }, + (table) => [primaryKey({ columns: [table.userId, table.datasetId] })] +); +``` + +The `name gin_trgm_ops` index is intentionally NOT declared here — drizzle-kit cannot emit the opclass or `CREATE EXTENSION`; it is hand-appended to the migration (Step 5) and verified by the test. + +- [ ] **Step 4: Generate the migration** + +Run: `bun run db:generate` +Expected: a new file `drizzle/0037_<name>.sql` is created and `drizzle/meta/_journal.json` gains an `idx: 37` entry. Note the exact generated filename. + +- [ ] **Step 5: Hand-append the pg_trgm extension + GIN index to the generated SQL** + +Open the generated `drizzle/0037_<name>.sql`. At the very END of the file, append (the last existing statement already ends with `--> statement-breakpoint` or is the last `CREATE TABLE`; ensure a `--> statement-breakpoint` separates them): + +```sql +--> statement-breakpoint +CREATE EXTENSION IF NOT EXISTS pg_trgm;--> statement-breakpoint +CREATE INDEX "idx_catalog_foods_name_trgm" ON "catalog_foods" USING gin ("name" gin_trgm_ops); +``` + +Do NOT run `bun run db:generate` again after this edit (it would overwrite the manual SQL — this file is now frozen; the journal already records it). + +- [ ] **Step 6: Run the test to verify it passes** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-schema.test.ts` +Expected: PASS (3 tests). Testcontainers `postgres:18` ships `pg_trgm` in contrib. + +- [ ] **Step 7: Verify the dev server starts cleanly (migration safety rule)** + +Run: `timeout 25 bun run dev 2>&1 | head -40` +Expected: server boots, no "Migration failed" output. (Stop it; the migration applied via `runMigrations()`.)
+ +- [ ] **Step 8: Commit** + +```bash +git add src/lib/server/schema.ts drizzle/ tests/integration-db/catalog-schema.test.ts +git commit -m "feat: add catalog schema (datasets/foods/access) + pg_trgm index" +``` + +--- + +### Task 2: Dataset JSONL Zod schema (the crawler↔importer contract) + +**Files:** +- Test: `src/lib/server/catalog/dataset-schema.test.ts` (create) +- Create: `src/lib/server/catalog/dataset-schema.ts` + +- [ ] **Step 1: Write the failing unit test** + +Create `src/lib/server/catalog/dataset-schema.test.ts`: + +```typescript +import { describe, it, expect } from 'vitest'; +import { datasetHeaderSchema, datasetProductSchema } from './dataset-schema'; + +describe('dataset-schema', () => { + it('accepts a valid header record', () => { + const r = datasetHeaderSchema.safeParse({ + _dataset: { + key: 'migros', + name: 'Migros (Switzerland)', + source: 'migros', + priority: 10, + version: '2026.05.18', + snapshotAt: '2026-05-18T00:00:00.000Z' + } + }); + expect(r.success).toBe(true); + }); + + it('accepts a minimal valid product line', () => { + const r = datasetProductSchema.safeParse({ + name: 'Zweifel Paprika Chips', + servingSize: 100, + servingUnit: 'g', + calories: 515, + protein: 5.8, + carbs: 53, + fat: 30, + fiber: 5.6 + }); + expect(r.success).toBe(true); + }); + + it('accepts known extended nutrients and OFF quality fields', () => { + const r = datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: 1, + saturatedFat: 5.1, + salt: 1.3, + barcode: '7610095131003', + language: 'de', + nutriScore: 'd', + novaGroup: 4, + additives: ['en:e330'], + sourceUrl: 'https://www.migros.ch/de/product/123', + sourceRef: '123' + }); + expect(r.success).toBe(true); + }); + + it('rejects a product missing required core macros', () => { + const r = datasetProductSchema.safeParse({ name: 'X', servingSize: 100, servingUnit: 'g' }); + expect(r.success).toBe(false); + }); + 
+ it('rejects negative nutrients and bad nutriScore', () => { + expect( + datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: -1, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0 + }).success + ).toBe(false); + expect( + datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 0, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0, + nutriScore: 'z' + }).success + ).toBe(false); + }); +}); +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `bun --bun vitest run src/lib/server/catalog/dataset-schema.test.ts` +Expected: FAIL — `Cannot find module './dataset-schema'`. + +- [ ] **Step 3: Implement the schema** + +Create `src/lib/server/catalog/dataset-schema.ts`: + +```typescript +import { z } from 'zod'; +import { servingUnitSchema } from '$lib/units'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; + +const optNutrient = z.coerce.number().nonnegative().optional().nullable(); +const nutrientFields = Object.fromEntries(ALL_NUTRIENT_KEYS.map((k) => [k, optNutrient])); + +export const datasetHeaderSchema = z.object({ + _dataset: z.object({ + key: z + .string() + .min(1) + .max(64) + .regex(/^[a-z0-9-]+$/), + name: z.string().min(1).max(200), + source: z.enum(['migros', 'off', 'coop']), + priority: z.coerce.number().int().min(0).max(1000), + version: z.string().max(64).optional().nullable(), + snapshotAt: z.string().datetime().optional().nullable() + }) +}); + +export const datasetProductSchema = z.object({ + name: z.string().min(1).max(500), + brand: z.string().max(500).optional().nullable(), + language: z.enum(['de', 'fr', 'it', 'en']).optional().nullable(), + servingSize: z.coerce.number().positive(), + servingUnit: servingUnitSchema, + calories: z.coerce.number().nonnegative(), + protein: z.coerce.number().nonnegative(), + carbs: z.coerce.number().nonnegative(), + fat: z.coerce.number().nonnegative(), + fiber: z.coerce.number().nonnegative(), + ...nutrientFields, + 
barcode: z.string().max(32).optional().nullable(), + nutriScore: z.enum(['a', 'b', 'c', 'd', 'e']).optional().nullable(), + novaGroup: z.coerce.number().int().min(1).max(4).optional().nullable(), + additives: z.array(z.string().max(100)).max(200).optional().nullable(), + ingredientsText: z.string().max(10000).optional().nullable(), + imageUrl: z.string().url().max(2000).optional().nullable(), + sourceUrl: z.string().url().max(2000).optional().nullable(), + sourceRef: z.string().max(200).optional().nullable(), + crawledAt: z.string().datetime().optional().nullable() +}); + +export type DatasetHeader = z.infer<typeof datasetHeaderSchema>; +export type DatasetProduct = z.infer<typeof datasetProductSchema>; +``` + +- [ ] **Step 4: Run the test to verify it passes** + +Run: `bun --bun vitest run src/lib/server/catalog/dataset-schema.test.ts` +Expected: PASS (5 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/lib/server/catalog/dataset-schema.ts src/lib/server/catalog/dataset-schema.test.ts +git commit -m "feat: add catalog dataset JSONL Zod schema" +``` + +--- + +### Task 3: Catalog CLI — `import` subcommand + fixture + integration test + +**Files:** +- Create: `tests/fixtures/catalog/mini.jsonl` +- Test: `tests/integration-db/catalog-import.test.ts` (create) +- Create: `scripts/catalog.ts` +- Modify: `package.json` (scripts block) + +- [ ] **Step 1: Create the fixture dataset** + +Create `tests/fixtures/catalog/mini.jsonl` (exactly these 3 lines; line 1 is the header): + +``` +{"_dataset":{"key":"testset","name":"Test Set","source":"migros","priority":10,"version":"t1","snapshotAt":"2026-05-18T00:00:00.000Z"}} +{"name":"Zweifel Paprika Chips","brand":"Zweifel","language":"de","servingSize":100,"servingUnit":"g","calories":515,"protein":5.8,"carbs":53,"fat":30,"fiber":5.6,"saturatedFat":1.8,"salt":1.3,"barcode":"7610095131003","sourceUrl":"https://www.migros.ch/de/product/1","sourceRef":"1"} +{"name":"Coop Naturaplan Bio
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { eq } from 'drizzle-orm';
import { $ } from 'bun';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { mkdtemp, rm } from 'node:fs/promises';
import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers';
import { catalogDatasets, catalogFoods } from '$lib/server/schema';

const DB_NAME = 'test_catalog_import';
let dbUrl: string;
const FIXTURE = join(process.cwd(), 'tests/fixtures/catalog/mini.jsonl');

beforeAll(async () => {
	dbUrl = await createTestDatabase(DB_NAME);
	await runTestMigrations(dbUrl);
});
afterAll(async () => {
	await closeTestDB(dbUrl);
	await dropTestDatabase(DB_NAME);
});

describe('catalog:import', () => {
	it('imports a dataset, upserts by key, replaces rows on re-import', async () => {
		await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl });
		const db = getTestDB(dbUrl);
		const ds = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') });
		expect(ds).toBeDefined();
		expect(ds!.productCount).toBe(2);
		const firstId = ds!.id;
		const rows = await db.select().from(catalogFoods).where(eq(catalogFoods.datasetId, firstId));
		expect(rows.length).toBe(2);
		expect(rows.find((r) => r.barcode === '7610095131003')!.name).toBe('Zweifel Paprika Chips');

		// Re-import: same key reuses the dataset row (id stable), rows replaced
		await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl });
		const ds2 = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') });
		expect(ds2!.id).toBe(firstId);
		const rows2 = await db.select().from(catalogFoods).where(eq(catalogFoods.datasetId, firstId));
		expect(rows2.length).toBe(2);
	});

	it('fails closed on an invalid line and aborts the whole import', async () => {
		// The malformed dataset lives in a throw-away temp directory: writing it
		// next to the committed fixtures would leave an untracked bad.jsonl in the
		// working tree after every test run.
		const scratchDir = await mkdtemp(join(tmpdir(), 'catalog-import-'));
		const bad = join(scratchDir, 'bad.jsonl');
		try {
			await Bun.write(
				bad,
				'{"_dataset":{"key":"badset","name":"Bad","source":"migros","priority":1}}\n{"name":"NoMacros","servingSize":100,"servingUnit":"g"}\n'
			);
			let failed = false;
			try {
				await $`bun run scripts/catalog.ts import ${bad}`
					.env({ ...process.env, DATABASE_URL: dbUrl })
					.quiet();
			} catch {
				// Bun's shell throws on a non-zero exit status.
				failed = true;
			}
			expect(failed).toBe(true);
			// The header row must not survive: validation happens before any write.
			const db = getTestDB(dbUrl);
			const ds = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'badset') });
			expect(ds).toBeUndefined();
		} finally {
			await rm(scratchDir, { recursive: true, force: true });
		}
	});
});
#!/usr/bin/env bun
import { drizzle } from 'drizzle-orm/postgres-js';
import { eq, sql } from 'drizzle-orm';
import postgres from 'postgres';
import { catalogDatasets, catalogFoods, catalogAccess, users } from '../src/lib/server/schema';
import { datasetHeaderSchema, datasetProductSchema } from '../src/lib/server/catalog/dataset-schema';
import { ALL_NUTRIENT_KEYS } from '../src/lib/nutrients';

const databaseUrl = process.env.DATABASE_URL;
if (!databaseUrl) {
	console.error('DATABASE_URL environment variable is required');
	process.exit(1);
}
const client = postgres(databaseUrl, { max: 1 });
const db = drizzle(client, { schema: { catalogDatasets, catalogFoods, catalogAccess, users } });

const TRGM_INDEX = 'idx_catalog_foods_name_trgm';
/** Upper bound on rows per INSERT, regardless of row width. */
const MAX_ROWS_PER_INSERT = 2000;
/**
 * A single PostgreSQL statement carries at most 65,535 bind parameters
 * (16-bit count in the Bind message); stay one below to be safe.
 */
const MAX_BIND_PARAMS = 65534;

/** Projects the extended-nutrient columns out of a validated product, defaulting to null. */
function pickNutrientCols(p: Record<string, unknown>) {
	return Object.fromEntries(ALL_NUTRIENT_KEYS.map((k) => [k, (p[k] as number | null | undefined) ?? null]));
}

/** Parses one JSONL line, attaching the 1-based file line number to syntax errors. */
function parseJsonLine(content: string, lineNo: number): unknown {
	try {
		return JSON.parse(content);
	} catch (e) {
		const reason = e instanceof Error ? e.message : String(e);
		throw new Error(`Invalid JSON at line ${lineNo}: ${reason}`);
	}
}

/**
 * Imports a dataset JSONL file: line 1 is the `_dataset` header, every other
 * non-blank line is a product.
 *
 * The whole file is validated before the database is touched, and all writes
 * happen in one transaction, so an invalid line aborts the import without
 * leaving a partial dataset behind. Re-importing an existing key keeps the
 * dataset row (and its id) and replaces its products.
 *
 * @param file Path to the JSONL dataset.
 * @throws Error when the file is empty or any line fails JSON/schema validation.
 */
async function importDataset(file: string) {
	const text = await Bun.file(file).text();
	// Keep the real file line number with each line so error messages stay
	// accurate even when blank lines are skipped.
	const lines = text
		.split('\n')
		.map((content, index) => ({ content, lineNo: index + 1 }))
		.filter((l) => l.content.trim().length > 0);
	const [headerLine, ...productLines] = lines;
	if (!headerLine) throw new Error('Empty dataset file');

	const headerResult = datasetHeaderSchema.safeParse(parseJsonLine(headerLine.content, headerLine.lineNo));
	if (!headerResult.success) {
		throw new Error(`Invalid dataset header at line ${headerLine.lineNo}: ${headerResult.error.message}`);
	}
	const header = headerResult.data._dataset;

	const products = productLines.map(({ content, lineNo }) => {
		const parsed = datasetProductSchema.safeParse(parseJsonLine(content, lineNo));
		if (!parsed.success) {
			throw new Error(`Invalid product at line ${lineNo}: ${parsed.error.message}`);
		}
		return parsed.data;
	});

	await db.transaction(async (tx) => {
		const datasetValues = {
			name: header.name,
			source: header.source,
			priority: header.priority,
			version: header.version ?? null,
			snapshotAt: header.snapshotAt ? new Date(header.snapshotAt) : null,
			productCount: products.length,
			updatedAt: new Date()
		};
		const [ds] = await tx
			.insert(catalogDatasets)
			.values({ key: header.key, ...datasetValues })
			.onConflictDoUpdate({ target: catalogDatasets.key, set: datasetValues })
			.returning();
		if (!ds) throw new Error(`Dataset upsert for "${header.key}" returned no row`);

		// Drop the trigram index for the bulk load; it is rebuilt once at the end.
		await tx.execute(sql`DROP INDEX IF EXISTS ${sql.identifier(TRGM_INDEX)}`);
		await tx.delete(catalogFoods).where(eq(catalogFoods.datasetId, ds.id));

		const toRow = (p: (typeof products)[number]) => ({
			datasetId: ds.id,
			name: p.name,
			brand: p.brand ?? null,
			language: p.language ?? null,
			servingSize: p.servingSize,
			servingUnit: p.servingUnit,
			calories: p.calories,
			protein: p.protein,
			carbs: p.carbs,
			fat: p.fat,
			fiber: p.fiber,
			barcode: p.barcode ?? null,
			nutriScore: p.nutriScore ?? null,
			novaGroup: p.novaGroup ?? null,
			additives: p.additives ?? null,
			ingredientsText: p.ingredientsText ?? null,
			imageUrl: p.imageUrl ?? null,
			sourceUrl: p.sourceUrl ?? null,
			sourceRef: p.sourceRef ?? null,
			crawledAt: p.crawledAt ? new Date(p.crawledAt) : null,
			...pickNutrientCols(p as Record<string, unknown>)
		});

		const firstProduct = products[0];
		if (firstProduct) {
			// Size each batch from the actual row width: rows × columns must stay
			// under the bind-parameter limit, which a fixed 2000-row batch can
			// exceed once the nutrient columns are included.
			const columnsPerRow = Object.keys(toRow(firstProduct)).length;
			const rowsPerInsert = Math.max(
				1,
				Math.min(MAX_ROWS_PER_INSERT, Math.floor(MAX_BIND_PARAMS / columnsPerRow))
			);
			for (let i = 0; i < products.length; i += rowsPerInsert) {
				await tx.insert(catalogFoods).values(products.slice(i, i + rowsPerInsert).map(toRow));
			}
		}

		// A column object interpolated into a raw `sql` template renders
		// table-qualified ("table"."column"), which is not valid inside a
		// CREATE INDEX column list — reference the bare column name instead.
		await tx.execute(
			sql`CREATE INDEX ${sql.identifier(TRGM_INDEX)} ON ${catalogFoods} USING gin (${sql.identifier(catalogFoods.name.name)} gin_trgm_ops)`
		);
	});

	console.log(`Imported ${products.length} products into dataset "${header.key}"`);
}
// Exercises the grant / revoke / list subcommands end-to-end by shelling out to
// the CLI against the test database and inspecting `catalogAccess` directly.
describe('catalog:grant / revoke / list', () => {
	it('grants and revokes dataset access by user email', async () => {
		const db = getTestDB(dbUrl);
		// Make sure the 'testset' dataset exists before granting access to it.
		await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl });
		const [u] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-grant-1', email: 'fam@example.com' })
			.returning();
		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;

		// grant: exactly one (user, dataset) row must exist afterwards.
		await $`bun run scripts/catalog.ts grant fam@example.com testset`.env({ ...process.env, DATABASE_URL: dbUrl });
		let grants = await db
			.select()
			.from(catalogAccess)
			.where(and(eq(catalogAccess.userId, u.id), eq(catalogAccess.datasetId, ds.id)));
		expect(grants.length).toBe(1);

		// revoke: the same row must be gone.
		await $`bun run scripts/catalog.ts revoke fam@example.com testset`.env({ ...process.env, DATABASE_URL: dbUrl });
		grants = await db
			.select()
			.from(catalogAccess)
			.where(and(eq(catalogAccess.userId, u.id), eq(catalogAccess.datasetId, ds.id)));
		expect(grants.length).toBe(0);
	});

	// Smoke test only: asserts the exit status, not the printed listing.
	it('list exits 0', async () => {
		const r = await $`bun run scripts/catalog.ts list`
			.env({ ...process.env, DATABASE_URL: dbUrl })
			.quiet();
		expect(r.exitCode).toBe(0);
	});
});
/** Looks up a user id by email. @throws Error when no such user exists. */
async function resolveUserId(email: string): Promise<string> {
	const u = await db.query.users.findFirst({ where: eq(users.email, email) });
	if (!u) throw new Error(`No user with email ${email}`);
	return u.id;
}

/** Looks up a dataset id by its key. @throws Error when no such dataset exists. */
async function resolveDatasetId(key: string): Promise<string> {
	const d = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, key) });
	if (!d) throw new Error(`No dataset with key ${key}`);
	return d.id;
}

/** Grants a user access to a dataset; granting twice is a no-op. */
async function grant(email: string, key: string) {
	const userId = await resolveUserId(email);
	const datasetId = await resolveDatasetId(key);
	await db.insert(catalogAccess).values({ userId, datasetId }).onConflictDoNothing();
	console.log(`Granted "${key}" to ${email}`);
}

/** Removes a user's access to a dataset; succeeds even when no grant existed. */
async function revoke(email: string, key: string) {
	const userId = await resolveUserId(email);
	const datasetId = await resolveDatasetId(key);
	// Values interpolated into the `sql` template are sent as bound parameters.
	await db
		.delete(catalogAccess)
		.where(sql`${catalogAccess.userId} = ${userId} AND ${catalogAccess.datasetId} = ${datasetId}`);
	console.log(`Revoked "${key}" from ${email}`);
}

/** Prints one line per dataset with its source, priority, size and granted emails. */
async function list() {
	const datasets = await db.select().from(catalogDatasets);
	for (const d of datasets) {
		const grants = await db
			.select({ email: users.email })
			.from(catalogAccess)
			.innerJoin(users, eq(users.id, catalogAccess.userId))
			.where(eq(catalogAccess.datasetId, d.id));
		console.log(
			`${d.key} (${d.source}, prio ${d.priority}, ${d.productCount ?? 0} products) -> ${
				grants.map((g) => g.email).join(', ') || '(no grants)'
			}`
		);
	}
}

const [cmd, ...args] = process.argv.slice(2);
const [firstArg, secondArg] = args;

// NOTE(review): the argument placeholders in the usage strings were missing in
// the plan text; reconstructed from the function signatures — confirm wording.
let exitCode = 0;
try {
	if (cmd === 'import') {
		if (!firstArg) throw new Error('Usage: catalog import <file.jsonl>');
		await importDataset(firstArg);
	} else if (cmd === 'grant') {
		if (!firstArg || !secondArg) throw new Error('Usage: catalog grant <email> <dataset-key>');
		await grant(firstArg, secondArg);
	} else if (cmd === 'revoke') {
		if (!firstArg || !secondArg) throw new Error('Usage: catalog revoke <email> <dataset-key>');
		await revoke(firstArg, secondArg);
	} else if (cmd === 'list') {
		await list();
	} else {
		throw new Error(`Unknown command: ${cmd ?? '(none)'}. Expected: import|grant|revoke|list`);
	}
} catch (e) {
	console.error(e instanceof Error ? e.message : String(e));
	exitCode = 1;
} finally {
	// Close the connection exactly once on every path; previously a failure in
	// the success-path `client.end()` triggered a second `client.end()` in catch.
	await client.end();
}
process.exit(exitCode);
+ +- [ ] **Step 5: Commit** + +```bash +git add scripts/catalog.ts tests/integration-db/catalog-import.test.ts +git commit -m "feat: add catalog:grant/revoke/list CLI subcommands" +``` + +--- + +### Task 5: Repo guardrails — gitignore + prek hook blocking committed datasets + +**Files:** +- Modify: `.gitignore` +- Modify: `.pre-commit-config.yaml` + +- [ ] **Step 1: Add the ignore rule** + +Append to the end of `.gitignore`: + +``` + +# Crawled catalog datasets — never commit (public repo; private data) +data/catalog/ +``` + +- [ ] **Step 2: Add a local prek guard hook** + +In `.pre-commit-config.yaml`, add this hook as the last entry under `hooks:` (same `repo: local` block, matching the existing `language: system` style): + +```yaml + - id: no-catalog-data + name: no committed catalog datasets + entry: bash -c 'if git diff --cached --name-only | grep -E "^data/catalog/.*\.jsonl$"; then echo "ERROR: catalog dataset files must not be committed (public repo)"; exit 1; fi' + language: system + pass_filenames: false +``` + +- [ ] **Step 3: Verify the guard triggers** + +Run: + +```bash +mkdir -p data/catalog && cp tests/fixtures/catalog/mini.jsonl data/catalog/x.jsonl && git add -f data/catalog/x.jsonl && bunx prek run no-catalog-data --hook-stage pre-commit; echo "exit=$?" +``` + +Expected: prints the ERROR line and `exit=1`. Then clean up: + +```bash +git reset data/catalog/x.jsonl && rm -rf data/catalog +``` + +Expected: `data/catalog/` is gitignored and untracked. 
import { describe, it, expect } from 'vitest';
import { extractNutrient, extractAllNutrients } from './nutrient-extract';

describe('nutrient-extract', () => {
	it('extractNutrient returns null for missing/NaN and rounds with conversion', () => {
		// Missing key, unparsable string and missing OFF key all yield null.
		const nullCases: Array<[Record<string, number | string>, string | undefined]> = [
			[{}, 'x_100g'],
			[{ x_100g: 'abc' }, 'x_100g'],
			[{}, undefined]
		];
		for (const [nutriments, offKey] of nullCases) {
			expect(extractNutrient(nutriments, offKey)).toBeNull();
		}

		// Numeric inputs are rounded to two decimals, after optional conversion.
		const roundedPlain = extractNutrient({ x_100g: 1.234 }, 'x_100g');
		expect(roundedPlain).toBe(1.23);
		const converted = extractNutrient({ x_100g: 0.5 }, 'x_100g', 1000);
		expect(converted).toBe(500);
		const fromString = extractNutrient({ x_100g: '2.5' }, 'x_100g');
		expect(fromString).toBe(2.5);
	});

	it('extractAllNutrients maps every ALL_NUTRIENTS key', async () => {
		const nutrientsModule = await import('$lib/nutrients');
		const extracted = extractAllNutrients({ 'saturated-fat_100g': 5 });
		for (const key of nutrientsModule.ALL_NUTRIENT_KEYS) {
			expect(key in extracted).toBe(true);
		}
		expect(extracted.saturatedFat).toBe(5);
	});
});
/**
 * Reads one nutrient value out of an Open Food Facts style `nutriments` map.
 *
 * @param nutriments Map of OFF keys (e.g. `saturated-fat_100g`) to raw values,
 *   which may be numbers or numeric strings.
 * @param offKey Key to read; `undefined` means the nutrient has no OFF mapping.
 * @param conversion Optional multiplier applied before rounding (e.g. g → mg).
 *   A falsy value (including 0) means "no conversion".
 * @returns The value rounded to two decimals, or `null` when the key is
 *   missing, the value is null/undefined, or it does not parse as a number.
 */
export function extractNutrient(
	nutriments: Record<string, number | string | null | undefined>,
	offKey: string | undefined,
	conversion?: number
): number | null {
	if (!offKey) return null;
	const raw = nutriments[offKey];
	if (raw == null) return null;
	const num = typeof raw === 'string' ? parseFloat(raw) : raw;
	// `num` is always a number here, so Number.isNaN is equivalent to the
	// coercing global isNaN and states the intent more precisely.
	if (Number.isNaN(num)) return null;
	const scaled = conversion ? num * conversion : num;
	return Math.round(scaled * 100) / 100;
}
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { eq } from 'drizzle-orm';
import { $ } from 'bun';
import { join } from 'node:path';
import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers';
import { catalogDatasets, catalogFoods, catalogAccess, users, foods } from '$lib/server/schema';

const DB_NAME = 'test_catalog_endpoints';
let dbUrl: string;
const FIXTURE = join(process.cwd(), 'tests/fixtures/catalog/mini.jsonl');

// One database for the whole file, pre-loaded with the 'testset' fixture via
// the CLI. Tests share this state, so each uses its own user to stay isolated.
beforeAll(async () => {
	dbUrl = await createTestDatabase(DB_NAME);
	await runTestMigrations(dbUrl);
	await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl });
});
afterAll(async () => {
	await closeTestDB(dbUrl);
	await dropTestDatabase(DB_NAME);
});

describe('catalog queries', () => {
	it('catalogSearch returns nothing for an ungranted user, results for a granted one', async () => {
		const db = getTestDB(dbUrl);
		const { catalogSearch } = await import('$lib/server/catalog/queries');
		const [u] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-q-1', email: 'q1@example.com' })
			.returning();

		// No grant yet: the access join must filter everything out.
		expect((await catalogSearch(db as never, u.id, 'Zweifel', 10)).length).toBe(0);

		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });

		const res = await catalogSearch(db as never, u.id, 'Zweifel', 10);
		expect(res.length).toBe(1);
		expect(res[0].name).toBe('Zweifel Paprika Chips');
		expect(res[0].datasetKey).toBe('testset');
	});

	it('catalogByBarcode honors access and dataset priority tie-break', async () => {
		const db = getTestDB(dbUrl);
		const { catalogByBarcode } = await import('$lib/server/catalog/queries');
		const [u] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-q-2', email: 'q2@example.com' })
			.returning();
		expect(await catalogByBarcode(db as never, u.id, '7610095131003')).toBeNull();

		const dsLow = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
		// Second dataset, same barcode, lower priority number (=higher precedence)
		const [dsHi] = await db
			.insert(catalogDatasets)
			.values({ key: 'prio', name: 'Prio', source: 'migros', priority: 1, productCount: 1 })
			.returning();
		await db.insert(catalogFoods).values({
			datasetId: dsHi.id,
			name: 'PRIO WINNER',
			servingSize: 100,
			servingUnit: 'g',
			calories: 1,
			protein: 1,
			carbs: 1,
			fat: 1,
			fiber: 1,
			barcode: '7610095131003'
		});
		// Grant both datasets so the result is decided by priority, not access.
		await db.insert(catalogAccess).values({ userId: u.id, datasetId: dsLow.id });
		await db.insert(catalogAccess).values({ userId: u.id, datasetId: dsHi.id });

		const hit = await catalogByBarcode(db as never, u.id, '7610095131003');
		expect(hit!.name).toBe('PRIO WINNER');
	});

	it('instantiateCatalogFood creates a personal food and never mutates the catalog row', async () => {
		const db = getTestDB(dbUrl);
		const { instantiateCatalogFood } = await import('$lib/server/catalog/queries');
		const [u] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-q-3', email: 'q3@example.com' })
			.returning();
		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });
		// Any row of the dataset will do; no ordering is applied here.
		const [cf] = await db
			.select()
			.from(catalogFoods)
			.where(eq(catalogFoods.datasetId, ds.id))
			.limit(1);

		const food = await instantiateCatalogFood(db as never, u.id, cf.id);
		expect(food).toBeTruthy();
		expect(food!.userId).toBe(u.id);
		expect(food!.name).toBe(cf.name);

		// Copy-on-use: one personal food was created and the catalog row remains.
		const personal = await db.select().from(foods).where(eq(foods.userId, u.id));
		expect(personal.length).toBe(1);
		const stillThere = await db.select().from(catalogFoods).where(eq(catalogFoods.id, cf.id));
		expect(stillThere.length).toBe(1);

		// Ungranted user cannot instantiate
		const [u2] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-q-4', email: 'q4@example.com' })
			.returning();
		await expect(instantiateCatalogFood(db as never, u2.id, cf.id)).resolves.toBeNull();
	});
});
import { and, eq, ilike, asc } from 'drizzle-orm';
import type { getDB } from '$lib/server/db';
import { catalogFoods, catalogDatasets, catalogAccess, foods } from '$lib/server/schema';
import { createFood } from '$lib/server/foods';
import { pickNutrients } from '$lib/nutrients';
import type { Result } from '$lib/server/types';

// NOTE(review): every generic argument in this module was missing in the plan
// text (`ReturnType;`, `Promise {`, `Result = …`). They are reconstructed from
// usage below — confirm `Result<Food>` against `$lib/server/types`.
type DB = ReturnType<typeof getDB>;
type Food = typeof foods.$inferSelect;

/** A catalog row annotated with the dataset it came from. */
export type CatalogResult = typeof catalogFoods.$inferSelect & {
	datasetKey: string;
	source: string;
};

/** Escapes LIKE wildcards (`%`, `_`) and the escape character itself. */
function escapeLike(q: string): string {
	return q.replace(/\\/g, '\\\\').replace(/%/g, '\\%').replace(/_/g, '\\_');
}

/**
 * Case-insensitive substring search over catalog food names, restricted to
 * datasets the user has been granted.
 *
 * @returns Matches ordered by dataset priority (lowest number first), then
 *   name, then id; an empty array for a blank query.
 */
export async function catalogSearch(
	db: DB,
	userId: string,
	query: string,
	limit: number
): Promise<CatalogResult[]> {
	const q = escapeLike(query.trim());
	if (q.length === 0) return [];
	const rows = await db
		.select({
			cf: catalogFoods,
			datasetKey: catalogDatasets.key,
			source: catalogDatasets.source,
			priority: catalogDatasets.priority
		})
		.from(catalogFoods)
		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
		// The inner join on the grant table is the access gate: rows from
		// datasets without a (user, dataset) grant never reach the result.
		.innerJoin(
			catalogAccess,
			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
		)
		.where(ilike(catalogFoods.name, `%${q}%`))
		// Final id key makes the LIMIT cut-off stable for identically named rows.
		.orderBy(asc(catalogDatasets.priority), asc(catalogFoods.name), asc(catalogFoods.id))
		.limit(limit);
	return rows.map((r) => ({ ...r.cf, datasetKey: r.datasetKey, source: r.source }));
}

/**
 * Exact barcode lookup across the user's granted datasets.
 *
 * @returns The hit from the dataset with the lowest priority number, or `null`
 *   when no granted dataset contains the barcode.
 */
export async function catalogByBarcode(
	db: DB,
	userId: string,
	barcode: string
): Promise<CatalogResult | null> {
	const rows = await db
		.select({
			cf: catalogFoods,
			datasetKey: catalogDatasets.key,
			source: catalogDatasets.source
		})
		.from(catalogFoods)
		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
		.innerJoin(
			catalogAccess,
			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
		)
		.where(eq(catalogFoods.barcode, barcode))
		// With LIMIT 1, ordering by priority alone leaves the winner undefined
		// when two datasets share a priority; the id key makes it deterministic.
		.orderBy(asc(catalogDatasets.priority), asc(catalogFoods.id))
		.limit(1);
	const r = rows[0];
	return r ? { ...r.cf, datasetKey: r.datasetKey, source: r.source } : null;
}

/**
 * Copy-on-use: creates a personal food for `userId` from a catalog row. The
 * catalog row itself is only read.
 *
 * @returns The created food, or `null` when the row does not exist, the user
 *   has no grant for its dataset, or `createFood` reports a failure.
 */
export async function instantiateCatalogFood(
	db: DB,
	userId: string,
	catalogFoodId: string
): Promise<Food | null> {
	const rows = await db
		.select({ cf: catalogFoods })
		.from(catalogFoods)
		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
		.innerJoin(
			catalogAccess,
			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
		)
		.where(eq(catalogFoods.id, catalogFoodId))
		.limit(1);
	const cf = rows[0]?.cf;
	if (!cf) return null;

	const payload = {
		name: cf.name,
		brand: cf.brand,
		servingSize: cf.servingSize,
		servingUnit: cf.servingUnit,
		calories: cf.calories,
		protein: cf.protein,
		carbs: cf.carbs,
		fat: cf.fat,
		fiber: cf.fiber,
		barcode: cf.barcode,
		nutriScore: cf.nutriScore as 'a' | 'b' | 'c' | 'd' | 'e' | null,
		novaGroup: cf.novaGroup,
		additives: cf.additives,
		ingredientsText: cf.ingredientsText,
		imageUrl: cf.imageUrl,
		...pickNutrients(cf as Record<string, unknown>)
	};
	const result: Result<Food> = await createFood(userId, payload);
	if (!result.success) {
		// Barcode already in the user's personal DB → treat as a benign no-op miss
		// NOTE(review): every createFood failure is mapped to null here, not only
		// duplicate barcodes — callers cannot tell the cases apart.
		return null;
	}
	return result.data;
}
import { listFoods } from '$lib/server/foods';

// Regression guard: granting catalog access must not make catalog rows show up
// in the personal-food listing.
describe('catalog isolation from personal foods', () => {
	it('listFoods (the /api/foods source) never returns catalog rows', async () => {
		const db = getTestDB(dbUrl);
		const [u] = await db
			.insert(users)
			.values({ infomaniakSub: 'sub-iso-1', email: 'iso@example.com' })
			.returning();
		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });
		// listFoods uses getDB() internally; this asserts the personal-food query
		// is unaffected by catalog presence for a user with zero personal foods.
		// 'Zweifel' matches a catalog fixture row, so any leak would surface here.
		const { items } = await listFoods(u.id, { query: 'Zweifel' });
		expect(items.length).toBe(0);
	});
});
import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';
import { requireAuth, handleApiError } from '$lib/server/errors';
import { getDB } from '$lib/server/db';
import { catalogSearch } from '$lib/server/catalog/queries';

const DEFAULT_LIMIT = 20;
const MAX_LIMIT = 50;
const MIN_QUERY_LENGTH = 2;

/**
 * GET /api/catalog/search?q=…&limit=…
 *
 * Searches the requesting user's granted catalog datasets. Queries shorter
 * than two characters (after trimming) return an empty result without hitting
 * the database. `limit` is clamped to 1–50 and defaults to 20 when it is not
 * a finite number.
 */
export const GET: RequestHandler = async ({ locals, url }) => {
	try {
		const userId = requireAuth(locals);
		const q = url.searchParams.get('q') ?? '';
		const limitRaw = Number(url.searchParams.get('limit') ?? String(DEFAULT_LIMIT));
		// Truncate before clamping: a fractional value such as `?limit=2.5` is
		// finite and in range, but is not a valid row count for SQL LIMIT.
		const limit = Number.isFinite(limitRaw)
			? Math.min(Math.max(Math.trunc(limitRaw), 1), MAX_LIMIT)
			: DEFAULT_LIMIT;
		if (q.trim().length < MIN_QUERY_LENGTH) {
			return json({ results: [] });
		}
		const results = await catalogSearch(getDB(), userId, q, limit);
		return json({ results });
	} catch (error) {
		return handleApiError(error);
	}
};
routes in `src/lib/server/openapi.ts`** + +Open `src/lib/server/openapi.ts`. Near the top, ensure `z` and the catalog query types are usable; add a response schema near the other schemas and add the three paths into the object passed to the path map (mirror the `'/api/foods'` entry style). Add this block alongside the other `'/api/...'` keys: + +```typescript + '/api/catalog/search': { + get: { + operationId: 'catalogSearch', + tags: ['Catalog'], + description: 'Online catalog search across the requesting user’s granted datasets.', + requestParams: { + query: z.object({ q: z.string(), limit: z.number().int().optional() }) + }, + responses: { + '200': { + description: 'Success', + content: { + 'application/json': { + schema: z.object({ + results: z.array(z.record(z.string(), z.unknown())) + }) + } + } + }, + '401': res401 + } + } + }, + '/api/catalog/barcode/{code}': { + get: { + operationId: 'catalogByBarcode', + tags: ['Catalog'], + description: 'Barcode lookup across granted catalog datasets (priority tie-break).', + requestParams: { path: z.object({ code: z.string() }) }, + responses: { + '200': { + description: 'Found', + content: { + 'application/json': { + schema: z.object({ + found: z.boolean(), + result: z.record(z.string(), z.unknown()).optional() + }) + } + } + }, + '400': res400, + '401': res401, + '404': { + description: 'Not found', + content: { + 'application/json': { schema: z.object({ found: z.boolean() }) } + } + } + } + } + }, + '/api/catalog/{id}/save': { + post: { + operationId: 'saveCatalogFood', + tags: ['Catalog'], + description: 'Instantiate a personal food from a catalog row (copy-on-use).', + requestParams: { path: z.object({ id: z.string() }) }, + responses: { + '201': { + description: 'Created', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '401': res401, + '404': res404 + } + } + }, +``` + +(Use the same `res401`/`res400`/`res404`/`foodResponseSchema` symbols already defined in this file for the `/api/foods` 
routes. If `res404` is not defined there, reuse the inline 404 shape shown for `/api/catalog/barcode/{code}`.) + +- [ ] **Step 7: Regenerate the OpenAPI spec + typed client** + +Run: `bun run api:generate:ts && bunx prettier --write docs/openapi.json src/lib/api/generated/` +Expected: `docs/openapi.json` and `src/lib/api/generated/schema.d.ts` now contain `/api/catalog/search`, `/api/catalog/barcode/{code}`, `/api/catalog/{id}/save`. + +Run: `bun run api:check` +Expected: exits 0 (no diff after regen). This is the same check CI runs. + +- [ ] **Step 8: Run integration + type check** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-endpoints.test.ts && bun run check` +Expected: tests PASS; `bun run check` exits 0. + +- [ ] **Step 9: Commit** + +```bash +git add src/routes/api/catalog src/lib/server/openapi.ts docs/openapi.json src/lib/api/generated tests/integration-db/catalog-endpoints.test.ts +git commit -m "feat: add /api/catalog search/barcode/save endpoints + openapi" +``` + +--- + +### Task 9: i18n strings for catalog UI + +**Files:** +- Modify: `messages/en.json` +- Modify: `messages/de.json` + +- [ ] **Step 1: Add English strings** + +In `messages/en.json`, add these keys (keep the file’s existing alphabetical/grouping convention; place near other `add_food_*` keys): + +```json + "catalog_source_badge": "{source}", + "add_food_catalog_searching": "Searching catalog…", + "add_food_catalog_section": "From catalog", + "add_food_catalog_add_failed": "Could not add this product. It may already be in your foods." +``` + +- [ ] **Step 2: Add German strings** + +In `messages/de.json`, add the same keys: + +```json + "catalog_source_badge": "{source}", + "add_food_catalog_searching": "Katalog wird durchsucht…", + "add_food_catalog_section": "Aus Katalog", + "add_food_catalog_add_failed": "Produkt konnte nicht hinzugefügt werden. Es ist evtl. schon in deinen Lebensmitteln." 
+``` + +- [ ] **Step 3: Compile messages + typecheck** + +Run: `bun run paraglide:compile && bun run check` +Expected: compiles; `bun run check` exits 0; `m.add_food_catalog_section` etc. are now typed. + +- [ ] **Step 4: Commit** + +```bash +git add messages/en.json messages/de.json +git commit -m "feat: add i18n strings for catalog picker" +``` + +--- + +### Task 10: FoodPicker — online catalog search + source badge + +**Files:** +- Modify: `src/lib/components/entries/FoodPicker.svelte` + +- [ ] **Step 1: Extend `PickerSelection` and add catalog state** + +In the ` @@ -151,6 +187,38 @@ {/each} + {#if catalogLoading} +

{m.add_food_catalog_searching()}

+ {:else if catalogResults.length > 0} +

{m.add_food_catalog_section()}

+
    + {#each catalogResults as hit (hit.id)} +
  • + + {hit.name} + {m.catalog_source_badge({ source: hit.source })} + + +
  • + {/each} +
+ {/if} From a46f22d8985275ddb687f7faa9257a7bd6f0d798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Thu, 21 May 2026 21:30:01 +0200 Subject: [PATCH 16/26] feat: catalog pick instantiates a personal food then logs (copy-on-use) --- src/lib/components/entries/AddFoodModal.svelte | 17 ++++++++++++++++- src/lib/services/food-service.svelte.ts | 13 ++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/lib/components/entries/AddFoodModal.svelte b/src/lib/components/entries/AddFoodModal.svelte index 83647f3f..cf37af7b 100644 --- a/src/lib/components/entries/AddFoodModal.svelte +++ b/src/lib/components/entries/AddFoodModal.svelte @@ -17,6 +17,7 @@ import { Label } from '$lib/components/ui/label/index.js'; import { timeToIsoString, currentTime24h } from '$lib/utils/dates'; import * as m from '$lib/paraglide/messages'; + import { foodService } from '$lib/services/food-service.svelte'; type Props = { open?: boolean; @@ -80,7 +81,7 @@ wasOpen = open; }); - const handleSelect = (selection: PickerSelection) => { + const handleSelect = async (selection: PickerSelection) => { if (selection.type === 'food') { selectedFood = { id: selection.food.id, @@ -92,6 +93,20 @@ }; } else if (selection.type === 'recipe') { selectedFood = { id: selection.recipe.id, name: selection.recipe.name, type: 'recipe' }; + } else if (selection.type === 'catalog') { + const food = await foodService.saveFromCatalog(selection.catalog.id); + if (!food) { + alert(m.add_food_catalog_add_failed()); + return; + } + selectedFood = { + id: food.id, + name: food.name, + type: 'food', + servingSize: food.servingSize, + servingUnit: food.servingUnit, + calories: food.calories + }; } else if (selection.type === 'favorite') { selectedFood = { id: selection.favorite.id, diff --git a/src/lib/services/food-service.svelte.ts b/src/lib/services/food-service.svelte.ts index 64fde79d..dc3261d4 100644 --- a/src/lib/services/food-service.svelte.ts +++ 
b/src/lib/services/food-service.svelte.ts @@ -198,6 +198,16 @@ async function findByBarcode(barcode: string): Promise { } } +async function saveFromCatalog(catalogId: string): Promise { + const { data } = await api.POST('/api/catalog/{id}/save', { + params: { path: { id: catalogId } } + }); + if (!data?.food) return null; + const food = data.food as unknown as DexieFood; + await db.foods.put(food); + return food; +} + export const foodService = { allFoods, foodById, @@ -208,5 +218,6 @@ export const foodService = { create, update, delete: deleteFood, - findByBarcode + findByBarcode, + saveFromCatalog }; From c88a38a1591922592adf2ec796f15ff954cfd518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Thu, 21 May 2026 21:32:18 +0200 Subject: [PATCH 17/26] feat: barcode scan checks catalog before Open Food Facts fallback --- src/lib/components/entries/DayLog.svelte | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/lib/components/entries/DayLog.svelte b/src/lib/components/entries/DayLog.svelte index 6f7781f3..482cc6df 100644 --- a/src/lib/components/entries/DayLog.svelte +++ b/src/lib/components/entries/DayLog.svelte @@ -7,6 +7,7 @@ import { DEFAULT_MEAL_TYPES, getCurrentMealByTime } from '$lib/utils/meals'; import { goto } from '$app/navigation'; import { useLiveQuery } from '$lib/db/live.svelte'; + import { api } from '$lib/api/client'; import { entryService } from '$lib/services/entry-service.svelte'; import { foodService } from '$lib/services/food-service.svelte'; import { recipeService } from '$lib/services/recipe-service.svelte'; @@ -147,9 +148,24 @@ if (food) { barcodeFoodId = food.id; addModalOpen = true; - } else { - goto(`/foods?barcode=${encodeURIComponent(barcode)}`); + return; } + try { + const { data } = await api.GET('/api/catalog/barcode/{code}', { + params: { path: { code: barcode } } + }); + if (data?.found && data.result) { + const saved = await foodService.saveFromCatalog((data.result as { 
id: string }).id); + if (saved) { + barcodeFoodId = saved.id; + addModalOpen = true; + return; + } + } + } catch { + // fall through to OFF prefill + } + goto(`/foods?barcode=${encodeURIComponent(barcode)}`); }; }; From 7a7e0b3890f7fab56868ac4e5c96e472986238ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Thu, 21 May 2026 21:47:39 +0200 Subject: [PATCH 18/26] =?UTF-8?q?chore:=20catalog=20UX/OpenAPI=20polish=20?= =?UTF-8?q?=E2=80=94=20toast=20for=20save=20error;=20uuid=20format=20for?= =?UTF-8?q?=20catalog=20id?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/openapi.json | 4 +++- src/lib/components/entries/AddFoodModal.svelte | 3 ++- src/lib/server/openapi.ts | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/openapi.json b/docs/openapi.json index ce22788b..d87b9880 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2516,7 +2516,9 @@ "in": "path", "name": "id", "schema": { - "type": "string" + "type": "string", + "format": "uuid", + "pattern": "^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-8][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}|00000000-0000-0000-0000-000000000000|ffffffff-ffff-ffff-ffff-ffffffffffff)$" }, "required": true } diff --git a/src/lib/components/entries/AddFoodModal.svelte b/src/lib/components/entries/AddFoodModal.svelte index cf37af7b..1f51e146 100644 --- a/src/lib/components/entries/AddFoodModal.svelte +++ b/src/lib/components/entries/AddFoodModal.svelte @@ -17,6 +17,7 @@ import { Label } from '$lib/components/ui/label/index.js'; import { timeToIsoString, currentTime24h } from '$lib/utils/dates'; import * as m from '$lib/paraglide/messages'; + import { toast } from 'svelte-sonner'; import { foodService } from '$lib/services/food-service.svelte'; type Props = { @@ -96,7 +97,7 @@ } else if (selection.type === 'catalog') { const food = await foodService.saveFromCatalog(selection.catalog.id); if (!food) { - alert(m.add_food_catalog_add_failed()); 
+ toast.error(m.add_food_catalog_add_failed()); return; } selectedFood = { diff --git a/src/lib/server/openapi.ts b/src/lib/server/openapi.ts index d116c1a3..8e9364a1 100644 --- a/src/lib/server/openapi.ts +++ b/src/lib/server/openapi.ts @@ -1351,7 +1351,7 @@ export function generateSpec() { operationId: 'saveCatalogFood', tags: ['Catalog'], description: 'Instantiate a personal food from a catalog row (copy-on-use).', - requestParams: { path: z.object({ id: z.string() }) }, + requestParams: { path: z.object({ id: z.string().uuid() }) }, responses: { '201': { description: 'Created', From 5f8775203ea7aa8d7a4186dd6daea87190391793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 01:15:39 +0200 Subject: [PATCH 19/26] feat(crawler): scaffold Bun package + shared lib (normalize, jsonl, http, checkpoint) --- crawler/README.md | 67 ++++++++++ crawler/bun.lock | 211 +++++++++++++++++++++++++++++++ crawler/lib/checkpoint.test.ts | 15 +++ crawler/lib/checkpoint.ts | 13 ++ crawler/lib/http.test.ts | 58 +++++++++ crawler/lib/http.ts | 57 +++++++++ crawler/lib/jsonl-stream.test.ts | 13 ++ crawler/lib/jsonl-stream.ts | 26 ++++ crawler/lib/jsonl-writer.test.ts | 48 +++++++ crawler/lib/jsonl-writer.ts | 53 ++++++++ crawler/lib/normalize.test.ts | 51 ++++++++ crawler/lib/normalize.ts | 71 +++++++++++ crawler/lib/smoke.test.ts | 22 ++++ crawler/package.json | 14 ++ crawler/tsconfig.json | 20 +++ crawler/types.test.ts | 16 +++ crawler/types.ts | 77 +++++++++++ 17 files changed, 832 insertions(+) create mode 100644 crawler/README.md create mode 100644 crawler/bun.lock create mode 100644 crawler/lib/checkpoint.test.ts create mode 100644 crawler/lib/checkpoint.ts create mode 100644 crawler/lib/http.test.ts create mode 100644 crawler/lib/http.ts create mode 100644 crawler/lib/jsonl-stream.test.ts create mode 100644 crawler/lib/jsonl-stream.ts create mode 100644 crawler/lib/jsonl-writer.test.ts create mode 100644 crawler/lib/jsonl-writer.ts create mode 
100644 crawler/lib/normalize.test.ts create mode 100644 crawler/lib/normalize.ts create mode 100644 crawler/lib/smoke.test.ts create mode 100644 crawler/package.json create mode 100644 crawler/tsconfig.json create mode 100644 crawler/types.test.ts create mode 100644 crawler/types.ts diff --git a/crawler/README.md b/crawler/README.md new file mode 100644 index 00000000..4ca6c420 --- /dev/null +++ b/crawler/README.md @@ -0,0 +1,67 @@ +# Bissbilanz Catalog Crawler + +Offline tool that builds **catalog datasets** (normalized JSONL) for the access-gated +base food catalog. It is **not part of the SvelteKit app**, its build, or `bun run security` +scope — nothing under `src/` imports it. + +## Legal posture + +- **Private use, no redistribution.** Crawler _code_ ships in this repo; crawled _data_ never + does. Datasets are written under `data/catalog/` which is git-ignored and rejected by a + pre-commit hook (`no-catalog-data`). +- Output is imported only into this app's database and surfaced only to its authenticated, + individually access-granted users. It is not rehosted or redistributed. +- Retailer images are referenced by source URL only — never rehosted. +- Sources are accessed politely: fixed-delay throttling, on-disk response caching, descriptive + User-Agent, exponential-backoff retry. + +## Dataset format + +One JSONL file per dataset. Line 1 is a `{ "_dataset": { ... } }` header; lines 2..n are one +product per line. The contract is the shared Zod schema +`src/lib/server/catalog/dataset-schema.ts` — the crawler validates every emitted row against it, +so a produced file always imports cleanly (`catalog:import` is fail-closed). 
+ +## Usage + +```bash +cd crawler +bun install # installs migros-api-wrapper (Migros source only) + +# Open Food Facts — from a downloaded ODbL bulk dump (.jsonl or .jsonl.gz): +# download once from https://world.openfoodfacts.org/data (openfoodfacts-products.jsonl.gz) +bun run crawl off /path/to/openfoodfacts-products.jsonl.gz +# → writes data/catalog/off-ch-<date>.jsonl (Swiss products with full core macros) + +# Migros — live API (polite, throttled): +bun run crawl migros +# → writes data/catalog/migros-<date>.jsonl +``` + +The OFF dump is large (tens of GB uncompressed); the crawler streams it (gunzip + line split), +never loading it into memory. The Migros crawl is live and rate-limited — expect it to take a +while; it checkpoints progress. + +## Importing on the server host + +The CLI that loads a dataset into Postgres runs **on the server host** (production Postgres is +Docker-internal), not from the crawler: + +```bash +scp data/catalog/migros-<date>.jsonl server:/tmp/ +ssh server +docker compose exec -T app bun run catalog:import /tmp/migros-<date>.jsonl +docker compose exec -T app bun run catalog:grant migros +``` + +Re-importing the same dataset `key` fully replaces its rows and preserves access grants. + +## Testing + +```bash +cd crawler && bun test +``` + +All tests are fixture-driven — no live network. Adapters split a pure, tested normalizer from +thin live-fetch glue; the glue (`createMigrosClient`, dump download) is exercised only by the +maintainer during a real crawl. 
diff --git a/crawler/bun.lock b/crawler/bun.lock new file mode 100644 index 00000000..9bbf9985 --- /dev/null +++ b/crawler/bun.lock @@ -0,0 +1,211 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "@bissbilanz/crawler", + "dependencies": { + "migros-api-wrapper": "1.1.37", + }, + }, + }, + "packages": { + "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], + + "agent-base": ["agent-base@6.0.2", "", { "dependencies": { "debug": "4" } }, "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ=="], + + "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], + + "atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="], + + "axios": ["axios@1.16.1", "", { "dependencies": { "follow-redirects": "^1.16.0", "form-data": "^4.0.5", "https-proxy-agent": "^5.0.1", "proxy-from-env": "^2.1.0" } }, "sha512-caYkukvroVPO8KrzuJEb50Hm07KwfBZPEC3VeFHTsqWHvKTsy54hjJz9BS/cdaypROE2rH6xvm9mHX4fgWkr3A=="], + + "balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="], + + "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="], + + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + + "brace-expansion": ["brace-expansion@2.1.1", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-WR1cURNjuvBLMZBMbqM0UoE+WAfdUcEV1ccD8PVBVOI+Z3ND4+SZbN8RsfT2bMuG1qwz5RFvPukSZm5fF2D5eA=="], + + "buffer": ["buffer@6.0.3", "", { "dependencies": { 
"base64-js": "^1.3.1", "ieee754": "^1.2.1" } }, "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], + + "cheerio": ["cheerio@1.2.0", "", { "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "encoding-sniffer": "^0.2.1", "htmlparser2": "^10.1.0", "parse5": "^7.3.0", "parse5-htmlparser2-tree-adapter": "^7.1.0", "parse5-parser-stream": "^7.1.2", "undici": "^7.19.0", "whatwg-mimetype": "^4.0.0" } }, "sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg=="], + + "cheerio-select": ["cheerio-select@2.1.0", "", { "dependencies": { "boolbase": "^1.0.0", "css-select": "^5.1.0", "css-what": "^6.1.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1" } }, "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g=="], + + "colorette": ["colorette@2.0.20", "", {}, "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w=="], + + "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], + + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + + "dateformat": ["dateformat@4.6.3", "", {}, 
"sha512-2P0p0pFGzHS5EMnhdxQi7aJN+iMheud0UhG4dlE1DLAlvL8JHjJJTX/CSm4JXwV0Ka5nGk3zC5mcb5bUQUxxMA=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "deepmerge": ["deepmerge@4.3.1", "", {}, "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A=="], + + "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], + + "dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], + + "domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], + + "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], + + "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], + + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], + + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + + "encoding-sniffer": ["encoding-sniffer@0.2.1", "", { "dependencies": { "iconv-lite": "^0.6.3", "whatwg-encoding": "^3.1.1" } }, 
"sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw=="], + + "end-of-stream": ["end-of-stream@1.4.5", "", { "dependencies": { "once": "^1.4.0" } }, "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg=="], + + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.2", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-HWcBoN6NileqtSydK2FqHbS/LoDd2pqrnQHLyJzBj4kOp/ky2MWMN694xOfkK8/SnUsW2DH7EfyVlydKCsm1Zw=="], + + "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], + + "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], + + "events": ["events@3.3.0", "", {}, "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q=="], + + "fast-copy": ["fast-copy@3.0.2", "", {}, "sha512-dl0O9Vhju8IrcLndv2eU4ldt1ftXMqqfgN4H1cpmGV7P6jeB9FwpN9a2c8DPGE1Ys88rNUJVYDHq73CGAGOPfQ=="], + + "fast-redact": ["fast-redact@3.5.0", "", {}, "sha512-dwsoQlS7h9hMeYUq1W++23NDcBLV4KqONnITDV9DjfS3q1SgDGVrBdvvTLUotWtPSD7asWDV9/CmsZPy8Hf70A=="], + + "fast-safe-stringify": ["fast-safe-stringify@2.1.1", "", {}, "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA=="], + + 
"follow-redirects": ["follow-redirects@1.16.0", "", {}, "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw=="], + + "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], + + "fs.realpath": ["fs.realpath@1.0.0", "", {}, "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="], + + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + + "glob": ["glob@8.1.0", "", { "dependencies": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", "inherits": "2", "minimatch": "^5.0.1", "once": "^1.3.0" } }, "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ=="], + + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "has-tostringtag": ["has-tostringtag@1.0.2", 
"", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], + + "hasown": ["hasown@2.0.4", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-T2UbfbBEF32wiepXIsMlTW9+dDYC6wMh/t/vYA4tuOMKqWz/n3vr1NFSxQiyP+zk2mXsoMA/i/7qV6LKut1t1A=="], + + "help-me": ["help-me@4.2.0", "", { "dependencies": { "glob": "^8.0.0", "readable-stream": "^3.6.0" } }, "sha512-TAOnTB8Tz5Dw8penUuzHVrKNKlCIbwwbHnXraNJxPwf8LRtE2HlM84RYuezMFcwOJmoYOCWVDyJ8TQGxn9PgxA=="], + + "htmlparser2": ["htmlparser2@10.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "entities": "^7.0.1" } }, "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ=="], + + "https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="], + + "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], + + "ieee754": ["ieee754@1.2.1", "", {}, "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA=="], + + "inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="], + + "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], + + "joycon": ["joycon@3.1.1", "", {}, "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw=="], + + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, 
"sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + + "migros-api-wrapper": ["migros-api-wrapper@1.1.37", "", { "dependencies": { "axios": "^1.8.4", "cheerio": "^1.0.0-rc.12", "deepmerge": "^4.3.1", "dotenv": "^16.4.5", "pino": "^8.6.1", "pino-pretty": "^9.1.1" } }, "sha512-D69K7y2BFc2sU+jums4nFIihSU9BiczIeHir5TSeRP9K/e1W6IXCHRLIeB1Of5RKG0wEH8pz8Bxv6S8Fv21tgg=="], + + "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], + + "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], + + "minimatch": ["minimatch@5.1.9", "", { "dependencies": { "brace-expansion": "^2.0.1" } }, "sha512-7o1wEA2RyMP7Iu7GNba9vc0RWWGACJOCZBJX2GJWip0ikV+wcOsgVuY9uE8CPiyQhkGFSlhuSkZPavN7u1c2Fw=="], + + "minimist": ["minimist@1.2.8", "", {}, "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + + "on-exit-leak-free": ["on-exit-leak-free@2.1.2", "", {}, "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA=="], + + "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="], + + "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], + + "parse5-htmlparser2-tree-adapter": ["parse5-htmlparser2-tree-adapter@7.1.0", "", 
{ "dependencies": { "domhandler": "^5.0.3", "parse5": "^7.0.0" } }, "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g=="], + + "parse5-parser-stream": ["parse5-parser-stream@7.1.2", "", { "dependencies": { "parse5": "^7.0.0" } }, "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow=="], + + "pino": ["pino@8.21.0", "", { "dependencies": { "atomic-sleep": "^1.0.0", "fast-redact": "^3.1.1", "on-exit-leak-free": "^2.1.0", "pino-abstract-transport": "^1.2.0", "pino-std-serializers": "^6.0.0", "process-warning": "^3.0.0", "quick-format-unescaped": "^4.0.3", "real-require": "^0.2.0", "safe-stable-stringify": "^2.3.1", "sonic-boom": "^3.7.0", "thread-stream": "^2.6.0" }, "bin": { "pino": "bin.js" } }, "sha512-ip4qdzjkAyDDZklUaZkcRFb2iA118H9SgRh8yzTkSQK8HilsOJF7rSY8HoW5+I0M46AZgX/pxbprf2vvzQCE0Q=="], + + "pino-abstract-transport": ["pino-abstract-transport@1.2.0", "", { "dependencies": { "readable-stream": "^4.0.0", "split2": "^4.0.0" } }, "sha512-Guhh8EZfPCfH+PMXAb6rKOjGQEoy0xlAIn+irODG5kgfYV+BQ0rGYYWTIel3P5mmyXqkYkPmdIkywsn6QKUR1Q=="], + + "pino-pretty": ["pino-pretty@9.4.1", "", { "dependencies": { "colorette": "^2.0.7", "dateformat": "^4.6.3", "fast-copy": "^3.0.0", "fast-safe-stringify": "^2.1.1", "help-me": "^4.0.1", "joycon": "^3.1.1", "minimist": "^1.2.6", "on-exit-leak-free": "^2.1.0", "pino-abstract-transport": "^1.0.0", "pump": "^3.0.0", "readable-stream": "^4.0.0", "secure-json-parse": "^2.4.0", "sonic-boom": "^3.0.0", "strip-json-comments": "^3.1.1" }, "bin": { "pino-pretty": "bin.js" } }, "sha512-loWr5SNawVycvY//hamIzyz3Fh5OSpvkcO13MwdDW+eKIGylobPLqnVGTDwDXkdmpJd1BhEG+qhDw09h6SqJiQ=="], + + "pino-std-serializers": ["pino-std-serializers@6.2.2", "", {}, "sha512-cHjPPsE+vhj/tnhCy/wiMh3M3z3h/j15zHQX+S9GkTBgqJuTuJzYJ4gUyACLhDaJ7kk9ba9iRDmbH2tJU03OiA=="], + + "process": ["process@0.11.10", "", {}, 
"sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A=="], + + "process-warning": ["process-warning@3.0.0", "", {}, "sha512-mqn0kFRl0EoqhnL0GQ0veqFHyIN1yig9RHh/InzORTUiZHFRAur+aMtRkELNwGs9aNwKS6tg/An4NYBPGwvtzQ=="], + + "proxy-from-env": ["proxy-from-env@2.1.0", "", {}, "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA=="], + + "pump": ["pump@3.0.4", "", { "dependencies": { "end-of-stream": "^1.1.0", "once": "^1.3.1" } }, "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA=="], + + "quick-format-unescaped": ["quick-format-unescaped@4.0.4", "", {}, "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg=="], + + "readable-stream": ["readable-stream@4.7.0", "", { "dependencies": { "abort-controller": "^3.0.0", "buffer": "^6.0.3", "events": "^3.3.0", "process": "^0.11.10", "string_decoder": "^1.3.0" } }, "sha512-oIGGmcpTLwPga8Bn6/Z75SVaH1z5dUut2ibSyAMVhmUggWpmDn2dapB0n7f8nwaSiRtepAsfJyfXIO5DCVAODg=="], + + "real-require": ["real-require@0.2.0", "", {}, "sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg=="], + + "safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="], + + "safe-stable-stringify": ["safe-stable-stringify@2.5.0", "", {}, "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA=="], + + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + + "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="], + + "sonic-boom": ["sonic-boom@3.8.1", "", { "dependencies": { "atomic-sleep": "^1.0.0" } }, 
"sha512-y4Z8LCDBuum+PBP3lSV7RHrXscqksve/bi0as7mhwVnBW+/wUqKT/2Kb7um8yqcFy0duYbbPxzt89Zy2nOCaxg=="], + + "split2": ["split2@4.2.0", "", {}, "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg=="], + + "string_decoder": ["string_decoder@1.3.0", "", { "dependencies": { "safe-buffer": "~5.2.0" } }, "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA=="], + + "strip-json-comments": ["strip-json-comments@3.1.1", "", {}, "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig=="], + + "thread-stream": ["thread-stream@2.7.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-qQiRWsU/wvNolI6tbbCKd9iKaTnCXsTwVxhhKM6nctPdujTyztjlbUkUTUymidWcMnZ5pWR0ej4a0tjsW021vw=="], + + "undici": ["undici@7.26.0", "", {}, "sha512-3O9Tf67pGhgOv9jM35AbhkXAKi13f3oy3aE4CSgr+TckGeY+/iu97ZXN+J7DpHPzLbVApFd1IFhcnBjREYXYcg=="], + + "util-deprecate": ["util-deprecate@1.0.2", "", {}, "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="], + + "whatwg-encoding": ["whatwg-encoding@3.1.1", "", { "dependencies": { "iconv-lite": "0.6.3" } }, "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ=="], + + "whatwg-mimetype": ["whatwg-mimetype@4.0.0", "", {}, "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg=="], + + "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], + + "help-me/readable-stream": ["readable-stream@3.6.2", "", { "dependencies": { "inherits": "^2.0.3", "string_decoder": "^1.1.1", "util-deprecate": "^1.0.1" } }, "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA=="], + + "htmlparser2/entities": ["entities@7.0.1", "", {}, 
"sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], + + "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + } +} diff --git a/crawler/lib/checkpoint.test.ts b/crawler/lib/checkpoint.test.ts new file mode 100644 index 00000000..51bb1638 --- /dev/null +++ b/crawler/lib/checkpoint.test.ts @@ -0,0 +1,15 @@ +import { test, expect } from 'bun:test'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { rmSync } from 'node:fs'; +import { readCheckpoint, writeCheckpoint } from './checkpoint'; + +type Cursor = { category: string; page: number }; + +test('round-trips a checkpoint and returns null when absent', async () => { + const path = join(tmpdir(), `crawler-cp-${process.pid}.json`); + rmSync(path, { force: true }); + expect(await readCheckpoint(path)).toBeNull(); + await writeCheckpoint(path, { category: 'snacks', page: 3 }); + expect(await readCheckpoint(path)).toEqual({ category: 'snacks', page: 3 }); +}); diff --git a/crawler/lib/checkpoint.ts b/crawler/lib/checkpoint.ts new file mode 100644 index 00000000..d771502d --- /dev/null +++ b/crawler/lib/checkpoint.ts @@ -0,0 +1,13 @@ +export async function readCheckpoint(path: string): Promise { + const f = Bun.file(path); + if (!(await f.exists())) return null; + try { + return JSON.parse(await f.text()) as T; + } catch { + return null; + } +} + +export async function writeCheckpoint(path: string, value: unknown): Promise { + await Bun.write(path, JSON.stringify(value)); +} diff --git a/crawler/lib/http.test.ts b/crawler/lib/http.test.ts new file mode 100644 index 00000000..05b702cb --- /dev/null +++ b/crawler/lib/http.test.ts @@ -0,0 +1,58 @@ +import { test, expect } from 'bun:test'; +import { createPoliteClient } from './http'; + +test('retries on failure then succeeds, applying backoff via injected sleep', async () => { + let calls = 0; + const sleeps: 
number[] = []; + const client = createPoliteClient({ + minDelayMs: 50, + maxRetries: 3, + now: () => 0, + sleep: async (ms) => { + sleeps.push(ms); + }, + fetchImpl: async () => { + calls++; + if (calls < 3) throw new Error('boom'); + return new Response(JSON.stringify({ ok: true }), { status: 200 }); + } + }); + const res = await client.getJson<{ ok: boolean }>('https://x.test/a'); + expect(res?.ok).toBe(true); + expect(calls).toBe(3); + expect(sleeps.filter((s) => s > 0).length).toBeGreaterThanOrEqual(2); // two backoff sleeps +}); + +test('returns null on a 404 without retrying', async () => { + let calls = 0; + const client = createPoliteClient({ + minDelayMs: 0, + maxRetries: 3, + sleep: async () => {}, + fetchImpl: async () => { + calls++; + return new Response('nope', { status: 404 }); + } + }); + expect(await client.getJson('https://x.test/missing')).toBeNull(); + expect(calls).toBe(1); +}); + +test('caches responses by url when a cache is provided', async () => { + let calls = 0; + const store = new Map(); + const client = createPoliteClient({ + minDelayMs: 0, + maxRetries: 1, + sleep: async () => {}, + cache: { get: async (k) => store.get(k) ?? 
null, set: async (k, v) => void store.set(k, v) }, + fetchImpl: async () => { + calls++; + return new Response(JSON.stringify({ n: calls }), { status: 200 }); + } + }); + const a = await client.getJson<{ n: number }>('https://x.test/c'); + const b = await client.getJson<{ n: number }>('https://x.test/c'); + expect(a).toEqual(b); + expect(calls).toBe(1); +}); diff --git a/crawler/lib/http.ts b/crawler/lib/http.ts new file mode 100644 index 00000000..c1fc9798 --- /dev/null +++ b/crawler/lib/http.ts @@ -0,0 +1,57 @@ +export type CacheLike = { + get: (key: string) => Promise; + set: (key: string, value: string) => Promise; +}; + +export type PoliteClientOpts = { + minDelayMs?: number; + maxRetries?: number; + userAgent?: string; + sleep?: (ms: number) => Promise; + fetchImpl?: (url: string, init?: RequestInit) => Promise; + cache?: CacheLike; + now?: () => number; +}; + +const defaultSleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +export function createPoliteClient(opts: PoliteClientOpts = {}) { + const minDelayMs = opts.minDelayMs ?? 500; + const maxRetries = opts.maxRetries ?? 4; + const sleep = opts.sleep ?? defaultSleep; + const doFetch = opts.fetchImpl ?? fetch; + const now = opts.now ?? Date.now; + const ua = opts.userAgent ?? 
'Bissbilanz-Catalog-Crawler/1.0 (+private use; non-redistribution)'; + let lastAt = 0; + + async function throttle() { + const wait = Math.max(0, lastAt + minDelayMs - now()); + if (wait > 0) await sleep(wait); + lastAt = now(); + } + + async function getJson(url: string, headers: Record = {}): Promise { + if (opts.cache) { + const hit = await opts.cache.get(url); + if (hit != null) return JSON.parse(hit) as T; + } + let attempt = 0; + for (;;) { + await throttle(); + try { + const res = await doFetch(url, { headers: { 'User-Agent': ua, ...headers } }); + if (res.status === 404 || res.status === 410) return null; + if (!res.ok) throw new Error(`HTTP ${res.status}`); + const text = await res.text(); + if (opts.cache) await opts.cache.set(url, text); + return JSON.parse(text) as T; + } catch (err) { + attempt++; + if (attempt >= maxRetries) throw err; + await sleep(minDelayMs * 2 ** attempt); + } + } + } + + return { getJson }; +} diff --git a/crawler/lib/jsonl-stream.test.ts b/crawler/lib/jsonl-stream.test.ts new file mode 100644 index 00000000..78965f9c --- /dev/null +++ b/crawler/lib/jsonl-stream.test.ts @@ -0,0 +1,13 @@ +import { test, expect } from 'bun:test'; +import { splitJsonlLines } from './jsonl-stream'; + +async function* chunks(parts: string[]) { + for (const p of parts) yield new TextEncoder().encode(p); +} + +test('splits a byte stream into lines across chunk boundaries, skipping blanks', async () => { + const out: string[] = []; + for await (const line of splitJsonlLines(chunks(['{"a":1}\n{"b":', '2}\n\n{"c":3}']))) + out.push(line); + expect(out).toEqual(['{"a":1}', '{"b":2}', '{"c":3}']); +}); diff --git a/crawler/lib/jsonl-stream.ts b/crawler/lib/jsonl-stream.ts new file mode 100644 index 00000000..a81ebe8a --- /dev/null +++ b/crawler/lib/jsonl-stream.ts @@ -0,0 +1,26 @@ +export async function* splitJsonlLines(source: AsyncIterable): AsyncIterable { + const decoder = new TextDecoder(); + let buf = ''; + for await (const chunk of source) { + buf += 
decoder.decode(chunk, { stream: true }); + let nl: number; + while ((nl = buf.indexOf('\n')) >= 0) { + const line = buf.slice(0, nl).trim(); + buf = buf.slice(nl + 1); + if (line.length > 0) yield line; + } + } + const last = buf.trim(); + if (last.length > 0) yield last; +} + +export async function* readDumpLines(path: string): AsyncIterable { + const file = Bun.file(path); + let stream: ReadableStream = file.stream(); + if (path.endsWith('.gz')) { + stream = stream.pipeThrough( + new DecompressionStream('gzip') as unknown as ReadableWritablePair + ); + } + yield* splitJsonlLines(stream as unknown as AsyncIterable); +} diff --git a/crawler/lib/jsonl-writer.test.ts b/crawler/lib/jsonl-writer.test.ts new file mode 100644 index 00000000..5cee1f98 --- /dev/null +++ b/crawler/lib/jsonl-writer.test.ts @@ -0,0 +1,48 @@ +import { test, expect } from 'bun:test'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { DatasetWriter } from './jsonl-writer'; +import { datasetHeaderSchema, datasetProductSchema } from '$lib/server/catalog/dataset-schema'; + +function tmpFile(name: string) { + return join(tmpdir(), `crawler-test-${name}-${process.pid}.jsonl`); +} + +test('writes a header line then product lines, all schema-valid, with a correct count', async () => { + const path = tmpFile('write'); + const w = new DatasetWriter(path, { + key: 'off-ch', + name: 'OFF (CH)', + source: 'off', + priority: 20 + }); + await w.open(); + await w.write({ + name: 'A', + servingSize: 100, + servingUnit: 'g', + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: 1 + }); + await w.write({ + name: 'B', + servingSize: 100, + servingUnit: 'g', + calories: 2, + protein: 2, + carbs: 2, + fat: 2, + fiber: 2 + }); + const count = await w.close(); + expect(count).toBe(2); + + const lines = (await Bun.file(path).text()).trim().split('\n'); + expect(lines.length).toBe(3); + expect(datasetHeaderSchema.safeParse(JSON.parse(lines[0])).success).toBe(true); + 
expect(datasetProductSchema.safeParse(JSON.parse(lines[1])).success).toBe(true); + expect(JSON.parse(lines[0])._dataset.source).toBe('off'); +}); diff --git a/crawler/lib/jsonl-writer.ts b/crawler/lib/jsonl-writer.ts new file mode 100644 index 00000000..7c43e701 --- /dev/null +++ b/crawler/lib/jsonl-writer.ts @@ -0,0 +1,53 @@ +import { datasetProductSchema, type DatasetProduct } from '$lib/server/catalog/dataset-schema'; + +export type DatasetHeaderInput = { + key: string; + name: string; + source: 'migros' | 'off' | 'coop'; + priority: number; + version?: string; +}; + +export class DatasetWriter { + #path: string; + #header: DatasetHeaderInput; + #snapshotAt: string; + #sink: Bun.FileSink | null = null; + #count = 0; + + constructor(path: string, header: DatasetHeaderInput, snapshotAt = new Date().toISOString()) { + this.#path = path; + this.#header = header; + this.#snapshotAt = snapshotAt; + } + + async open(): Promise { + this.#sink = Bun.file(this.#path).writer(); + const headerLine = JSON.stringify({ + _dataset: { + key: this.#header.key, + name: this.#header.name, + source: this.#header.source, + priority: this.#header.priority, + version: this.#header.version ?? 
null, + snapshotAt: this.#snapshotAt + } + }); + this.#sink.write(headerLine + '\n'); + } + + async write(product: DatasetProduct): Promise { + if (!this.#sink) throw new Error('DatasetWriter.open() not called'); + // fail-closed: never write a line the importer would reject + const parsed = datasetProductSchema.safeParse(product); + if (!parsed.success) throw new Error(`invalid product: ${parsed.error.issues[0]?.message}`); + this.#sink.write(JSON.stringify(product) + '\n'); + this.#count++; + } + + async close(): Promise { + if (this.#sink) await this.#sink.end(); + this.#sink = null; + return this.#count; + } +} diff --git a/crawler/lib/normalize.test.ts b/crawler/lib/normalize.test.ts new file mode 100644 index 00000000..e234e49c --- /dev/null +++ b/crawler/lib/normalize.test.ts @@ -0,0 +1,51 @@ +import { test, expect } from 'bun:test'; +import { buildDatasetProduct } from './normalize'; + +const core = { calories: 515, protein: 5.8, carbs: 53, fat: 30, fiber: 5.6 }; +const meta = { name: 'Zweifel Paprika Chips', servingSize: 100, servingUnit: 'g' as const }; + +test('builds a valid product from core macros + nutrients', () => { + const r = buildDatasetProduct({ ...meta, ...core, nutrients: { saturatedFat: 1.8, salt: 1.3 } }); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.name).toBe('Zweifel Paprika Chips'); + expect(r.product.saturatedFat).toBe(1.8); + expect(r.product.fiber).toBe(5.6); + } +}); + +test('drops a product missing a core macro with reason', () => { + const r = buildDatasetProduct({ + ...meta, + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: null + }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('fiber'); +}); + +test('drops a product with a negative macro', () => { + const r = buildDatasetProduct({ ...meta, ...core, calories: -1 }); + expect(r.ok).toBe(false); +}); + +test('passes through optional quality fields', () => { + const r = buildDatasetProduct({ + ...meta, + ...core, + barcode: 
'7610095131003', + nutriScore: 'd', + novaGroup: 4, + additives: ['en:e330'], + sourceUrl: 'https://example.com/p/1', + sourceRef: '1' + }); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.barcode).toBe('7610095131003'); + expect(r.product.nutriScore).toBe('d'); + } +}); diff --git a/crawler/lib/normalize.ts b/crawler/lib/normalize.ts new file mode 100644 index 00000000..eee6fa8d --- /dev/null +++ b/crawler/lib/normalize.ts @@ -0,0 +1,71 @@ +import { datasetProductSchema } from '$lib/server/catalog/dataset-schema'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; +import type { ServingUnit } from '$lib/units'; +import type { BuildResult } from '../types'; + +export type NormalizerInput = { + name: string; + brand?: string | null; + language?: 'de' | 'fr' | 'it' | 'en' | null; + servingSize: number; + servingUnit: ServingUnit; + calories: number | null; + protein: number | null; + carbs: number | null; + fat: number | null; + fiber: number | null; + nutrients?: Record; + barcode?: string | null; + nutriScore?: 'a' | 'b' | 'c' | 'd' | 'e' | null; + novaGroup?: number | null; + additives?: string[] | null; + ingredientsText?: string | null; + imageUrl?: string | null; + sourceUrl?: string | null; + sourceRef?: string | null; + crawledAt?: string | null; +}; + +const CORE = ['calories', 'protein', 'carbs', 'fat', 'fiber'] as const; + +export function buildDatasetProduct(input: NormalizerInput): BuildResult { + for (const k of CORE) { + const v = input[k]; + if (v == null || Number.isNaN(v)) return { ok: false, reason: `missing-core:${k}` }; + } + + const nutrients: Record = {}; + for (const key of ALL_NUTRIENT_KEYS) { + const v = input.nutrients?.[key]; + nutrients[key] = v == null || Number.isNaN(v) ? null : v; + } + + const candidate = { + name: input.name, + brand: input.brand ?? null, + language: input.language ?? 
null, + servingSize: input.servingSize, + servingUnit: input.servingUnit, + calories: input.calories, + protein: input.protein, + carbs: input.carbs, + fat: input.fat, + fiber: input.fiber, + ...nutrients, + barcode: input.barcode ?? null, + nutriScore: input.nutriScore ?? null, + novaGroup: input.novaGroup ?? null, + additives: input.additives ?? null, + ingredientsText: input.ingredientsText ?? null, + imageUrl: input.imageUrl ?? null, + sourceUrl: input.sourceUrl ?? null, + sourceRef: input.sourceRef ?? null, + crawledAt: input.crawledAt ?? null + }; + + const parsed = datasetProductSchema.safeParse(candidate); + if (!parsed.success) { + return { ok: false, reason: `schema:${parsed.error.issues[0]?.path.join('.') || 'invalid'}` }; + } + return { ok: true, product: parsed.data }; +} diff --git a/crawler/lib/smoke.test.ts b/crawler/lib/smoke.test.ts new file mode 100644 index 00000000..d23fe2ed --- /dev/null +++ b/crawler/lib/smoke.test.ts @@ -0,0 +1,22 @@ +import { test, expect } from 'bun:test'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; +import { extractAllNutrients } from '$lib/server/nutrient-extract'; +import { datasetProductSchema } from '$lib/server/catalog/dataset-schema'; + +test('shared $lib modules resolve and work from the crawler package', () => { + expect(ALL_NUTRIENT_KEYS.length).toBe(43); + const out = extractAllNutrients({ 'saturated-fat_100g': 1.8, sodium_100g: 0.5 }); + expect(out.saturatedFat).toBe(1.8); + expect(out.sodium).toBe(500); // g→mg conversion + const r = datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: 1 + }); + expect(r.success).toBe(true); +}); diff --git a/crawler/package.json b/crawler/package.json new file mode 100644 index 00000000..8673901a --- /dev/null +++ b/crawler/package.json @@ -0,0 +1,14 @@ +{ + "name": "@bissbilanz/crawler", + "private": true, + "type": "module", + "description": "Offline crawler that builds 
Bissbilanz catalog datasets (not part of the app build).", + "scripts": { + "test": "bun test", + "check": "tsc --noEmit", + "crawl": "bun run index.ts" + }, + "dependencies": { + "migros-api-wrapper": "1.1.37" + } +} diff --git a/crawler/tsconfig.json b/crawler/tsconfig.json new file mode 100644 index 00000000..41d2e0a0 --- /dev/null +++ b/crawler/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "types": ["bun-types"], + "strict": true, + "skipLibCheck": true, + "noEmit": true, + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "resolveJsonModule": true, + "paths": { + "$lib": ["../src/lib"], + "$lib/*": ["../src/lib/*"] + } + }, + "include": ["**/*.ts"] +} diff --git a/crawler/types.test.ts b/crawler/types.test.ts new file mode 100644 index 00000000..c312916a --- /dev/null +++ b/crawler/types.test.ts @@ -0,0 +1,16 @@ +import { test, expect } from 'bun:test'; +import { NUTRIENT_KEYS, recordDrop, newStats } from './types'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; + +test('NUTRIENT_KEYS matches the app ALL_NUTRIENT_KEYS exactly (drift guard)', () => { + expect(([...NUTRIENT_KEYS] as string[]).sort()).toEqual([...ALL_NUTRIENT_KEYS].sort()); +}); + +test('recordDrop buckets reasons by prefix before the colon', () => { + const s = newStats(); + recordDrop(s, 'dup:id'); + recordDrop(s, 'dup:barcode'); + recordDrop(s, 'not-swiss'); + expect(s.dropped).toBe(3); + expect(s.dropReasons).toEqual({ dup: 2, 'not-swiss': 1 }); +}); diff --git a/crawler/types.ts b/crawler/types.ts new file mode 100644 index 00000000..bdd1352d --- /dev/null +++ b/crawler/types.ts @@ -0,0 +1,77 @@ +import type { DatasetProduct as SchemaDatasetProduct } from '$lib/server/catalog/dataset-schema'; + +/** + * The 43 extended-nutrient keys. 
`DatasetProduct` from the shared Zod schema loses these + * (its `z.infer` is built via `Object.fromEntries`, so the keys vanish from the static type). + * We re-attach them here for typed nutrient access in the crawler. `types.test.ts` guards this + * list against `ALL_NUTRIENT_KEYS` (the app's single source of truth) so it can never drift. + */ +export const NUTRIENT_KEYS = [ + 'saturatedFat', + 'monounsaturatedFat', + 'polyunsaturatedFat', + 'transFat', + 'cholesterol', + 'omega3', + 'omega6', + 'sugar', + 'addedSugars', + 'sugarAlcohols', + 'starch', + 'sodium', + 'potassium', + 'calcium', + 'iron', + 'magnesium', + 'phosphorus', + 'zinc', + 'copper', + 'manganese', + 'selenium', + 'iodine', + 'fluoride', + 'chromium', + 'molybdenum', + 'chloride', + 'vitaminA', + 'vitaminC', + 'vitaminD', + 'vitaminE', + 'vitaminK', + 'vitaminB1', + 'vitaminB2', + 'vitaminB3', + 'vitaminB5', + 'vitaminB6', + 'vitaminB7', + 'vitaminB9', + 'vitaminB12', + 'caffeine', + 'alcohol', + 'water', + 'salt' +] as const; + +export type NutrientKey = (typeof NUTRIENT_KEYS)[number]; + +/** Dataset product with the extended-nutrient keys typed (see NUTRIENT_KEYS). */ +export type DatasetProduct = SchemaDatasetProduct & Partial>; + +export type BuildResult = { ok: true; product: DatasetProduct } | { ok: false; reason: string }; + +export type CrawlStats = { + seen: number; + emitted: number; + dropped: number; + dropReasons: Record; +}; + +export function newStats(): CrawlStats { + return { seen: 0, emitted: 0, dropped: 0, dropReasons: {} }; +} + +export function recordDrop(stats: CrawlStats, reason: string): void { + stats.dropped++; + const key = reason.split(':')[0]; + stats.dropReasons[key] = (stats.dropReasons[key] ?? 
0) + 1; +} From d2bb95b1c1efe254582f331ac656eea3ee95c8fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 01:16:02 +0200 Subject: [PATCH 20/26] feat(crawler): OFF bulk-dump adapter (Swiss/food filter + streaming crawl) --- crawler/adapters/off/crawl-off.test.ts | 28 +++++++++ crawler/adapters/off/crawl-off.ts | 38 +++++++++++ crawler/adapters/off/normalize-off.test.ts | 73 ++++++++++++++++++++++ crawler/adapters/off/normalize-off.ts | 57 +++++++++++++++++ crawler/adapters/off/types.ts | 17 +++++ crawler/fixtures/off-sample.jsonl | 6 ++ 6 files changed, 219 insertions(+) create mode 100644 crawler/adapters/off/crawl-off.test.ts create mode 100644 crawler/adapters/off/crawl-off.ts create mode 100644 crawler/adapters/off/normalize-off.test.ts create mode 100644 crawler/adapters/off/normalize-off.ts create mode 100644 crawler/adapters/off/types.ts create mode 100644 crawler/fixtures/off-sample.jsonl diff --git a/crawler/adapters/off/crawl-off.test.ts b/crawler/adapters/off/crawl-off.test.ts new file mode 100644 index 00000000..78c5f8e2 --- /dev/null +++ b/crawler/adapters/off/crawl-off.test.ts @@ -0,0 +1,28 @@ +import { test, expect } from 'bun:test'; +import { join } from 'node:path'; +import { readDumpLines } from '../../lib/jsonl-stream'; +import { crawlOffDump } from './crawl-off'; +import { newStats } from '../../types'; + +const FIXTURE = join(import.meta.dir, '../../fixtures/off-sample.jsonl'); + +test('crawlOffDump emits only valid Swiss products from the fixture dump', async () => { + const stats = newStats(); + const names: string[] = []; + for await (const product of crawlOffDump(readDumpLines(FIXTURE), { stats })) + names.push(product.name); + expect(stats.seen).toBe(6); + expect(stats.emitted).toBe(2); + expect(stats.dropped).toBe(4); + expect(names).toContain('Zweifel Paprika Chips'); + expect(stats.dropReasons['not-swiss']).toBe(1); + expect(stats.dropReasons['no-barcode']).toBe(1); + 
expect(stats.dropReasons['no-name']).toBe(1); + expect(stats.dropReasons['missing-core']).toBe(1); +}); + +test('crawlOffDump respects the limit option', async () => { + const out = []; + for await (const p of crawlOffDump(readDumpLines(FIXTURE), { limit: 1 })) out.push(p); + expect(out.length).toBe(1); +}); diff --git a/crawler/adapters/off/crawl-off.ts b/crawler/adapters/off/crawl-off.ts new file mode 100644 index 00000000..963ca280 --- /dev/null +++ b/crawler/adapters/off/crawl-off.ts @@ -0,0 +1,38 @@ +import type { DatasetProduct, CrawlStats } from '../../types'; +import { newStats, recordDrop } from '../../types'; +import { offDumpToDataset } from './normalize-off'; +import type { OffDumpProduct } from './types'; + +export type OffCrawlOpts = { + limit?: number; + stats?: CrawlStats; + crawledAt?: string; + onProgress?: (stats: CrawlStats) => void; +}; + +export async function* crawlOffDump( + lines: AsyncIterable, + opts: OffCrawlOpts = {} +): AsyncIterable { + const stats = opts.stats ?? newStats(); + const crawledAt = opts.crawledAt ?? 
new Date().toISOString(); + for await (const line of lines) { + stats.seen++; + let raw: OffDumpProduct; + try { + raw = JSON.parse(line); + } catch { + recordDrop(stats, 'bad-json'); + continue; + } + const r = offDumpToDataset(raw, crawledAt); + if (!r.ok) { + recordDrop(stats, r.reason); + continue; + } + stats.emitted++; + if (opts.onProgress && stats.seen % 10000 === 0) opts.onProgress(stats); + yield r.product; + if (opts.limit && stats.emitted >= opts.limit) return; + } +} diff --git a/crawler/adapters/off/normalize-off.test.ts b/crawler/adapters/off/normalize-off.test.ts new file mode 100644 index 00000000..157d5a13 --- /dev/null +++ b/crawler/adapters/off/normalize-off.test.ts @@ -0,0 +1,73 @@ +import { test, expect } from 'bun:test'; +import { offDumpToDataset } from './normalize-off'; + +const swissFull = { + code: '7610095131003', + product_name: 'Zweifel Paprika Chips', + brands: 'Zweifel', + lang: 'de', + countries_tags: ['en:switzerland'], + nutriscore_grade: 'd', + nova_group: 4, + nutriments: { + 'energy-kcal_100g': 515, + proteins_100g: 5.8, + carbohydrates_100g: 53, + fat_100g: 30, + fiber_100g: 5.6, + 'saturated-fat_100g': 1.8, + salt_100g: 1.3 + } +}; + +test('maps a full Swiss product to a valid dataset product', () => { + const r = offDumpToDataset(swissFull); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.name).toBe('Zweifel Paprika Chips'); + expect(r.product.barcode).toBe('7610095131003'); + expect(r.product.calories).toBe(515); + expect(r.product.saturatedFat).toBe(1.8); + expect(r.product.salt).toBe(1.3); + expect(r.product.nutriScore).toBe('d'); + expect(r.product.sourceUrl).toContain('7610095131003'); + } +}); + +test('rejects a non-Swiss product', () => { + const r = offDumpToDataset({ ...swissFull, countries_tags: ['en:france'] }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('not-swiss'); +}); + +test('rejects a product with no barcode or no name', () => { + expect(offDumpToDataset({ ...swissFull, 
code: '' }).ok).toBe(false); + expect(offDumpToDataset({ ...swissFull, product_name: '' }).ok).toBe(false); +}); + +test('drops a product missing a core macro', () => { + const n = { ...swissFull.nutriments } as Record; + delete n['fiber_100g']; + const r = offDumpToDataset({ ...swissFull, nutriments: n }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('fiber'); +}); + +test('derives kcal from kJ when energy-kcal is absent', () => { + const n = { ...swissFull.nutriments } as Record; + delete n['energy-kcal_100g']; + n['energy-kj_100g'] = 2155; // ~515 kcal + const r = offDumpToDataset({ ...swissFull, nutriments: n }); + expect(r.ok).toBe(true); + if (r.ok) expect(Math.round(r.product.calories)).toBe(515); +}); + +test('prefers product_name_de when present', () => { + const r = offDumpToDataset({ + ...swissFull, + product_name: 'Paprika Chips', + product_name_de: 'Paprika Chips DE' + }); + expect(r.ok).toBe(true); + if (r.ok) expect(r.product.name).toBe('Paprika Chips DE'); +}); diff --git a/crawler/adapters/off/normalize-off.ts b/crawler/adapters/off/normalize-off.ts new file mode 100644 index 00000000..4837b12c --- /dev/null +++ b/crawler/adapters/off/normalize-off.ts @@ -0,0 +1,57 @@ +import { extractAllNutrients } from '$lib/server/nutrient-extract'; +import { buildDatasetProduct } from '../../lib/normalize'; +import type { BuildResult } from '../../types'; +import type { OffDumpProduct } from './types'; + +const KJ_PER_KCAL = 4.184; +const NUTRISCORE = new Set(['a', 'b', 'c', 'd', 'e']); + +function num(v: number | string | undefined): number | null { + if (v == null) return null; + const n = typeof v === 'string' ? parseFloat(v) : v; + return Number.isNaN(n) ? null : n; +} + +export function offDumpToDataset(p: OffDumpProduct, crawledAt?: string): BuildResult { + const code = (p.code ?? 
'').trim(); + if (!code) return { ok: false, reason: 'no-barcode' }; + const name = (p.product_name_de || p.product_name || '').trim(); + if (!name) return { ok: false, reason: 'no-name' }; + if (!(p.countries_tags ?? []).includes('en:switzerland')) + return { ok: false, reason: 'not-swiss' }; + + const n = (p.nutriments ?? {}) as Record; + let calories = num(n['energy-kcal_100g']); + if (calories == null) { + const kj = num(n['energy-kj_100g']); + if (kj != null) calories = Math.round((kj / KJ_PER_KCAL) * 100) / 100; + } + + const grade = (p.nutriscore_grade ?? '').toLowerCase(); + const nova = num(p.nova_group); + const additives = (p.additives_tags ?? []).slice(0, 200); + const ingredients = (p.ingredients_text_de || p.ingredients_text || '').slice(0, 10000); + + return buildDatasetProduct({ + name: name.slice(0, 500), + brand: (p.brands ?? '').split(',')[0]?.trim() || null, + language: 'de', + servingSize: 100, + servingUnit: 'g', + calories, + protein: num(n['proteins_100g']), + carbs: num(n['carbohydrates_100g']), + fat: num(n['fat_100g']), + fiber: num(n['fiber_100g']), + nutrients: extractAllNutrients(n), + barcode: code.slice(0, 32), + nutriScore: NUTRISCORE.has(grade) ? (grade as 'a' | 'b' | 'c' | 'd' | 'e') : null, + novaGroup: nova != null && nova >= 1 && nova <= 4 ? Math.round(nova) : null, + additives: additives.length > 0 ? additives : null, + ingredientsText: ingredients.length > 0 ? ingredients : null, + imageUrl: p.image_front_url || p.image_url || null, + sourceUrl: `https://world.openfoodfacts.org/product/${code}`, + sourceRef: code, + crawledAt: crawledAt ?? 
null + }); +} diff --git a/crawler/adapters/off/types.ts b/crawler/adapters/off/types.ts new file mode 100644 index 00000000..dac48804 --- /dev/null +++ b/crawler/adapters/off/types.ts @@ -0,0 +1,17 @@ +export type OffDumpProduct = { + code?: string; + product_name?: string; + product_name_de?: string; + brands?: string; + lang?: string; + lc?: string; + countries_tags?: string[]; + nutriscore_grade?: string; + nova_group?: number | string; + additives_tags?: string[]; + ingredients_text?: string; + ingredients_text_de?: string; + image_url?: string; + image_front_url?: string; + nutriments?: Record; +}; diff --git a/crawler/fixtures/off-sample.jsonl b/crawler/fixtures/off-sample.jsonl new file mode 100644 index 00000000..2f0f9180 --- /dev/null +++ b/crawler/fixtures/off-sample.jsonl @@ -0,0 +1,6 @@ +{"code":"7610095131003","product_name":"Zweifel Paprika Chips","brands":"Zweifel","lang":"de","countries_tags":["en:switzerland"],"nutriscore_grade":"d","nova_group":4,"nutriments":{"energy-kcal_100g":515,"proteins_100g":5.8,"carbohydrates_100g":53,"fat_100g":30,"fiber_100g":5.6,"saturated-fat_100g":1.8,"salt_100g":1.3}} +{"code":"7610095999999","product_name":"Bio Apfelsaft","brands":"Coop","lang":"de","countries_tags":["en:switzerland","en:france"],"nutriments":{"energy-kj_100g":192,"proteins_100g":0.2,"carbohydrates_100g":11,"fat_100g":0.1,"fiber_100g":0.2}} +{"code":"3017620422003","product_name":"Nutella","brands":"Ferrero","lang":"fr","countries_tags":["en:france"],"nutriments":{"energy-kcal_100g":539,"proteins_100g":6.3,"carbohydrates_100g":57,"fat_100g":30.9,"fiber_100g":0}} +{"code":"7610095000001","product_name":"Mystery Item","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1}} +{"code":"","product_name":"No Barcode","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1,"fiber_100g":1}} 
+{"code":"7610095000002","product_name":"","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1,"fiber_100g":1}} From 7c636c87288bb68f63095455b0641d0e1726fba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 01:16:26 +0200 Subject: [PATCH 21/26] feat(crawler): Migros adapter (normalizer, crawl loop, migros-api-wrapper client) --- crawler/adapters/migros/client.test.ts | 39 +++++ crawler/adapters/migros/client.ts | 138 ++++++++++++++++++ crawler/adapters/migros/crawl-migros.test.ts | 61 ++++++++ crawler/adapters/migros/crawl-migros.ts | 58 ++++++++ .../adapters/migros/normalize-migros.test.ts | 70 +++++++++ crawler/adapters/migros/normalize-migros.ts | 50 +++++++ crawler/adapters/migros/types.ts | 32 ++++ crawler/fixtures/migros-product-detail.json | 21 +++ 8 files changed, 469 insertions(+) create mode 100644 crawler/adapters/migros/client.test.ts create mode 100644 crawler/adapters/migros/client.ts create mode 100644 crawler/adapters/migros/crawl-migros.test.ts create mode 100644 crawler/adapters/migros/crawl-migros.ts create mode 100644 crawler/adapters/migros/normalize-migros.test.ts create mode 100644 crawler/adapters/migros/normalize-migros.ts create mode 100644 crawler/adapters/migros/types.ts create mode 100644 crawler/fixtures/migros-product-detail.json diff --git a/crawler/adapters/migros/client.test.ts b/crawler/adapters/migros/client.test.ts new file mode 100644 index 00000000..3f6b1064 --- /dev/null +++ b/crawler/adapters/migros/client.test.ts @@ -0,0 +1,39 @@ +import { test, expect } from 'bun:test'; +import { join } from 'node:path'; +import { mapProductDetail, extractProductIds, pickDetail } from './client'; + +test('mapProductDetail reduces a Migros API product-detail to MigrosProductDetail', async () => { + const raw = await Bun.file( + join(import.meta.dir, '../../fixtures/migros-product-detail.json') + ).json(); + const d = 
mapProductDetail(raw); + expect(d).not.toBeNull(); + expect(d!.id).toBe('100001'); + expect(d!.name).toBe('M-Classic Vollmilch UHT'); + expect(d!.gtins).toEqual(['7610200000001']); + expect(d!.productUrl).toContain('100001'); + expect(d!.nutrition.basis).toBe('100g'); + expect(d!.nutrition.energyKcal).toBe(64); + expect(d!.nutrition.sugar).toBe(4.8); + expect(d!.nutrition.saturatedFat).toBe(2.1); + expect(d!.nutrition.salt).toBe(0.1); +}); + +test('mapProductDetail returns null when id or name is missing', () => { + expect(mapProductDetail({ name: 'no id' })).toBeNull(); + expect(mapProductDetail({ productId: '1' })).toBeNull(); +}); + +test('extractProductIds reads productIds or products[].id/uid', () => { + expect(extractProductIds({ productIds: ['a', 'b'] })).toEqual(['a', 'b']); + expect(extractProductIds({ products: [{ id: 'x' }, { uid: 'y' }] })).toEqual(['x', 'y']); + expect(extractProductIds({})).toEqual([]); + expect(extractProductIds(null)).toEqual([]); +}); + +test('pickDetail selects a single product from array/products/object shapes', () => { + expect(pickDetail([{ productId: '1' }])?.productId).toBe('1'); + expect(pickDetail({ products: [{ productId: '2' }] })?.productId).toBe('2'); + expect(pickDetail({ productId: '3' })?.productId).toBe('3'); + expect(pickDetail(null)).toBeNull(); +}); diff --git a/crawler/adapters/migros/client.ts b/crawler/adapters/migros/client.ts new file mode 100644 index 00000000..76d28ef9 --- /dev/null +++ b/crawler/adapters/migros/client.ts @@ -0,0 +1,138 @@ +import type { MigrosClient, MigrosProductDetail, MigrosNutrition } from './types'; + +type RawNutrientValue = { code?: string; value?: number | string }; +type RawProductDetail = { + productId?: string; + name?: string; + brand?: string; + gtins?: string[]; + productUrls?: Record; + image?: { original?: string }; + ingredients?: string; + nutrients?: { referenceValue?: string; values?: RawNutrientValue[] }; +}; + +type MigrosNumericKey = Exclude; + +const NUTRIENT_CODE: 
Record = { + energy_kcal: 'energyKcal', + protein: 'protein', + carbohydrate: 'carbohydrate', + of_which_sugars: 'sugar', + fat: 'fat', + of_which_saturated: 'saturatedFat', + dietary_fiber: 'fiber', + salt: 'salt' +}; + +function num(v: number | string | undefined): number | null { + if (v == null) return null; + const n = typeof v === 'string' ? parseFloat(v) : v; + return Number.isNaN(n) ? null : n; +} + +export function mapProductDetail(raw: RawProductDetail): MigrosProductDetail | null { + const id = raw.productId; + const name = raw.name; + if (!id || !name) return null; + const nutrition: MigrosNutrition = { basis: raw.nutrients?.referenceValue ?? '100g' }; + for (const entry of raw.nutrients?.values ?? []) { + const key = entry.code ? NUTRIENT_CODE[entry.code] : undefined; + if (key) nutrition[key] = num(entry.value); + } + return { + id, + name, + brand: raw.brand ?? null, + gtins: (raw.gtins ?? []).filter((g) => !!g), + productUrl: raw.productUrls?.de ?? Object.values(raw.productUrls ?? {})[0] ?? null, + imageUrl: raw.image?.original ?? null, + ingredients: raw.ingredients ?? null, + nutrition + }; +} + +export type MigrosClientConfig = { + /** Food category search terms or category ids to page through (host-confirmed). */ + categories: string[]; + pageSize?: number; + maxPagesPerCategory?: number; +}; + +/** Best-effort extraction of product ids from a (loosely-typed) search response. */ +export function extractProductIds(res: unknown): string[] { + const r = res as { productIds?: string[]; products?: Array<{ id?: string; uid?: string }> }; + if (Array.isArray(r?.productIds)) return r.productIds.filter((id): id is string => !!id); + if (Array.isArray(r?.products)) { + return r.products.map((p) => p.id ?? p.uid).filter((id): id is string => !!id); + } + return []; +} + +/** Best-effort selection of the single product object from a product-detail response. 
*/ +export function pickDetail(res: unknown): RawProductDetail | null { + if (!res) return null; + if (Array.isArray(res)) return (res[0] as RawProductDetail) ?? null; + const r = res as { products?: RawProductDetail[] }; + if (Array.isArray(r.products)) return r.products[0] ?? null; + return res as RawProductDetail; +} + +/** + * Live client backed by `migros-api-wrapper` (`MigrosAPI`: guest token → product search → + * product-detail). NOT unit-tested — no live network in CI (spec §12). The dependency is + * imported dynamically so the tested core type-checks/runs without loading axios/cheerio/pino. + * + * The wrapper's instance methods return `any` and some option types are inconsistent, so the + * call boundary is navigated through a narrow facade. The exact category ids/pagination params + * and the product-detail response field paths consumed by `mapProductDetail`/`extractProductIds` + * are verified against a live response on the server host during the first crawl (spec §13). + */ +export async function createMigrosClient(config: MigrosClientConfig): Promise { + const { MigrosAPI } = await import('migros-api-wrapper'); + const api = new MigrosAPI(); + // Guest token — public product data needs no login. + const token = (await api.account.oauth2.loginGuestToken()) as string; + + const products = api.products as unknown as { + productSearch: { + searchProduct: ( + body: { query: string; [k: string]: unknown }, + options?: Record, + token?: string + ) => Promise; + }; + productDisplay: { + getProductDetails: ( + options: { uids: string | string[]; [k: string]: unknown }, + token?: string + ) => Promise; + }; + }; + + const pageSize = config.pageSize ?? 24; + const maxPages = config.maxPagesPerCategory ?? 1000; + + return { + async *listProductIds({ resume }) { + for (const category of config.categories) { + let page = resume && resume.category === category ? 
resume.page : 0; + for (; page < maxPages; page++) { + const res = await products.productSearch.searchProduct( + { query: category }, + { from: page * pageSize, hitsPerPage: pageSize }, + token + ); + const ids = extractProductIds(res); + for (const id of ids) yield { id, cursor: { category, page } }; + if (ids.length < pageSize) break; + } + } + }, + async getProduct(id) { + const res = await products.productDisplay.getProductDetails({ uids: id }, token); + const raw = pickDetail(res); + return raw ? mapProductDetail(raw) : null; + } + }; +} diff --git a/crawler/adapters/migros/crawl-migros.test.ts b/crawler/adapters/migros/crawl-migros.test.ts new file mode 100644 index 00000000..b448c737 --- /dev/null +++ b/crawler/adapters/migros/crawl-migros.test.ts @@ -0,0 +1,61 @@ +import { test, expect } from 'bun:test'; +import { crawlMigros } from './crawl-migros'; +import { newStats } from '../../types'; +import type { MigrosClient, MigrosProductDetail } from './types'; + +function makeClient( + products: Record, + ids: string[] +): MigrosClient { + return { + async *listProductIds() { + let page = 0; + for (const id of ids) yield { id, cursor: { category: 'all', page: page++ } }; + }, + async getProduct(id) { + return products[id] ?? 
null; + } + }; +} + +const base: MigrosProductDetail = { + id: '1', + name: 'A', + gtins: ['7610200000001'], + productUrl: 'https://m/1', + nutrition: { basis: '100g', energyKcal: 64, protein: 3.3, carbohydrate: 4.8, fat: 3.5, fiber: 0 } +}; + +test('emits normalized products and dedupes repeated ids and barcodes', async () => { + const client = makeClient( + { + '1': base, + '2': { ...base, id: '2', name: 'B', gtins: ['7610200000002'] }, + '3': { ...base, id: '3', name: 'A-dup', gtins: ['7610200000001'] } // dup barcode + }, + ['1', '2', '2', '3'] // '2' listed twice + ); + const stats = newStats(); + const out = []; + for await (const p of crawlMigros(client, { stats, sleep: async () => {} })) out.push(p); + expect(out.map((p) => p.name).sort()).toEqual(['A', 'B']); + expect(stats.emitted).toBe(2); + expect(stats.dropReasons['dup']).toBe(2); // one dup id + one dup barcode +}); + +test('skips ids whose product detail is null', async () => { + const client = makeClient({ '1': base, '9': null }, ['1', '9']); + const out = []; + for await (const p of crawlMigros(client, { sleep: async () => {} })) out.push(p); + expect(out.length).toBe(1); +}); + +test('respects the limit option', async () => { + const client = makeClient({ '1': base, '2': { ...base, id: '2', gtins: ['7610200000002'] } }, [ + '1', + '2' + ]); + const out = []; + for await (const p of crawlMigros(client, { limit: 1, sleep: async () => {} })) out.push(p); + expect(out.length).toBe(1); +}); diff --git a/crawler/adapters/migros/crawl-migros.ts b/crawler/adapters/migros/crawl-migros.ts new file mode 100644 index 00000000..6054a644 --- /dev/null +++ b/crawler/adapters/migros/crawl-migros.ts @@ -0,0 +1,58 @@ +import type { DatasetProduct, CrawlStats } from '../../types'; +import { newStats, recordDrop } from '../../types'; +import { migrosToDataset } from './normalize-migros'; +import type { MigrosClient } from './types'; + +export type MigrosCrawlOpts = { + limit?: number; + stats?: CrawlStats; + 
crawledAt?: string; + resume?: { category: string; page: number } | null; + sleep?: (ms: number) => Promise; + throttleMs?: number; + onCheckpoint?: (cursor: { category: string; page: number }) => Promise | void; + onProgress?: (stats: CrawlStats) => void; +}; + +export async function* crawlMigros( + client: MigrosClient, + opts: MigrosCrawlOpts = {} +): AsyncIterable { + const stats = opts.stats ?? newStats(); + const crawledAt = opts.crawledAt ?? new Date().toISOString(); + const sleep = opts.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms))); + const throttleMs = opts.throttleMs ?? 0; + const seenIds = new Set(); + const seenBarcodes = new Set(); + + for await (const { id, cursor } of client.listProductIds({ resume: opts.resume ?? null })) { + stats.seen++; + if (seenIds.has(id)) { + recordDrop(stats, 'dup:id'); + continue; + } + seenIds.add(id); + if (opts.onCheckpoint) await opts.onCheckpoint(cursor); + + const detail = await client.getProduct(id); + if (throttleMs > 0) await sleep(throttleMs); + if (!detail) { + recordDrop(stats, 'no-detail'); + continue; + } + const r = migrosToDataset(detail, crawledAt); + if (!r.ok) { + recordDrop(stats, r.reason); + continue; + } + if (r.product.barcode && seenBarcodes.has(r.product.barcode)) { + recordDrop(stats, 'dup:barcode'); + continue; + } + if (r.product.barcode) seenBarcodes.add(r.product.barcode); + stats.emitted++; + if (opts.onProgress && stats.emitted % 500 === 0) opts.onProgress(stats); + yield r.product; + if (opts.limit && stats.emitted >= opts.limit) return; + } +} diff --git a/crawler/adapters/migros/normalize-migros.test.ts b/crawler/adapters/migros/normalize-migros.test.ts new file mode 100644 index 00000000..76c6d7e3 --- /dev/null +++ b/crawler/adapters/migros/normalize-migros.test.ts @@ -0,0 +1,70 @@ +import { test, expect } from 'bun:test'; +import { migrosToDataset } from './normalize-migros'; +import type { MigrosProductDetail } from './types'; + +const detail: MigrosProductDetail = { 
+ id: '100001', + name: 'M-Classic Vollmilch', + brand: 'M-Classic', + gtins: ['7610200000001'], + productUrl: 'https://www.migros.ch/de/product/100001', + imageUrl: 'https://image.migros.ch/100001.jpg', + nutrition: { + basis: '100g', + energyKcal: 64, + protein: 3.3, + carbohydrate: 4.8, + fat: 3.5, + fiber: 0, + sugar: 4.8, + saturatedFat: 2.1, + salt: 0.1 + } +}; + +test('maps a Migros product-detail to a valid dataset product (de)', () => { + const r = migrosToDataset(detail); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.name).toBe('M-Classic Vollmilch'); + expect(r.product.language).toBe('de'); + expect(r.product.barcode).toBe('7610200000001'); + expect(r.product.calories).toBe(64); + expect(r.product.sugar).toBe(4.8); + expect(r.product.saturatedFat).toBe(2.1); + expect(r.product.salt).toBe(0.1); + expect(r.product.sourceRef).toBe('100001'); + expect(r.product.vitaminC).toBeNull(); + } +}); + +test('rescales per-serving nutrition to per-100g when basis is grams', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { + ...detail.nutrition, + basis: '200g', + energyKcal: 128, + protein: 6.6, + carbohydrate: 9.6, + fat: 7, + fiber: 0 + } + }); + expect(r.ok).toBe(true); + if (r.ok) expect(r.product.calories).toBe(64); +}); + +test('drops a product with no GTIN', () => { + const r = migrosToDataset({ ...detail, gtins: [] }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('no-barcode'); +}); + +test('drops a product missing core macros', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { basis: '100g', energyKcal: 64, protein: 3.3 } + }); + expect(r.ok).toBe(false); +}); diff --git a/crawler/adapters/migros/normalize-migros.ts b/crawler/adapters/migros/normalize-migros.ts new file mode 100644 index 00000000..c6de5bd4 --- /dev/null +++ b/crawler/adapters/migros/normalize-migros.ts @@ -0,0 +1,50 @@ +import { buildDatasetProduct } from '../../lib/normalize'; +import type { BuildResult } from 
'../../types'; +import type { MigrosProductDetail } from './types'; + +function gramsBasis(basis: string | undefined): number | null { + if (!basis) return 100; // assume per-100g when unspecified + const m = basis + .trim() + .toLowerCase() + .match(/^(\d+(?:\.\d+)?)\s*(g|ml)$/); + if (!m) return null; + return parseFloat(m[1]); +} + +export function migrosToDataset(d: MigrosProductDetail, crawledAt?: string): BuildResult { + const barcode = (d.gtins ?? []).find((g) => g && g.trim().length > 0)?.trim(); + if (!barcode) return { ok: false, reason: 'no-barcode' }; + const name = (d.name ?? '').trim(); + if (!name) return { ok: false, reason: 'no-name' }; + + const basisG = gramsBasis(d.nutrition.basis); + if (basisG == null || basisG <= 0) return { ok: false, reason: 'bad-basis' }; + const f = 100 / basisG; + const scale = (v: number | null | undefined): number | null => + v == null || Number.isNaN(v) ? null : Math.round(v * f * 100) / 100; + + return buildDatasetProduct({ + name: name.slice(0, 500), + brand: d.brand ?? null, + language: 'de', + servingSize: 100, + servingUnit: 'g', + calories: scale(d.nutrition.energyKcal), + protein: scale(d.nutrition.protein), + carbs: scale(d.nutrition.carbohydrate), + fat: scale(d.nutrition.fat), + fiber: scale(d.nutrition.fiber), + nutrients: { + sugar: scale(d.nutrition.sugar), + saturatedFat: scale(d.nutrition.saturatedFat), + salt: scale(d.nutrition.salt) + }, + barcode: barcode.slice(0, 32), + ingredientsText: d.ingredients?.slice(0, 10000) ?? null, + imageUrl: d.imageUrl ?? null, + sourceUrl: d.productUrl ?? null, + sourceRef: d.id, + crawledAt: crawledAt ?? null + }); +} diff --git a/crawler/adapters/migros/types.ts b/crawler/adapters/migros/types.ts new file mode 100644 index 00000000..9df8d6f4 --- /dev/null +++ b/crawler/adapters/migros/types.ts @@ -0,0 +1,32 @@ +export type MigrosNutrition = { + basis?: string; // e.g. 
"100g", "200g", "100ml" + energyKcal?: number | null; + protein?: number | null; + carbohydrate?: number | null; + fat?: number | null; + fiber?: number | null; + sugar?: number | null; + saturatedFat?: number | null; + salt?: number | null; +}; + +export type MigrosProductDetail = { + id: string; + name: string; + brand?: string | null; + gtins?: string[]; + productUrl?: string | null; + imageUrl?: string | null; + ingredients?: string | null; + nutrition: MigrosNutrition; +}; + +export interface MigrosClient { + /** Yields product ids for the configured food categories, page by page. */ + listProductIds(opts: { resume?: { category: string; page: number } | null }): AsyncIterable<{ + id: string; + cursor: { category: string; page: number }; + }>; + /** Fetches and normalizes one product detail; null if unavailable. */ + getProduct(id: string): Promise; +} diff --git a/crawler/fixtures/migros-product-detail.json b/crawler/fixtures/migros-product-detail.json new file mode 100644 index 00000000..97ce3199 --- /dev/null +++ b/crawler/fixtures/migros-product-detail.json @@ -0,0 +1,21 @@ +{ + "productId": "100001", + "name": "M-Classic Vollmilch UHT", + "brand": "M-Classic", + "gtins": ["7610200000001"], + "productUrls": { "de": "https://www.migros.ch/de/product/100001" }, + "image": { "original": "https://image.migros.ch/100001.jpg" }, + "nutrients": { + "referenceValue": "100g", + "values": [ + { "code": "energy_kcal", "value": 64 }, + { "code": "protein", "value": 3.3 }, + { "code": "carbohydrate", "value": 4.8 }, + { "code": "of_which_sugars", "value": 4.8 }, + { "code": "fat", "value": 3.5 }, + { "code": "of_which_saturated", "value": 2.1 }, + { "code": "dietary_fiber", "value": 0 }, + { "code": "salt", "value": 0.1 } + ] + } +} From 33d957f6972f76ffef0e89c0fedb4fc61152f1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 01:16:46 +0200 Subject: [PATCH 22/26] feat(crawler): CLI entrypoint (off|migros) + end-to-end test --- 
crawler/index.test.ts | 19 +++++++++++ crawler/index.ts | 79 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 crawler/index.test.ts create mode 100644 crawler/index.ts diff --git a/crawler/index.test.ts b/crawler/index.test.ts new file mode 100644 index 00000000..6e34ad08 --- /dev/null +++ b/crawler/index.test.ts @@ -0,0 +1,19 @@ +import { test, expect } from 'bun:test'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { runOff } from './index'; +import { datasetHeaderSchema, datasetProductSchema } from '$lib/server/catalog/dataset-schema'; + +test('runOff produces a schema-valid dataset file from the fixture dump', async () => { + const dump = join(import.meta.dir, 'fixtures/off-sample.jsonl'); + const out = join(tmpdir(), `crawler-e2e-${process.pid}.jsonl`); + const stats = await runOff({ dumpPath: dump, outPath: out }); + expect(stats.emitted).toBe(2); + + const lines = (await Bun.file(out).text()).trim().split('\n'); + expect(lines.length).toBe(3); // header + 2 products + expect(datasetHeaderSchema.safeParse(JSON.parse(lines[0])).success).toBe(true); + for (const l of lines.slice(1)) { + expect(datasetProductSchema.safeParse(JSON.parse(l)).success).toBe(true); + } +}); diff --git a/crawler/index.ts b/crawler/index.ts new file mode 100644 index 00000000..453b2877 --- /dev/null +++ b/crawler/index.ts @@ -0,0 +1,79 @@ +import { readDumpLines } from './lib/jsonl-stream'; +import { crawlOffDump } from './adapters/off/crawl-off'; +import { crawlMigros } from './adapters/migros/crawl-migros'; +import { createMigrosClient } from './adapters/migros/client'; +import { DatasetWriter } from './lib/jsonl-writer'; +import { newStats, type CrawlStats } from './types'; + +// Root "food" category id(s) in the Migros taxonomy; refine on the host during a real crawl. 
+const MIGROS_FOOD_CATEGORIES = ['7494731']; + +export async function runOff(opts: { dumpPath: string; outPath: string }): Promise { + const stats = newStats(); + const writer = new DatasetWriter(opts.outPath, { + key: 'off-ch', + name: 'Open Food Facts (Switzerland)', + source: 'off', + priority: 20 + }); + await writer.open(); + for await (const product of crawlOffDump(readDumpLines(opts.dumpPath), { + stats, + onProgress: (s) => + console.error(`[off] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) + })) { + await writer.write(product); + } + const count = await writer.close(); + console.error(`[off] done: ${count} products → ${opts.outPath}`); + console.error(`[off] drop reasons: ${JSON.stringify(stats.dropReasons)}`); + return stats; +} + +export async function runMigros(opts: { outPath: string }): Promise { + const stats = newStats(); + const client = await createMigrosClient({ categories: MIGROS_FOOD_CATEGORIES }); + const writer = new DatasetWriter(opts.outPath, { + key: 'migros', + name: 'Migros (Switzerland)', + source: 'migros', + priority: 10 + }); + await writer.open(); + for await (const product of crawlMigros(client, { + stats, + throttleMs: 600, + onProgress: (s) => + console.error(`[migros] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) + })) { + await writer.write(product); + } + const count = await writer.close(); + console.error(`[migros] done: ${count} products → ${opts.outPath}`); + console.error(`[migros] drop reasons: ${JSON.stringify(stats.dropReasons)}`); + return stats; +} + +function dateStamp(): string { + return new Date().toISOString().slice(0, 10); +} + +async function main() { + const [cmd, ...args] = process.argv.slice(2); + if (cmd === 'off') { + const dumpPath = args[0]; + if (!dumpPath) throw new Error('Usage: crawl off [outPath]'); + await runOff({ dumpPath, outPath: args[1] ?? `data/catalog/off-ch-${dateStamp()}.jsonl` }); + } else if (cmd === 'migros') { + await runMigros({ outPath: args[0] ?? 
`data/catalog/migros-${dateStamp()}.jsonl` }); + } else { + throw new Error(`Unknown command: ${cmd ?? '(none)'}. Expected: off | migros`); + } +} + +if (import.meta.main) { + main().catch((e) => { + console.error(e instanceof Error ? e.message : String(e)); + process.exit(1); + }); +} From 0a409002997d1b07ee7fc8b4e1d8d49d931d72ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 01:29:18 +0200 Subject: [PATCH 23/26] =?UTF-8?q?fix(crawler):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20writer=20try/finally,=20checkpoint-after-yield,=20m?= =?UTF-8?q?l=20serving=20unit,=20stream=20flush=20+=20header-keyed=20cache?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/adapters/migros/crawl-migros.test.ts | 17 ++++++ crawler/adapters/migros/crawl-migros.ts | 2 +- .../adapters/migros/normalize-migros.test.ts | 12 ++++ crawler/adapters/migros/normalize-migros.ts | 5 +- crawler/index.ts | 57 +++++++++++++------ crawler/lib/http.test.ts | 18 ++++++ crawler/lib/http.ts | 5 +- crawler/lib/jsonl-stream.test.ts | 12 ++++ crawler/lib/jsonl-stream.ts | 1 + 9 files changed, 107 insertions(+), 22 deletions(-) diff --git a/crawler/adapters/migros/crawl-migros.test.ts b/crawler/adapters/migros/crawl-migros.test.ts index b448c737..ce95e8a2 100644 --- a/crawler/adapters/migros/crawl-migros.test.ts +++ b/crawler/adapters/migros/crawl-migros.test.ts @@ -59,3 +59,20 @@ test('respects the limit option', async () => { for await (const p of crawlMigros(client, { limit: 1, sleep: async () => {} })) out.push(p); expect(out.length).toBe(1); }); + +test('checkpoints only emitted products (after yield), not dropped ones', async () => { + const client = makeClient( + { '1': base, '2': { ...base, id: '2', name: 'B', gtins: ['7610200000002'] }, '9': null }, + ['1', '9', '2'] + ); + const cursors: Array<{ category: string; page: number }> = []; + const out = []; + for await (const p of crawlMigros(client, { + sleep: 
async () => {}, + onCheckpoint: (c) => void cursors.push(c) + })) + out.push(p); + // '9' has no detail (dropped) → not checkpointed; only the two emitted products are. + expect(out.length).toBe(2); + expect(cursors.length).toBe(2); +}); diff --git a/crawler/adapters/migros/crawl-migros.ts b/crawler/adapters/migros/crawl-migros.ts index 6054a644..7dd7086a 100644 --- a/crawler/adapters/migros/crawl-migros.ts +++ b/crawler/adapters/migros/crawl-migros.ts @@ -32,7 +32,6 @@ export async function* crawlMigros( continue; } seenIds.add(id); - if (opts.onCheckpoint) await opts.onCheckpoint(cursor); const detail = await client.getProduct(id); if (throttleMs > 0) await sleep(throttleMs); @@ -53,6 +52,7 @@ export async function* crawlMigros( stats.emitted++; if (opts.onProgress && stats.emitted % 500 === 0) opts.onProgress(stats); yield r.product; + if (opts.onCheckpoint) await opts.onCheckpoint(cursor); if (opts.limit && stats.emitted >= opts.limit) return; } } diff --git a/crawler/adapters/migros/normalize-migros.test.ts b/crawler/adapters/migros/normalize-migros.test.ts index 76c6d7e3..ad280a7f 100644 --- a/crawler/adapters/migros/normalize-migros.test.ts +++ b/crawler/adapters/migros/normalize-migros.test.ts @@ -68,3 +68,15 @@ test('drops a product missing core macros', () => { }); expect(r.ok).toBe(false); }); + +test('uses ml serving unit and rescales for an ml-based product', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { ...detail.nutrition, basis: '200ml', energyKcal: 128 } + }); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.servingUnit).toBe('ml'); + expect(r.product.calories).toBe(64); + } +}); diff --git a/crawler/adapters/migros/normalize-migros.ts b/crawler/adapters/migros/normalize-migros.ts index c6de5bd4..35e13b95 100644 --- a/crawler/adapters/migros/normalize-migros.ts +++ b/crawler/adapters/migros/normalize-migros.ts @@ -20,6 +20,9 @@ export function migrosToDataset(d: MigrosProductDetail, crawledAt?: string): Bui const 
basisG = gramsBasis(d.nutrition.basis); if (basisG == null || basisG <= 0) return { ok: false, reason: 'bad-basis' }; + const servingUnit: 'g' | 'ml' = d.nutrition.basis?.trim().toLowerCase().endsWith('ml') + ? 'ml' + : 'g'; const f = 100 / basisG; const scale = (v: number | null | undefined): number | null => v == null || Number.isNaN(v) ? null : Math.round(v * f * 100) / 100; @@ -29,7 +32,7 @@ export function migrosToDataset(d: MigrosProductDetail, crawledAt?: string): Bui brand: d.brand ?? null, language: 'de', servingSize: 100, - servingUnit: 'g', + servingUnit, calories: scale(d.nutrition.energyKcal), protein: scale(d.nutrition.protein), carbs: scale(d.nutrition.carbohydrate), diff --git a/crawler/index.ts b/crawler/index.ts index 453b2877..d55931a6 100644 --- a/crawler/index.ts +++ b/crawler/index.ts @@ -1,12 +1,15 @@ +import { rmSync } from 'node:fs'; import { readDumpLines } from './lib/jsonl-stream'; import { crawlOffDump } from './adapters/off/crawl-off'; import { crawlMigros } from './adapters/migros/crawl-migros'; import { createMigrosClient } from './adapters/migros/client'; import { DatasetWriter } from './lib/jsonl-writer'; +import { readCheckpoint, writeCheckpoint } from './lib/checkpoint'; import { newStats, type CrawlStats } from './types'; // Root "food" category id(s) in the Migros taxonomy; refine on the host during a real crawl. 
const MIGROS_FOOD_CATEGORIES = ['7494731']; +const MIGROS_CHECKPOINT = 'data/catalog/.migros-checkpoint.json'; export async function runOff(opts: { dumpPath: string; outPath: string }): Promise { const stats = newStats(); @@ -17,21 +20,32 @@ export async function runOff(opts: { dumpPath: string; outPath: string }): Promi priority: 20 }); await writer.open(); - for await (const product of crawlOffDump(readDumpLines(opts.dumpPath), { - stats, - onProgress: (s) => - console.error(`[off] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) - })) { - await writer.write(product); + try { + for await (const product of crawlOffDump(readDumpLines(opts.dumpPath), { + stats, + onProgress: (s) => + console.error(`[off] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) + })) { + await writer.write(product); + } + } finally { + await writer.close(); } - const count = await writer.close(); - console.error(`[off] done: ${count} products → ${opts.outPath}`); + console.error(`[off] done: ${stats.emitted} products → ${opts.outPath}`); console.error(`[off] drop reasons: ${JSON.stringify(stats.dropReasons)}`); return stats; } -export async function runMigros(opts: { outPath: string }): Promise { +export async function runMigros(opts: { + outPath: string; + checkpointPath?: string; +}): Promise { const stats = newStats(); + const checkpointPath = opts.checkpointPath ?? 
MIGROS_CHECKPOINT; + const resume = await readCheckpoint<{ category: string; page: number }>(checkpointPath); + if (resume) + console.error(`[migros] resuming from category ${resume.category} page ${resume.page}`); + const client = await createMigrosClient({ categories: MIGROS_FOOD_CATEGORIES }); const writer = new DatasetWriter(opts.outPath, { key: 'migros', @@ -40,16 +54,23 @@ export async function runMigros(opts: { outPath: string }): Promise priority: 10 }); await writer.open(); - for await (const product of crawlMigros(client, { - stats, - throttleMs: 600, - onProgress: (s) => - console.error(`[migros] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) - })) { - await writer.write(product); + try { + for await (const product of crawlMigros(client, { + stats, + throttleMs: 600, + resume, + onCheckpoint: (cursor) => writeCheckpoint(checkpointPath, cursor), + onProgress: (s) => + console.error(`[migros] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`) + })) { + await writer.write(product); + } + } finally { + await writer.close(); } - const count = await writer.close(); - console.error(`[migros] done: ${count} products → ${opts.outPath}`); + // Completed cleanly → drop the checkpoint so the next run starts fresh. 
+ rmSync(checkpointPath, { force: true }); + console.error(`[migros] done: ${stats.emitted} products → ${opts.outPath}`); console.error(`[migros] drop reasons: ${JSON.stringify(stats.dropReasons)}`); return stats; } diff --git a/crawler/lib/http.test.ts b/crawler/lib/http.test.ts index 05b702cb..c081ae53 100644 --- a/crawler/lib/http.test.ts +++ b/crawler/lib/http.test.ts @@ -56,3 +56,21 @@ test('caches responses by url when a cache is provided', async () => { expect(a).toEqual(b); expect(calls).toBe(1); }); + +test('cache key varies by request headers', async () => { + let calls = 0; + const store = new Map(); + const client = createPoliteClient({ + minDelayMs: 0, + maxRetries: 1, + sleep: async () => {}, + cache: { get: async (k) => store.get(k) ?? null, set: async (k, v) => void store.set(k, v) }, + fetchImpl: async () => { + calls++; + return new Response(JSON.stringify({ n: calls }), { status: 200 }); + } + }); + await client.getJson('https://x.test/c', { Authorization: 'Bearer a' }); + await client.getJson('https://x.test/c', { Authorization: 'Bearer b' }); + expect(calls).toBe(2); // different headers → different cache entries +}); diff --git a/crawler/lib/http.ts b/crawler/lib/http.ts index c1fc9798..e1104437 100644 --- a/crawler/lib/http.ts +++ b/crawler/lib/http.ts @@ -31,8 +31,9 @@ export function createPoliteClient(opts: PoliteClientOpts = {}) { } async function getJson(url: string, headers: Record = {}): Promise { + const cacheKey = Object.keys(headers).length > 0 ? 
`${url}|${JSON.stringify(headers)}` : url; if (opts.cache) { - const hit = await opts.cache.get(url); + const hit = await opts.cache.get(cacheKey); if (hit != null) return JSON.parse(hit) as T; } let attempt = 0; @@ -43,7 +44,7 @@ export function createPoliteClient(opts: PoliteClientOpts = {}) { if (res.status === 404 || res.status === 410) return null; if (!res.ok) throw new Error(`HTTP ${res.status}`); const text = await res.text(); - if (opts.cache) await opts.cache.set(url, text); + if (opts.cache) await opts.cache.set(cacheKey, text); return JSON.parse(text) as T; } catch (err) { attempt++; diff --git a/crawler/lib/jsonl-stream.test.ts b/crawler/lib/jsonl-stream.test.ts index 78965f9c..23e397c3 100644 --- a/crawler/lib/jsonl-stream.test.ts +++ b/crawler/lib/jsonl-stream.test.ts @@ -11,3 +11,15 @@ test('splits a byte stream into lines across chunk boundaries, skipping blanks', out.push(line); expect(out).toEqual(['{"a":1}', '{"b":2}', '{"c":3}']); }); + +test('decodes a multi-byte UTF-8 char split across chunk boundaries', async () => { + // "ü" (U+00FC) encodes to bytes 0xC3 0xBC; split the stream between those two bytes. 
+ const bytes = new TextEncoder().encode('{"n":"Grün"}\n'); + async function* src() { + yield bytes.slice(0, 9); // ends on the first byte of "ü" + yield bytes.slice(9); + } + const out: string[] = []; + for await (const line of splitJsonlLines(src())) out.push(line); + expect(out).toEqual(['{"n":"Grün"}']); +}); diff --git a/crawler/lib/jsonl-stream.ts b/crawler/lib/jsonl-stream.ts index a81ebe8a..ef55c5b6 100644 --- a/crawler/lib/jsonl-stream.ts +++ b/crawler/lib/jsonl-stream.ts @@ -10,6 +10,7 @@ export async function* splitJsonlLines(source: AsyncIterable): Async if (line.length > 0) yield line; } } + buf += decoder.decode(); // flush any buffered bytes from an incomplete trailing sequence const last = buf.trim(); if (last.length > 0) yield last; } From 276cfb9318d02069ebd2209d8b0b846cfc0a66d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 12:46:18 +0200 Subject: [PATCH 24/26] chore: ignore entire .claude directory --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6815148a..9e98a995 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,9 @@ vite.config.ts.timestamp-* # Worktrees .worktrees/ -.claude/worktrees/ + +# Claude Code +.claude/ # Superpowers .superpowers/ From 8fadab63ef8b3a9de17a9c2b0294ce539ff2a4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Orell=20B=C3=BChler?= Date: Sat, 30 May 2026 12:53:13 +0200 Subject: [PATCH 25/26] feat: surface Open Food Facts search in FoodPicker with copy-on-use save Add online OFF search and barcode save (copy-on-use) API routes, a search response validation schema, and surface OFF results in the FoodPicker. Includes OpenAPI/schema, i18n, and test updates. 
--- docs/openapi.json | 110 +++++++++++++++++- messages/de.json | 4 + messages/en.json | 4 + src/lib/api/generated/schema.d.ts | 99 +++++++++++++++- .../components/entries/AddFoodModal.svelte | 14 +++ src/lib/components/entries/FoodPicker.svelte | 87 ++++++++++++-- src/lib/server/openapi.ts | 45 ++++++- src/lib/server/openfoodfacts.test.ts | 87 ++++++++++++++ .../validation/responses/openfoodfacts.ts | 6 + src/lib/services/food-service.svelte.ts | 13 ++- src/routes/(app)/foods/+page.svelte | 72 +++++++++++- .../openfoodfacts/[barcode]/save/+server.ts | 37 ++++++ .../api/openfoodfacts/search/+server.ts | 29 +++++ tests/api/openfoodfacts-save.test.ts | 81 +++++++++++++ tests/api/openfoodfacts-search.test.ts | 60 ++++++++++ 15 files changed, 730 insertions(+), 18 deletions(-) create mode 100644 src/lib/server/openfoodfacts.test.ts create mode 100644 src/routes/api/openfoodfacts/[barcode]/save/+server.ts create mode 100644 src/routes/api/openfoodfacts/search/+server.ts create mode 100644 tests/api/openfoodfacts-save.test.ts create mode 100644 tests/api/openfoodfacts-search.test.ts diff --git a/docs/openapi.json b/docs/openapi.json index d87b9880..6ab5bf54 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2546,6 +2546,47 @@ } } }, + "/api/openfoodfacts/search": { + "get": { + "operationId": "searchOpenFoodFacts", + "tags": ["OpenFoodFacts"], + "description": "Text search Open Food Facts products. 
Online fallback used by the food picker when local + catalog results are sparse.", + "parameters": [ + { + "in": "query", + "name": "q", + "schema": { + "type": "string" + }, + "required": true + }, + { + "in": "query", + "name": "limit", + "schema": { + "type": "integer", + "minimum": -9007199254740991, + "maximum": 9007199254740991 + } + } + ], + "responses": { + "200": { + "description": "Success", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenFoodFactsSearchResponse" + } + } + } + }, + "401": { + "$ref": "#/components/responses/UnauthorizedResponse" + } + } + } + }, "/api/openfoodfacts/{barcode}": { "get": { "operationId": "lookupOpenFoodFacts", @@ -2577,6 +2618,54 @@ } } } + }, + "/api/openfoodfacts/{barcode}/save": { + "post": { + "operationId": "saveOpenFoodFactsProduct", + "tags": ["OpenFoodFacts"], + "description": "Instantiate a personal food from an Open Food Facts product by barcode (copy-on-use). Idempotent: returns the existing food if already saved.", + "parameters": [ + { + "in": "path", + "name": "barcode", + "schema": { + "type": "string" + }, + "required": true + } + ], + "responses": { + "200": { + "description": "Existing food returned", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FoodResponse" + } + } + } + }, + "201": { + "description": "Created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FoodResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/ValidationErrorResponse" + }, + "401": { + "$ref": "#/components/responses/UnauthorizedResponse" + }, + "404": { + "$ref": "#/components/responses/NotFoundResponse" + } + } + } } }, "components": { @@ -9243,14 +9332,17 @@ "required": ["date", "eveningCalories", "sleepDurationMinutes", "sleepQuality"], "additionalProperties": false }, - "OpenFoodFactsResponse": { + "OpenFoodFactsSearchResponse": { "type": "object", "properties": { - "product": { - "$ref": 
"#/components/schemas/OpenFoodFactsProduct" + "results": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenFoodFactsProduct" + } } }, - "required": ["product"], + "required": ["results"], "additionalProperties": false }, "OpenFoodFactsProduct": { @@ -9814,6 +9906,16 @@ "ingredientsText" ], "additionalProperties": false + }, + "OpenFoodFactsResponse": { + "type": "object", + "properties": { + "product": { + "$ref": "#/components/schemas/OpenFoodFactsProduct" + } + }, + "required": ["product"], + "additionalProperties": false } }, "responses": { diff --git a/messages/de.json b/messages/de.json index c427d012..24f521fa 100644 --- a/messages/de.json +++ b/messages/de.json @@ -67,6 +67,10 @@ "add_food_catalog_section": "Aus Katalog", "add_food_catalog_add_failed": "Produkt konnte nicht hinzugefügt werden. Es ist evtl. schon in deinen Lebensmitteln.", "catalog_source_badge": "{source}", + "add_food_off_searching": "Open Food Facts wird durchsucht…", + "add_food_off_section": "Aus Open Food Facts", + "add_food_off_badge": "OFF", + "add_food_off_add_failed": "Produkt konnte nicht aus Open Food Facts hinzugefügt werden.", "quick_log_name_placeholder": "Bezeichnung (optional)", "quick_log_calories": "Kalorien (kcal)", "quick_log_protein": "Protein (g)", diff --git a/messages/en.json b/messages/en.json index 9d2d7f71..4937c600 100644 --- a/messages/en.json +++ b/messages/en.json @@ -67,6 +67,10 @@ "add_food_catalog_section": "From catalog", "add_food_catalog_add_failed": "Could not add this product. 
It may already be in your foods.", "catalog_source_badge": "{source}", + "add_food_off_searching": "Searching Open Food Facts…", + "add_food_off_section": "From Open Food Facts", + "add_food_off_badge": "OFF", + "add_food_off_add_failed": "Could not add this product from Open Food Facts.", "quick_log_name_placeholder": "Label (optional)", "quick_log_calories": "Calories (kcal)", "quick_log_protein": "Protein (g)", diff --git a/src/lib/api/generated/schema.d.ts b/src/lib/api/generated/schema.d.ts index c3a7523a..27d9b9d5 100644 --- a/src/lib/api/generated/schema.d.ts +++ b/src/lib/api/generated/schema.d.ts @@ -824,6 +824,23 @@ export interface paths { patch?: never; trace?: never; }; + '/api/openfoodfacts/search': { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** @description Text search Open Food Facts products. Online fallback used by the food picker when local + catalog results are sparse. */ + get: operations['searchOpenFoodFacts']; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; '/api/openfoodfacts/{barcode}': { parameters: { query?: never; @@ -841,6 +858,23 @@ export interface paths { patch?: never; trace?: never; }; + '/api/openfoodfacts/{barcode}/save': { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** @description Instantiate a personal food from an Open Food Facts product by barcode (copy-on-use). Idempotent: returns the existing food if already saved. 
*/ + post: operations['saveOpenFoodFactsProduct']; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { @@ -1953,8 +1987,8 @@ export interface components { sleepDurationMinutes: number; sleepQuality: number; }; - OpenFoodFactsResponse: { - product: components['schemas']['OpenFoodFactsProduct']; + OpenFoodFactsSearchResponse: { + results: components['schemas']['OpenFoodFactsProduct'][]; }; OpenFoodFactsProduct: { id: string; @@ -2017,6 +2051,9 @@ export interface components { additives: string[] | null; ingredientsText: string | null; }; + OpenFoodFactsResponse: { + product: components['schemas']['OpenFoodFactsProduct']; + }; }; responses: { /** @description Unauthorized */ @@ -3686,6 +3723,30 @@ export interface operations { 409: components['responses']['ConflictResponse']; }; }; + searchOpenFoodFacts: { + parameters: { + query: { + q: string; + limit?: number; + }; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Success */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + 'application/json': components['schemas']['OpenFoodFactsSearchResponse']; + }; + }; + 401: components['responses']['UnauthorizedResponse']; + }; + }; lookupOpenFoodFacts: { parameters: { query?: never; @@ -3709,4 +3770,38 @@ export interface operations { 401: components['responses']['UnauthorizedResponse']; }; }; + saveOpenFoodFactsProduct: { + parameters: { + query?: never; + header?: never; + path: { + barcode: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Existing food returned */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + 'application/json': components['schemas']['FoodResponse']; + }; + }; + /** @description Created */ + 201: { + headers: { + [name: string]: unknown; + }; + content: { + 'application/json': 
components['schemas']['FoodResponse']; + }; + }; + 400: components['responses']['ValidationErrorResponse']; + 401: components['responses']['UnauthorizedResponse']; + 404: components['responses']['NotFoundResponse']; + }; + }; } diff --git a/src/lib/components/entries/AddFoodModal.svelte b/src/lib/components/entries/AddFoodModal.svelte index 1f51e146..003746c3 100644 --- a/src/lib/components/entries/AddFoodModal.svelte +++ b/src/lib/components/entries/AddFoodModal.svelte @@ -108,6 +108,20 @@ servingUnit: food.servingUnit, calories: food.calories }; + } else if (selection.type === 'off') { + const food = await foodService.saveFromOFF(selection.off.barcode); + if (!food) { + toast.error(m.add_food_off_add_failed()); + return; + } + selectedFood = { + id: food.id, + name: food.name, + type: 'food', + servingSize: food.servingSize, + servingUnit: food.servingUnit, + calories: food.calories + }; } else if (selection.type === 'favorite') { selectedFood = { id: selection.favorite.id, diff --git a/src/lib/components/entries/FoodPicker.svelte b/src/lib/components/entries/FoodPicker.svelte index 7675b54b..326a72a7 100644 --- a/src/lib/components/entries/FoodPicker.svelte +++ b/src/lib/components/entries/FoodPicker.svelte @@ -41,7 +41,8 @@ servingUnit?: string | null; }; } - | { type: 'catalog'; catalog: { id: string; name: string; source: string } }; + | { type: 'catalog'; catalog: { id: string; name: string; source: string } } + | { type: 'off'; off: { barcode: string; name: string; brand: string | null } }; type FavoriteItem = Extract['favorite']; @@ -62,30 +63,65 @@ source: string; datasetKey: string; }; + type OffHit = { + barcode: string; + name: string; + brand: string | null; + }; let catalogResults: CatalogHit[] = $state([]); let catalogLoading = $state(false); - let catalogTimer: ReturnType | undefined; + let offResults: OffHit[] = $state([]); + let offLoading = $state(false); + let searchTimer: ReturnType | undefined; - const runCatalogSearch = (term: string) => { - 
clearTimeout(catalogTimer); - if (term.trim().length < 2) { + // Below this many local + catalog matches, fall back to an Open Food Facts search. + const OFF_FALLBACK_THRESHOLD = 5; + + const runOnlineSearch = (term: string) => { + clearTimeout(searchTimer); + const trimmed = term.trim(); + if (trimmed.length < 2) { catalogResults = []; + offResults = []; catalogLoading = false; + offLoading = false; return; } catalogLoading = true; - catalogTimer = setTimeout(async () => { + offResults = []; + searchTimer = setTimeout(async () => { + // Snapshot the local match count before awaiting, so the fallback + // decision isn't skewed by the `foods` prop changing mid-request. + const localCount = filtered().length; try { const { data } = await api.GET('/api/catalog/search', { - params: { query: { q: term } } + params: { query: { q: trimmed } } }); - catalogResults = (data?.results ?? []) as CatalogHit[]; + if (trimmed === query.trim()) catalogResults = (data?.results ?? []) as CatalogHit[]; } catch (e) { if (dev) console.warn('catalog search failed:', e); catalogResults = []; } finally { catalogLoading = false; } + // Online fallback: only when the combined local + catalog result set is sparse. + if (trimmed !== query.trim()) return; + if (localCount + catalogResults.length >= OFF_FALLBACK_THRESHOLD) { + offResults = []; + return; + } + offLoading = true; + try { + const { data } = await api.GET('/api/openfoodfacts/search', { + params: { query: { q: trimmed } } + }); + if (trimmed === query.trim()) offResults = (data?.results ?? []) as OffHit[]; + } catch (e) { + if (dev) console.warn('Open Food Facts search failed:', e); + offResults = []; + } finally { + offLoading = false; + } }, 300); }; @@ -164,7 +200,7 @@ $effect(() => { if (tab === 'recent') loadRecentFoods(); if (tab === 'favorites') loadFavoriteRecipes(); - if (tab === 'search') runCatalogSearch(query); + if (tab === 'search') runOnlineSearch(query); }); @@ -219,6 +255,39 @@ {/each} {/if} + {#if offLoading} +

{m.add_food_off_searching()}

+ {:else if offResults.length > 0} +

{m.add_food_off_section()}

+
    + {#each offResults as hit (hit.barcode)} +
  • + + {hit.name} + {#if hit.brand} · {hit.brand}{/if} + {m.add_food_off_badge()} + + +
  • + {/each} +
+ {/if}
diff --git a/src/lib/server/openapi.ts b/src/lib/server/openapi.ts index 8e9364a1..f5ecf33c 100644 --- a/src/lib/server/openapi.ts +++ b/src/lib/server/openapi.ts @@ -60,7 +60,10 @@ import { import { favoritesResponseSchema } from './validation/responses/favorites'; import { maintenanceResponseSchema } from './validation/responses/maintenance'; import { imageUploadResponseSchema } from './validation/responses/images'; -import { openfoodfactsResponseSchema } from './validation/responses/openfoodfacts'; +import { + openfoodfactsResponseSchema, + openfoodfactsSearchResponseSchema +} from './validation/responses/openfoodfacts'; import { goalsResponseSchema, goalsSetResponseSchema } from './validation/responses/goals'; import { dayPropertiesResponseSchema, @@ -1365,6 +1368,24 @@ export function generateSpec() { }, // ── Open Food Facts ─────────────────────────────────── + '/api/openfoodfacts/search': { + get: { + operationId: 'searchOpenFoodFacts', + tags: ['OpenFoodFacts'], + description: + 'Text search Open Food Facts products. Online fallback used by the food picker when local + catalog results are sparse.', + requestParams: { + query: z.object({ q: z.string(), limit: z.number().int().optional() }) + }, + responses: { + '200': { + description: 'Success', + content: { 'application/json': { schema: openfoodfactsSearchResponseSchema } } + }, + '401': res401 + } + } + }, '/api/openfoodfacts/{barcode}': { get: { operationId: 'lookupOpenFoodFacts', @@ -1381,6 +1402,28 @@ export function generateSpec() { '401': res401 } } + }, + '/api/openfoodfacts/{barcode}/save': { + post: { + operationId: 'saveOpenFoodFactsProduct', + tags: ['OpenFoodFacts'], + description: + 'Instantiate a personal food from an Open Food Facts product by barcode (copy-on-use). 
Idempotent: returns the existing food if already saved.', + requestParams: { path: z.object({ barcode: z.string() }) }, + responses: { + '200': { + description: 'Existing food returned', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '201': { + description: 'Created', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '400': res400, + '401': res401, + '404': res404 + } + } } } }); diff --git a/src/lib/server/openfoodfacts.test.ts b/src/lib/server/openfoodfacts.test.ts new file mode 100644 index 00000000..0775480e --- /dev/null +++ b/src/lib/server/openfoodfacts.test.ts @@ -0,0 +1,87 @@ +import { describe, it, expect, vi, afterEach } from 'vitest'; +import { searchProducts } from './openfoodfacts'; + +const okResponse = (body: unknown) => + new Response(JSON.stringify(body), { headers: { 'content-type': 'application/json' } }); + +describe('searchProducts (Open Food Facts text search)', () => { + afterEach(() => vi.unstubAllGlobals()); + + it('maps OFF search results to the OFFProduct shape', async () => { + const fetchMock = vi.fn().mockResolvedValue( + okResponse({ + products: [ + { + code: '7610200004444', + product_name: 'Test Chocolate', + brands: 'Frey', + nutriscore_grade: 'd', + nutriments: { + 'energy-kcal_100g': 540, + proteins_100g: 7.2, + carbohydrates_100g: 55, + fat_100g: 32, + fiber_100g: 4 + } + } + ] + }) + ); + vi.stubGlobal('fetch', fetchMock); + + const results = await searchProducts('chocolate', 5); + + expect(results).toHaveLength(1); + expect(results[0]).toMatchObject({ + name: 'Test Chocolate', + brand: 'Frey', + barcode: '7610200004444', + calories: 540, + protein: 7.2, + carbs: 55, + fat: 32, + fiber: 4, + servingSize: 100, + servingUnit: 'g', + nutriScore: 'd' + }); + }); + + it('filters out products without a name', async () => { + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue( + okResponse({ + products: [ + { code: '111', product_name: '', nutriments: {} }, + { code: '222', 
product_name: 'Has Name', nutriments: {} } + ] + }) + ) + ); + + const results = await searchProducts('x', 5); + expect(results.map((r) => r.name)).toEqual(['Has Name']); + }); + + it('returns [] on a non-ok response', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(new Response('', { status: 503 }))); + expect(await searchProducts('x')).toEqual([]); + }); + + it('returns [] on a network error', async () => { + vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('network down'))); + expect(await searchProducts('x')).toEqual([]); + }); + + it('clamps page_size to 20 and forwards the search term', async () => { + const fetchMock = vi.fn().mockResolvedValue(okResponse({ products: [] })); + vi.stubGlobal('fetch', fetchMock); + + await searchProducts('milk', 100); + + const url = String(fetchMock.mock.calls[0][0]); + expect(url).toContain('search_terms=milk'); + expect(url).toContain('page_size=20'); + }); +}); diff --git a/src/lib/server/validation/responses/openfoodfacts.ts b/src/lib/server/validation/responses/openfoodfacts.ts index b09e8e88..9c6a6179 100644 --- a/src/lib/server/validation/responses/openfoodfacts.ts +++ b/src/lib/server/validation/responses/openfoodfacts.ts @@ -32,3 +32,9 @@ export const openfoodfactsResponseSchema = z product: productSchema }) .meta({ id: 'OpenFoodFactsResponse' }); + +export const openfoodfactsSearchResponseSchema = z + .object({ + results: z.array(productSchema) + }) + .meta({ id: 'OpenFoodFactsSearchResponse' }); diff --git a/src/lib/services/food-service.svelte.ts b/src/lib/services/food-service.svelte.ts index dc3261d4..121c4704 100644 --- a/src/lib/services/food-service.svelte.ts +++ b/src/lib/services/food-service.svelte.ts @@ -208,6 +208,16 @@ async function saveFromCatalog(catalogId: string): Promise { return food; } +async function saveFromOFF(barcode: string): Promise { + const { data } = await api.POST('/api/openfoodfacts/{barcode}/save', { + params: { path: { barcode } } + }); + if (!data?.food) 
return null; + const food = data.food as unknown as DexieFood; + await db.foods.put(food); + return food; +} + export const foodService = { allFoods, foodById, @@ -219,5 +229,6 @@ export const foodService = { update, delete: deleteFood, findByBarcode, - saveFromCatalog + saveFromCatalog, + saveFromOFF }; diff --git a/src/routes/(app)/foods/+page.svelte b/src/routes/(app)/foods/+page.svelte index b4ad0e3b..3c7c1bfe 100644 --- a/src/routes/(app)/foods/+page.svelte +++ b/src/routes/(app)/foods/+page.svelte @@ -37,6 +37,10 @@ let offLoading = $state(false); let offNotFound = $state(false); let activeBarcode = $state(''); + let offResults = $state([]); + let offSearchLoading = $state(false); + // Below this many local matches, offer Open Food Facts results to fill the gap. + const OFF_FALLBACK_THRESHOLD = 5; let forceDeleteId: string | null = $state(null); let forceDeleteCount = $state(0); let qualityOpen = $state(false); @@ -94,6 +98,33 @@ const foods = $derived(debouncedQuery ? searchResults.value : allFoodsQuery.value); + // Online Open Food Facts fallback when the personal DB has few matches. + $effect(() => { + const q = debouncedQuery.trim(); + const localCount = foods.length; + if (!browser || q.length < 2 || localCount >= OFF_FALLBACK_THRESHOLD) { + offResults = []; + offSearchLoading = false; + return; + } + let cancelled = false; + offSearchLoading = true; + api + .GET('/api/openfoodfacts/search', { params: { query: { q } } }) + .then(({ data }) => { + if (!cancelled) offResults = data?.results ?? 
[]; + }) + .catch(() => { + if (!cancelled) offResults = []; + }) + .finally(() => { + if (!cancelled) offSearchLoading = false; + }); + return () => { + cancelled = true; + }; + }); + $effect(() => { if (browser) { foodService.refresh(); @@ -252,6 +283,13 @@ } } + const prefillFromOff = (product: components['schemas']['OpenFoodFactsProduct']) => { + resetFormState(); + offData = product; + activeBarcode = product.barcode; + showForm = true; + }; + // Load visible nutrients preference (once) $effect(() => { if (browser) { @@ -330,7 +368,7 @@ /> - {#if query && foods.length === 0} + {#if query && foods.length === 0 && !offSearchLoading && offResults.length === 0}

{m.foods_no_results()}

{:else} {/if} + + {#if debouncedQuery && (offSearchLoading || offResults.length > 0)} +
+

{m.add_food_off_section()}

+ {#if offSearchLoading} +

{m.add_food_off_searching()}

+ {:else} +
    + {#each offResults as product (product.barcode)} +
  • + + {product.name} + {#if product.brand} + · {product.brand}{/if} + + +
  • + {/each} +
+ {/if} +
+ {/if}