diff --git a/.gitignore b/.gitignore index 78138827..9e98a995 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,9 @@ vite.config.ts.timestamp-* # Worktrees .worktrees/ -.claude/worktrees/ + +# Claude Code +.claude/ # Superpowers .superpowers/ @@ -61,3 +63,7 @@ mobile/iosApp/*.xcodeproj # OpenAPI generator temp output .openapi-gen-tmp/ + +# Crawled catalog datasets — never commit (public repo; private data) +data/catalog/ +tests/fixtures/catalog/bad.jsonl diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 625b4faa..35144dec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,3 +43,11 @@ repos: entry: gitleaks protect --staged --verbose --redact language: system pass_filenames: false + + - id: no-catalog-data + name: no committed catalog datasets + entry: >- + bash -c 'if git diff --cached --name-only | grep -qE "^data/catalog/.*\.jsonl$"; + then echo "ERROR - catalog dataset files must not be committed (public repo)"; exit 1; fi' + language: system + pass_filenames: false diff --git a/crawler/README.md b/crawler/README.md new file mode 100644 index 00000000..4ca6c420 --- /dev/null +++ b/crawler/README.md @@ -0,0 +1,67 @@ +# Bissbilanz Catalog Crawler + +Offline tool that builds **catalog datasets** (normalized JSONL) for the access-gated +base food catalog. It is **not part of the SvelteKit app**, its build, or `bun run security` +scope — nothing under `src/` imports it. + +## Legal posture + +- **Private use, no redistribution.** Crawler _code_ ships in this repo; crawled _data_ never + does. Datasets are written under `data/catalog/` which is git-ignored and rejected by a + pre-commit hook (`no-catalog-data`). +- Output is imported only into this app's database and surfaced only to its authenticated, + individually access-granted users. It is not rehosted or redistributed. +- Retailer images are referenced by source URL only — never rehosted. 
+- Sources are accessed politely: fixed-delay throttling, on-disk response caching, descriptive + User-Agent, exponential-backoff retry. + +## Dataset format + +One JSONL file per dataset. Line 1 is a `{ "_dataset": { ... } }` header; lines 2..n are one +product per line. The contract is the shared Zod schema +`src/lib/server/catalog/dataset-schema.ts` — the crawler validates every emitted row against it, +so a produced file always imports cleanly (`catalog:import` is fail-closed). + +## Usage + +```bash +cd crawler +bun install # installs migros-api-wrapper (Migros source only) + +# Open Food Facts — from a downloaded ODbL bulk dump (.jsonl or .jsonl.gz): +# download once from https://world.openfoodfacts.org/data (openfoodfacts-products.jsonl.gz) +bun run crawl off /path/to/openfoodfacts-products.jsonl.gz +# → writes data/catalog/off-ch-<date>.jsonl (Swiss products with full core macros) + +# Migros — live API (polite, throttled): +bun run crawl migros +# → writes data/catalog/migros-<date>.jsonl +``` + +The OFF dump is large (tens of GB uncompressed); the crawler streams it (gunzip + line split), +never loading it into memory. The Migros crawl is live and rate-limited — expect it to take a +while; it checkpoints progress. + +## Importing on the server host + +The CLI that loads a dataset into Postgres runs **on the server host** (production Postgres is +Docker-internal), not from the crawler: + +```bash +scp data/catalog/migros-<date>.jsonl server:/tmp/ +ssh server +docker compose exec -T app bun run catalog:import /tmp/migros-<date>.jsonl +docker compose exec -T app bun run catalog:grant migros +``` + +Re-importing the same dataset `key` fully replaces its rows and preserves access grants. + +## Testing + +```bash +cd crawler && bun test +``` + +All tests are fixture-driven — no live network. Adapters split a pure, tested normalizer from +thin live-fetch glue; the glue (`createMigrosClient`, dump download) is exercised only by the +maintainer during a real crawl. 
diff --git a/crawler/adapters/migros/client.test.ts b/crawler/adapters/migros/client.test.ts new file mode 100644 index 00000000..3f6b1064 --- /dev/null +++ b/crawler/adapters/migros/client.test.ts @@ -0,0 +1,39 @@ +import { test, expect } from 'bun:test'; +import { join } from 'node:path'; +import { mapProductDetail, extractProductIds, pickDetail } from './client'; + +test('mapProductDetail reduces a Migros API product-detail to MigrosProductDetail', async () => { + const raw = await Bun.file( + join(import.meta.dir, '../../fixtures/migros-product-detail.json') + ).json(); + const d = mapProductDetail(raw); + expect(d).not.toBeNull(); + expect(d!.id).toBe('100001'); + expect(d!.name).toBe('M-Classic Vollmilch UHT'); + expect(d!.gtins).toEqual(['7610200000001']); + expect(d!.productUrl).toContain('100001'); + expect(d!.nutrition.basis).toBe('100g'); + expect(d!.nutrition.energyKcal).toBe(64); + expect(d!.nutrition.sugar).toBe(4.8); + expect(d!.nutrition.saturatedFat).toBe(2.1); + expect(d!.nutrition.salt).toBe(0.1); +}); + +test('mapProductDetail returns null when id or name is missing', () => { + expect(mapProductDetail({ name: 'no id' })).toBeNull(); + expect(mapProductDetail({ productId: '1' })).toBeNull(); +}); + +test('extractProductIds reads productIds or products[].id/uid', () => { + expect(extractProductIds({ productIds: ['a', 'b'] })).toEqual(['a', 'b']); + expect(extractProductIds({ products: [{ id: 'x' }, { uid: 'y' }] })).toEqual(['x', 'y']); + expect(extractProductIds({})).toEqual([]); + expect(extractProductIds(null)).toEqual([]); +}); + +test('pickDetail selects a single product from array/products/object shapes', () => { + expect(pickDetail([{ productId: '1' }])?.productId).toBe('1'); + expect(pickDetail({ products: [{ productId: '2' }] })?.productId).toBe('2'); + expect(pickDetail({ productId: '3' })?.productId).toBe('3'); + expect(pickDetail(null)).toBeNull(); +}); diff --git a/crawler/adapters/migros/client.ts b/crawler/adapters/migros/client.ts 
new file mode 100644 index 00000000..76d28ef9 --- /dev/null +++ b/crawler/adapters/migros/client.ts @@ -0,0 +1,138 @@ +import type { MigrosClient, MigrosProductDetail, MigrosNutrition } from './types'; + +type RawNutrientValue = { code?: string; value?: number | string }; +type RawProductDetail = { + productId?: string; + name?: string; + brand?: string; + gtins?: string[]; + productUrls?: Record; + image?: { original?: string }; + ingredients?: string; + nutrients?: { referenceValue?: string; values?: RawNutrientValue[] }; +}; + +type MigrosNumericKey = Exclude; + +const NUTRIENT_CODE: Record = { + energy_kcal: 'energyKcal', + protein: 'protein', + carbohydrate: 'carbohydrate', + of_which_sugars: 'sugar', + fat: 'fat', + of_which_saturated: 'saturatedFat', + dietary_fiber: 'fiber', + salt: 'salt' +}; + +function num(v: number | string | undefined): number | null { + if (v == null) return null; + const n = typeof v === 'string' ? parseFloat(v) : v; + return Number.isNaN(n) ? null : n; +} + +export function mapProductDetail(raw: RawProductDetail): MigrosProductDetail | null { + const id = raw.productId; + const name = raw.name; + if (!id || !name) return null; + const nutrition: MigrosNutrition = { basis: raw.nutrients?.referenceValue ?? '100g' }; + for (const entry of raw.nutrients?.values ?? []) { + const key = entry.code ? NUTRIENT_CODE[entry.code] : undefined; + if (key) nutrition[key] = num(entry.value); + } + return { + id, + name, + brand: raw.brand ?? null, + gtins: (raw.gtins ?? []).filter((g) => !!g), + productUrl: raw.productUrls?.de ?? Object.values(raw.productUrls ?? {})[0] ?? null, + imageUrl: raw.image?.original ?? null, + ingredients: raw.ingredients ?? null, + nutrition + }; +} + +export type MigrosClientConfig = { + /** Food category search terms or category ids to page through (host-confirmed). 
*/ + categories: string[]; + pageSize?: number; + maxPagesPerCategory?: number; +}; + +/** Best-effort extraction of product ids from a (loosely-typed) search response. */ +export function extractProductIds(res: unknown): string[] { + const r = res as { productIds?: string[]; products?: Array<{ id?: string; uid?: string }> }; + if (Array.isArray(r?.productIds)) return r.productIds.filter((id): id is string => !!id); + if (Array.isArray(r?.products)) { + return r.products.map((p) => p.id ?? p.uid).filter((id): id is string => !!id); + } + return []; +} + +/** Best-effort selection of the single product object from a product-detail response. */ +export function pickDetail(res: unknown): RawProductDetail | null { + if (!res) return null; + if (Array.isArray(res)) return (res[0] as RawProductDetail) ?? null; + const r = res as { products?: RawProductDetail[] }; + if (Array.isArray(r.products)) return r.products[0] ?? null; + return res as RawProductDetail; +} + +/** + * Live client backed by `migros-api-wrapper` (`MigrosAPI`: guest token → product search → + * product-detail). NOT unit-tested — no live network in CI (spec §12). The dependency is + * imported dynamically so the tested core type-checks/runs without loading axios/cheerio/pino. + * + * The wrapper's instance methods return `any` and some option types are inconsistent, so the + * call boundary is navigated through a narrow facade. The exact category ids/pagination params + * and the product-detail response field paths consumed by `mapProductDetail`/`extractProductIds` + * are verified against a live response on the server host during the first crawl (spec §13). + */ +export async function createMigrosClient(config: MigrosClientConfig): Promise { + const { MigrosAPI } = await import('migros-api-wrapper'); + const api = new MigrosAPI(); + // Guest token — public product data needs no login. 
+ const token = (await api.account.oauth2.loginGuestToken()) as string; + + const products = api.products as unknown as { + productSearch: { + searchProduct: ( + body: { query: string; [k: string]: unknown }, + options?: Record, + token?: string + ) => Promise; + }; + productDisplay: { + getProductDetails: ( + options: { uids: string | string[]; [k: string]: unknown }, + token?: string + ) => Promise; + }; + }; + + const pageSize = config.pageSize ?? 24; + const maxPages = config.maxPagesPerCategory ?? 1000; + + return { + async *listProductIds({ resume }) { + const first = resume ? Math.max(config.categories.indexOf(resume.category), 0) : 0; // resume: skip categories completed before the checkpoint; unknown category → full crawl + for (const category of config.categories.slice(first)) { + for (let page = resume?.category === category ? resume.page : 0; page < maxPages; page++) { + const res = await products.productSearch.searchProduct( + { query: category }, + { from: page * pageSize, hitsPerPage: pageSize }, + token + ); + const ids = extractProductIds(res); + for (const id of ids) yield { id, cursor: { category, page } }; + if (ids.length < pageSize) break; + } + } + }, + async getProduct(id) { + const res = await products.productDisplay.getProductDetails({ uids: id }, token); + const raw = pickDetail(res); + return raw ? mapProductDetail(raw) : null; + } + }; +} diff --git a/crawler/adapters/migros/crawl-migros.test.ts b/crawler/adapters/migros/crawl-migros.test.ts new file mode 100644 index 00000000..ce95e8a2 --- /dev/null +++ b/crawler/adapters/migros/crawl-migros.test.ts @@ -0,0 +1,78 @@ +import { test, expect } from 'bun:test'; +import { crawlMigros } from './crawl-migros'; +import { newStats } from '../../types'; +import type { MigrosClient, MigrosProductDetail } from './types'; + +function makeClient( + products: Record, + ids: string[] +): MigrosClient { + return { + async *listProductIds() { + let page = 0; + for (const id of ids) yield { id, cursor: { category: 'all', page: page++ } }; + }, + async getProduct(id) { + return products[id] ?? 
null; + } + }; +} + +const base: MigrosProductDetail = { + id: '1', + name: 'A', + gtins: ['7610200000001'], + productUrl: 'https://m/1', + nutrition: { basis: '100g', energyKcal: 64, protein: 3.3, carbohydrate: 4.8, fat: 3.5, fiber: 0 } +}; + +test('emits normalized products and dedupes repeated ids and barcodes', async () => { + const client = makeClient( + { + '1': base, + '2': { ...base, id: '2', name: 'B', gtins: ['7610200000002'] }, + '3': { ...base, id: '3', name: 'A-dup', gtins: ['7610200000001'] } // dup barcode + }, + ['1', '2', '2', '3'] // '2' listed twice + ); + const stats = newStats(); + const out = []; + for await (const p of crawlMigros(client, { stats, sleep: async () => {} })) out.push(p); + expect(out.map((p) => p.name).sort()).toEqual(['A', 'B']); + expect(stats.emitted).toBe(2); + expect(stats.dropReasons['dup']).toBe(2); // one dup id + one dup barcode +}); + +test('skips ids whose product detail is null', async () => { + const client = makeClient({ '1': base, '9': null }, ['1', '9']); + const out = []; + for await (const p of crawlMigros(client, { sleep: async () => {} })) out.push(p); + expect(out.length).toBe(1); +}); + +test('respects the limit option', async () => { + const client = makeClient({ '1': base, '2': { ...base, id: '2', gtins: ['7610200000002'] } }, [ + '1', + '2' + ]); + const out = []; + for await (const p of crawlMigros(client, { limit: 1, sleep: async () => {} })) out.push(p); + expect(out.length).toBe(1); +}); + +test('checkpoints only emitted products (after yield), not dropped ones', async () => { + const client = makeClient( + { '1': base, '2': { ...base, id: '2', name: 'B', gtins: ['7610200000002'] }, '9': null }, + ['1', '9', '2'] + ); + const cursors: Array<{ category: string; page: number }> = []; + const out = []; + for await (const p of crawlMigros(client, { + sleep: async () => {}, + onCheckpoint: (c) => void cursors.push(c) + })) + out.push(p); + // '9' has no detail (dropped) → not checkpointed; only the two 
emitted products are. + expect(out.length).toBe(2); + expect(cursors.length).toBe(2); +}); diff --git a/crawler/adapters/migros/crawl-migros.ts b/crawler/adapters/migros/crawl-migros.ts new file mode 100644 index 00000000..7dd7086a --- /dev/null +++ b/crawler/adapters/migros/crawl-migros.ts @@ -0,0 +1,58 @@ +import type { DatasetProduct, CrawlStats } from '../../types'; +import { newStats, recordDrop } from '../../types'; +import { migrosToDataset } from './normalize-migros'; +import type { MigrosClient } from './types'; + +export type MigrosCrawlOpts = { + limit?: number; + stats?: CrawlStats; + crawledAt?: string; + resume?: { category: string; page: number } | null; + sleep?: (ms: number) => Promise; + throttleMs?: number; + onCheckpoint?: (cursor: { category: string; page: number }) => Promise | void; + onProgress?: (stats: CrawlStats) => void; +}; + +export async function* crawlMigros( + client: MigrosClient, + opts: MigrosCrawlOpts = {} +): AsyncIterable { + const stats = opts.stats ?? newStats(); + const crawledAt = opts.crawledAt ?? new Date().toISOString(); + const sleep = opts.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms))); + const throttleMs = opts.throttleMs ?? 0; + const seenIds = new Set(); + const seenBarcodes = new Set(); + + for await (const { id, cursor } of client.listProductIds({ resume: opts.resume ?? 
null })) { + stats.seen++; + if (seenIds.has(id)) { + recordDrop(stats, 'dup:id'); + continue; + } + seenIds.add(id); + + const detail = await client.getProduct(id); + if (throttleMs > 0) await sleep(throttleMs); + if (!detail) { + recordDrop(stats, 'no-detail'); + continue; + } + const r = migrosToDataset(detail, crawledAt); + if (!r.ok) { + recordDrop(stats, r.reason); + continue; + } + if (r.product.barcode && seenBarcodes.has(r.product.barcode)) { + recordDrop(stats, 'dup:barcode'); + continue; + } + if (r.product.barcode) seenBarcodes.add(r.product.barcode); + stats.emitted++; + if (opts.onProgress && stats.emitted % 500 === 0) opts.onProgress(stats); + yield r.product; + if (opts.onCheckpoint) await opts.onCheckpoint(cursor); + if (opts.limit && stats.emitted >= opts.limit) return; + } +} diff --git a/crawler/adapters/migros/normalize-migros.test.ts b/crawler/adapters/migros/normalize-migros.test.ts new file mode 100644 index 00000000..ad280a7f --- /dev/null +++ b/crawler/adapters/migros/normalize-migros.test.ts @@ -0,0 +1,82 @@ +import { test, expect } from 'bun:test'; +import { migrosToDataset } from './normalize-migros'; +import type { MigrosProductDetail } from './types'; + +const detail: MigrosProductDetail = { + id: '100001', + name: 'M-Classic Vollmilch', + brand: 'M-Classic', + gtins: ['7610200000001'], + productUrl: 'https://www.migros.ch/de/product/100001', + imageUrl: 'https://image.migros.ch/100001.jpg', + nutrition: { + basis: '100g', + energyKcal: 64, + protein: 3.3, + carbohydrate: 4.8, + fat: 3.5, + fiber: 0, + sugar: 4.8, + saturatedFat: 2.1, + salt: 0.1 + } +}; + +test('maps a Migros product-detail to a valid dataset product (de)', () => { + const r = migrosToDataset(detail); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.name).toBe('M-Classic Vollmilch'); + expect(r.product.language).toBe('de'); + expect(r.product.barcode).toBe('7610200000001'); + expect(r.product.calories).toBe(64); + expect(r.product.sugar).toBe(4.8); + 
expect(r.product.saturatedFat).toBe(2.1); + expect(r.product.salt).toBe(0.1); + expect(r.product.sourceRef).toBe('100001'); + expect(r.product.vitaminC).toBeNull(); + } +}); + +test('rescales per-serving nutrition to per-100g when basis is grams', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { + ...detail.nutrition, + basis: '200g', + energyKcal: 128, + protein: 6.6, + carbohydrate: 9.6, + fat: 7, + fiber: 0 + } + }); + expect(r.ok).toBe(true); + if (r.ok) expect(r.product.calories).toBe(64); +}); + +test('drops a product with no GTIN', () => { + const r = migrosToDataset({ ...detail, gtins: [] }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('no-barcode'); +}); + +test('drops a product missing core macros', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { basis: '100g', energyKcal: 64, protein: 3.3 } + }); + expect(r.ok).toBe(false); +}); + +test('uses ml serving unit and rescales for an ml-based product', () => { + const r = migrosToDataset({ + ...detail, + nutrition: { ...detail.nutrition, basis: '200ml', energyKcal: 128 } + }); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.servingUnit).toBe('ml'); + expect(r.product.calories).toBe(64); + } +}); diff --git a/crawler/adapters/migros/normalize-migros.ts b/crawler/adapters/migros/normalize-migros.ts new file mode 100644 index 00000000..35e13b95 --- /dev/null +++ b/crawler/adapters/migros/normalize-migros.ts @@ -0,0 +1,53 @@ +import { buildDatasetProduct } from '../../lib/normalize'; +import type { BuildResult } from '../../types'; +import type { MigrosProductDetail } from './types'; + +function gramsBasis(basis: string | undefined): number | null { + if (!basis) return 100; // assume per-100g when unspecified + const m = basis + .trim() + .toLowerCase() + .match(/^(\d+(?:\.\d+)?)\s*(g|ml)$/); + if (!m) return null; + return parseFloat(m[1]); +} + +export function migrosToDataset(d: MigrosProductDetail, crawledAt?: string): BuildResult { + 
const barcode = (d.gtins ?? []).find((g) => g && g.trim().length > 0)?.trim(); + if (!barcode) return { ok: false, reason: 'no-barcode' }; + const name = (d.name ?? '').trim(); + if (!name) return { ok: false, reason: 'no-name' }; + + const basisG = gramsBasis(d.nutrition.basis); + if (basisG == null || basisG <= 0) return { ok: false, reason: 'bad-basis' }; + const servingUnit: 'g' | 'ml' = d.nutrition.basis?.trim().toLowerCase().endsWith('ml') + ? 'ml' + : 'g'; + const f = 100 / basisG; + const scale = (v: number | null | undefined): number | null => + v == null || Number.isNaN(v) ? null : Math.round(v * f * 100) / 100; + + return buildDatasetProduct({ + name: name.slice(0, 500), + brand: d.brand ?? null, + language: 'de', + servingSize: 100, + servingUnit, + calories: scale(d.nutrition.energyKcal), + protein: scale(d.nutrition.protein), + carbs: scale(d.nutrition.carbohydrate), + fat: scale(d.nutrition.fat), + fiber: scale(d.nutrition.fiber), + nutrients: { + sugar: scale(d.nutrition.sugar), + saturatedFat: scale(d.nutrition.saturatedFat), + salt: scale(d.nutrition.salt) + }, + barcode: barcode.slice(0, 32), + ingredientsText: d.ingredients?.slice(0, 10000) ?? null, + imageUrl: d.imageUrl ?? null, + sourceUrl: d.productUrl ?? null, + sourceRef: d.id, + crawledAt: crawledAt ?? null + }); +} diff --git a/crawler/adapters/migros/types.ts b/crawler/adapters/migros/types.ts new file mode 100644 index 00000000..9df8d6f4 --- /dev/null +++ b/crawler/adapters/migros/types.ts @@ -0,0 +1,32 @@ +export type MigrosNutrition = { + basis?: string; // e.g. 
"100g", "200g", "100ml" + energyKcal?: number | null; + protein?: number | null; + carbohydrate?: number | null; + fat?: number | null; + fiber?: number | null; + sugar?: number | null; + saturatedFat?: number | null; + salt?: number | null; +}; + +export type MigrosProductDetail = { + id: string; + name: string; + brand?: string | null; + gtins?: string[]; + productUrl?: string | null; + imageUrl?: string | null; + ingredients?: string | null; + nutrition: MigrosNutrition; +}; + +export interface MigrosClient { + /** Yields product ids for the configured food categories, page by page. */ + listProductIds(opts: { resume?: { category: string; page: number } | null }): AsyncIterable<{ + id: string; + cursor: { category: string; page: number }; + }>; + /** Fetches and normalizes one product detail; null if unavailable. */ + getProduct(id: string): Promise; +} diff --git a/crawler/adapters/off/crawl-off.test.ts b/crawler/adapters/off/crawl-off.test.ts new file mode 100644 index 00000000..78c5f8e2 --- /dev/null +++ b/crawler/adapters/off/crawl-off.test.ts @@ -0,0 +1,28 @@ +import { test, expect } from 'bun:test'; +import { join } from 'node:path'; +import { readDumpLines } from '../../lib/jsonl-stream'; +import { crawlOffDump } from './crawl-off'; +import { newStats } from '../../types'; + +const FIXTURE = join(import.meta.dir, '../../fixtures/off-sample.jsonl'); + +test('crawlOffDump emits only valid Swiss products from the fixture dump', async () => { + const stats = newStats(); + const names: string[] = []; + for await (const product of crawlOffDump(readDumpLines(FIXTURE), { stats })) + names.push(product.name); + expect(stats.seen).toBe(6); + expect(stats.emitted).toBe(2); + expect(stats.dropped).toBe(4); + expect(names).toContain('Zweifel Paprika Chips'); + expect(stats.dropReasons['not-swiss']).toBe(1); + expect(stats.dropReasons['no-barcode']).toBe(1); + expect(stats.dropReasons['no-name']).toBe(1); + expect(stats.dropReasons['missing-core']).toBe(1); +}); + 
+test('crawlOffDump respects the limit option', async () => { + const out = []; + for await (const p of crawlOffDump(readDumpLines(FIXTURE), { limit: 1 })) out.push(p); + expect(out.length).toBe(1); +}); diff --git a/crawler/adapters/off/crawl-off.ts b/crawler/adapters/off/crawl-off.ts new file mode 100644 index 00000000..963ca280 --- /dev/null +++ b/crawler/adapters/off/crawl-off.ts @@ -0,0 +1,48 @@ +import type { DatasetProduct, CrawlStats } from '../../types'; +import { newStats, recordDrop } from '../../types'; +import { offDumpToDataset } from './normalize-off'; +import type { OffDumpProduct } from './types'; + +export type OffCrawlOpts = { + limit?: number; + stats?: CrawlStats; + crawledAt?: string; + onProgress?: (stats: CrawlStats) => void; +}; + +/** + * Streams dump lines through the OFF normalizer and yields every accepted product. + * Lines that are not valid JSON, or valid JSON that is not an object, are dropped as `bad-json`. + * `onProgress` fires on every 10,000th line read, whether or not that line was emitted. + */ +export async function* crawlOffDump( + lines: AsyncIterable, + opts: OffCrawlOpts = {} +): AsyncIterable { + const stats = opts.stats ?? newStats(); + const crawledAt = opts.crawledAt ?? new Date().toISOString(); + for await (const line of lines) { + stats.seen++; + // Report by lines read, before any drop can `continue` past this point. + if (opts.onProgress && stats.seen % 10000 === 0) opts.onProgress(stats); + let raw: unknown; + try { + raw = JSON.parse(line); + } catch { + raw = null; + } + // A bare `null` parses fine but would crash the normalizer on property access; no non-object value is a product. + if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) { + recordDrop(stats, 'bad-json'); + continue; + } + const r = offDumpToDataset(raw as OffDumpProduct, crawledAt); + if (!r.ok) { + recordDrop(stats, r.reason); + continue; + } + stats.emitted++; + yield r.product; + if (opts.limit && stats.emitted >= opts.limit) return; + } +} diff --git a/crawler/adapters/off/normalize-off.test.ts b/crawler/adapters/off/normalize-off.test.ts new file mode 100644 index 00000000..157d5a13 --- /dev/null +++ b/crawler/adapters/off/normalize-off.test.ts @@ -0,0 +1,73 @@ +import { test, expect } from 'bun:test'; +import { offDumpToDataset } from './normalize-off'; + +const swissFull = { + code: '7610095131003', + product_name: 'Zweifel Paprika Chips', + brands: 'Zweifel', + lang: 'de', + countries_tags: ['en:switzerland'], + nutriscore_grade: 'd', + nova_group: 4, + nutriments: { + 
'energy-kcal_100g': 515, + proteins_100g: 5.8, + carbohydrates_100g: 53, + fat_100g: 30, + fiber_100g: 5.6, + 'saturated-fat_100g': 1.8, + salt_100g: 1.3 + } +}; + +test('maps a full Swiss product to a valid dataset product', () => { + const r = offDumpToDataset(swissFull); + expect(r.ok).toBe(true); + if (r.ok) { + expect(r.product.name).toBe('Zweifel Paprika Chips'); + expect(r.product.barcode).toBe('7610095131003'); + expect(r.product.calories).toBe(515); + expect(r.product.saturatedFat).toBe(1.8); + expect(r.product.salt).toBe(1.3); + expect(r.product.nutriScore).toBe('d'); + expect(r.product.sourceUrl).toContain('7610095131003'); + } +}); + +test('rejects a non-Swiss product', () => { + const r = offDumpToDataset({ ...swissFull, countries_tags: ['en:france'] }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('not-swiss'); +}); + +test('rejects a product with no barcode or no name', () => { + expect(offDumpToDataset({ ...swissFull, code: '' }).ok).toBe(false); + expect(offDumpToDataset({ ...swissFull, product_name: '' }).ok).toBe(false); +}); + +test('drops a product missing a core macro', () => { + const n = { ...swissFull.nutriments } as Record; + delete n['fiber_100g']; + const r = offDumpToDataset({ ...swissFull, nutriments: n }); + expect(r.ok).toBe(false); + if (!r.ok) expect(r.reason).toContain('fiber'); +}); + +test('derives kcal from kJ when energy-kcal is absent', () => { + const n = { ...swissFull.nutriments } as Record; + delete n['energy-kcal_100g']; + n['energy-kj_100g'] = 2155; // ~515 kcal + const r = offDumpToDataset({ ...swissFull, nutriments: n }); + expect(r.ok).toBe(true); + if (r.ok) expect(Math.round(r.product.calories)).toBe(515); +}); + +test('prefers product_name_de when present', () => { + const r = offDumpToDataset({ + ...swissFull, + product_name: 'Paprika Chips', + product_name_de: 'Paprika Chips DE' + }); + expect(r.ok).toBe(true); + if (r.ok) expect(r.product.name).toBe('Paprika Chips DE'); +}); diff --git 
a/crawler/adapters/off/normalize-off.ts b/crawler/adapters/off/normalize-off.ts new file mode 100644 index 00000000..4837b12c --- /dev/null +++ b/crawler/adapters/off/normalize-off.ts @@ -0,0 +1,57 @@ +import { extractAllNutrients } from '$lib/server/nutrient-extract'; +import { buildDatasetProduct } from '../../lib/normalize'; +import type { BuildResult } from '../../types'; +import type { OffDumpProduct } from './types'; + +const KJ_PER_KCAL = 4.184; +const NUTRISCORE = new Set(['a', 'b', 'c', 'd', 'e']); + +function num(v: number | string | undefined): number | null { + if (v == null) return null; + const n = typeof v === 'string' ? parseFloat(v) : v; + return Number.isNaN(n) ? null : n; +} + +export function offDumpToDataset(p: OffDumpProduct, crawledAt?: string): BuildResult { + const code = (p.code ?? '').trim(); + if (!code) return { ok: false, reason: 'no-barcode' }; + const name = (p.product_name_de || p.product_name || '').trim(); + if (!name) return { ok: false, reason: 'no-name' }; + if (!(p.countries_tags ?? []).includes('en:switzerland')) + return { ok: false, reason: 'not-swiss' }; + + const n = (p.nutriments ?? {}) as Record; + let calories = num(n['energy-kcal_100g']); + if (calories == null) { + const kj = num(n['energy-kj_100g']); + if (kj != null) calories = Math.round((kj / KJ_PER_KCAL) * 100) / 100; + } + + const grade = (p.nutriscore_grade ?? '').toLowerCase(); + const nova = num(p.nova_group); + const additives = (p.additives_tags ?? []).slice(0, 200); + const ingredients = (p.ingredients_text_de || p.ingredients_text || '').slice(0, 10000); + + return buildDatasetProduct({ + name: name.slice(0, 500), + brand: (p.brands ?? 
'').split(',')[0]?.trim() || null, + language: 'de', + servingSize: 100, + servingUnit: 'g', + calories, + protein: num(n['proteins_100g']), + carbs: num(n['carbohydrates_100g']), + fat: num(n['fat_100g']), + fiber: num(n['fiber_100g']), + nutrients: extractAllNutrients(n), + barcode: code.slice(0, 32), + nutriScore: NUTRISCORE.has(grade) ? (grade as 'a' | 'b' | 'c' | 'd' | 'e') : null, + novaGroup: nova != null && nova >= 1 && nova <= 4 ? Math.round(nova) : null, + additives: additives.length > 0 ? additives : null, + ingredientsText: ingredients.length > 0 ? ingredients : null, + imageUrl: p.image_front_url || p.image_url || null, + sourceUrl: `https://world.openfoodfacts.org/product/${code}`, + sourceRef: code, + crawledAt: crawledAt ?? null + }); +} diff --git a/crawler/adapters/off/types.ts b/crawler/adapters/off/types.ts new file mode 100644 index 00000000..dac48804 --- /dev/null +++ b/crawler/adapters/off/types.ts @@ -0,0 +1,17 @@ +export type OffDumpProduct = { + code?: string; + product_name?: string; + product_name_de?: string; + brands?: string; + lang?: string; + lc?: string; + countries_tags?: string[]; + nutriscore_grade?: string; + nova_group?: number | string; + additives_tags?: string[]; + ingredients_text?: string; + ingredients_text_de?: string; + image_url?: string; + image_front_url?: string; + nutriments?: Record; +}; diff --git a/crawler/bun.lock b/crawler/bun.lock new file mode 100644 index 00000000..9bbf9985 --- /dev/null +++ b/crawler/bun.lock @@ -0,0 +1,211 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "@bissbilanz/crawler", + "dependencies": { + "migros-api-wrapper": "1.1.37", + }, + }, + }, + "packages": { + "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], + + "agent-base": ["agent-base@6.0.2", "", { "dependencies": { "debug": "4" } }, 
"sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ=="], + + "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], + + "atomic-sleep": ["atomic-sleep@1.0.0", "", {}, "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ=="], + + "axios": ["axios@1.16.1", "", { "dependencies": { "follow-redirects": "^1.16.0", "form-data": "^4.0.5", "https-proxy-agent": "^5.0.1", "proxy-from-env": "^2.1.0" } }, "sha512-caYkukvroVPO8KrzuJEb50Hm07KwfBZPEC3VeFHTsqWHvKTsy54hjJz9BS/cdaypROE2rH6xvm9mHX4fgWkr3A=="], + + "balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="], + + "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="], + + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + + "brace-expansion": ["brace-expansion@2.1.1", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-WR1cURNjuvBLMZBMbqM0UoE+WAfdUcEV1ccD8PVBVOI+Z3ND4+SZbN8RsfT2bMuG1qwz5RFvPukSZm5fF2D5eA=="], + + "buffer": ["buffer@6.0.3", "", { "dependencies": { "base64-js": "^1.3.1", "ieee754": "^1.2.1" } }, "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], + + "cheerio": ["cheerio@1.2.0", "", { "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "encoding-sniffer": "^0.2.1", "htmlparser2": "^10.1.0", "parse5": "^7.3.0", 
"parse5-htmlparser2-tree-adapter": "^7.1.0", "parse5-parser-stream": "^7.1.2", "undici": "^7.19.0", "whatwg-mimetype": "^4.0.0" } }, "sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg=="], + + "cheerio-select": ["cheerio-select@2.1.0", "", { "dependencies": { "boolbase": "^1.0.0", "css-select": "^5.1.0", "css-what": "^6.1.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1" } }, "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g=="], + + "colorette": ["colorette@2.0.20", "", {}, "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w=="], + + "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], + + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + + "dateformat": ["dateformat@4.6.3", "", {}, "sha512-2P0p0pFGzHS5EMnhdxQi7aJN+iMheud0UhG4dlE1DLAlvL8JHjJJTX/CSm4JXwV0Ka5nGk3zC5mcb5bUQUxxMA=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "deepmerge": ["deepmerge@4.3.1", "", {}, "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A=="], + + "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], + + "dom-serializer": ["dom-serializer@2.0.0", "", { 
"dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], + + "domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], + + "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], + + "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], + + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], + + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + + "encoding-sniffer": ["encoding-sniffer@0.2.1", "", { "dependencies": { "iconv-lite": "^0.6.3", "whatwg-encoding": "^3.1.1" } }, "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw=="], + + "end-of-stream": ["end-of-stream@1.4.5", "", { "dependencies": { "once": "^1.4.0" } }, "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg=="], + + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, 
"sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.2", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-HWcBoN6NileqtSydK2FqHbS/LoDd2pqrnQHLyJzBj4kOp/ky2MWMN694xOfkK8/SnUsW2DH7EfyVlydKCsm1Zw=="], + + "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], + + "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], + + "events": ["events@3.3.0", "", {}, "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q=="], + + "fast-copy": ["fast-copy@3.0.2", "", {}, "sha512-dl0O9Vhju8IrcLndv2eU4ldt1ftXMqqfgN4H1cpmGV7P6jeB9FwpN9a2c8DPGE1Ys88rNUJVYDHq73CGAGOPfQ=="], + + "fast-redact": ["fast-redact@3.5.0", "", {}, "sha512-dwsoQlS7h9hMeYUq1W++23NDcBLV4KqONnITDV9DjfS3q1SgDGVrBdvvTLUotWtPSD7asWDV9/CmsZPy8Hf70A=="], + + "fast-safe-stringify": ["fast-safe-stringify@2.1.1", "", {}, "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA=="], + + "follow-redirects": ["follow-redirects@1.16.0", "", {}, "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw=="], + + "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], + + "fs.realpath": ["fs.realpath@1.0.0", "", {}, "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="], + + "function-bind": ["function-bind@1.1.2", "", {}, 
"sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + + "glob": ["glob@8.1.0", "", { "dependencies": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", "inherits": "2", "minimatch": "^5.0.1", "once": "^1.3.0" } }, "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ=="], + + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], + + "hasown": ["hasown@2.0.4", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-T2UbfbBEF32wiepXIsMlTW9+dDYC6wMh/t/vYA4tuOMKqWz/n3vr1NFSxQiyP+zk2mXsoMA/i/7qV6LKut1t1A=="], + + "help-me": ["help-me@4.2.0", "", { "dependencies": { "glob": "^8.0.0", "readable-stream": "^3.6.0" } }, "sha512-TAOnTB8Tz5Dw8penUuzHVrKNKlCIbwwbHnXraNJxPwf8LRtE2HlM84RYuezMFcwOJmoYOCWVDyJ8TQGxn9PgxA=="], + + "htmlparser2": ["htmlparser2@10.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", 
"domhandler": "^5.0.3", "domutils": "^3.2.2", "entities": "^7.0.1" } }, "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ=="], + + "https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="], + + "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], + + "ieee754": ["ieee754@1.2.1", "", {}, "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA=="], + + "inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="], + + "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], + + "joycon": ["joycon@3.1.1", "", {}, "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw=="], + + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + + "migros-api-wrapper": ["migros-api-wrapper@1.1.37", "", { "dependencies": { "axios": "^1.8.4", "cheerio": "^1.0.0-rc.12", "deepmerge": "^4.3.1", "dotenv": "^16.4.5", "pino": "^8.6.1", "pino-pretty": "^9.1.1" } }, "sha512-D69K7y2BFc2sU+jums4nFIihSU9BiczIeHir5TSeRP9K/e1W6IXCHRLIeB1Of5RKG0wEH8pz8Bxv6S8Fv21tgg=="], + + "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], + + "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, 
"sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], + + "minimatch": ["minimatch@5.1.9", "", { "dependencies": { "brace-expansion": "^2.0.1" } }, "sha512-7o1wEA2RyMP7Iu7GNba9vc0RWWGACJOCZBJX2GJWip0ikV+wcOsgVuY9uE8CPiyQhkGFSlhuSkZPavN7u1c2Fw=="], + + "minimist": ["minimist@1.2.8", "", {}, "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + + "on-exit-leak-free": ["on-exit-leak-free@2.1.2", "", {}, "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA=="], + + "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="], + + "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], + + "parse5-htmlparser2-tree-adapter": ["parse5-htmlparser2-tree-adapter@7.1.0", "", { "dependencies": { "domhandler": "^5.0.3", "parse5": "^7.0.0" } }, "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g=="], + + "parse5-parser-stream": ["parse5-parser-stream@7.1.2", "", { "dependencies": { "parse5": "^7.0.0" } }, "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow=="], + + "pino": ["pino@8.21.0", "", { "dependencies": { "atomic-sleep": "^1.0.0", "fast-redact": "^3.1.1", "on-exit-leak-free": "^2.1.0", "pino-abstract-transport": "^1.2.0", "pino-std-serializers": "^6.0.0", "process-warning": "^3.0.0", "quick-format-unescaped": "^4.0.3", 
"real-require": "^0.2.0", "safe-stable-stringify": "^2.3.1", "sonic-boom": "^3.7.0", "thread-stream": "^2.6.0" }, "bin": { "pino": "bin.js" } }, "sha512-ip4qdzjkAyDDZklUaZkcRFb2iA118H9SgRh8yzTkSQK8HilsOJF7rSY8HoW5+I0M46AZgX/pxbprf2vvzQCE0Q=="], + + "pino-abstract-transport": ["pino-abstract-transport@1.2.0", "", { "dependencies": { "readable-stream": "^4.0.0", "split2": "^4.0.0" } }, "sha512-Guhh8EZfPCfH+PMXAb6rKOjGQEoy0xlAIn+irODG5kgfYV+BQ0rGYYWTIel3P5mmyXqkYkPmdIkywsn6QKUR1Q=="], + + "pino-pretty": ["pino-pretty@9.4.1", "", { "dependencies": { "colorette": "^2.0.7", "dateformat": "^4.6.3", "fast-copy": "^3.0.0", "fast-safe-stringify": "^2.1.1", "help-me": "^4.0.1", "joycon": "^3.1.1", "minimist": "^1.2.6", "on-exit-leak-free": "^2.1.0", "pino-abstract-transport": "^1.0.0", "pump": "^3.0.0", "readable-stream": "^4.0.0", "secure-json-parse": "^2.4.0", "sonic-boom": "^3.0.0", "strip-json-comments": "^3.1.1" }, "bin": { "pino-pretty": "bin.js" } }, "sha512-loWr5SNawVycvY//hamIzyz3Fh5OSpvkcO13MwdDW+eKIGylobPLqnVGTDwDXkdmpJd1BhEG+qhDw09h6SqJiQ=="], + + "pino-std-serializers": ["pino-std-serializers@6.2.2", "", {}, "sha512-cHjPPsE+vhj/tnhCy/wiMh3M3z3h/j15zHQX+S9GkTBgqJuTuJzYJ4gUyACLhDaJ7kk9ba9iRDmbH2tJU03OiA=="], + + "process": ["process@0.11.10", "", {}, "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A=="], + + "process-warning": ["process-warning@3.0.0", "", {}, "sha512-mqn0kFRl0EoqhnL0GQ0veqFHyIN1yig9RHh/InzORTUiZHFRAur+aMtRkELNwGs9aNwKS6tg/An4NYBPGwvtzQ=="], + + "proxy-from-env": ["proxy-from-env@2.1.0", "", {}, "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA=="], + + "pump": ["pump@3.0.4", "", { "dependencies": { "end-of-stream": "^1.1.0", "once": "^1.3.1" } }, "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA=="], + + "quick-format-unescaped": ["quick-format-unescaped@4.0.4", "", {}, 
"sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg=="], + + "readable-stream": ["readable-stream@4.7.0", "", { "dependencies": { "abort-controller": "^3.0.0", "buffer": "^6.0.3", "events": "^3.3.0", "process": "^0.11.10", "string_decoder": "^1.3.0" } }, "sha512-oIGGmcpTLwPga8Bn6/Z75SVaH1z5dUut2ibSyAMVhmUggWpmDn2dapB0n7f8nwaSiRtepAsfJyfXIO5DCVAODg=="], + + "real-require": ["real-require@0.2.0", "", {}, "sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg=="], + + "safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="], + + "safe-stable-stringify": ["safe-stable-stringify@2.5.0", "", {}, "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA=="], + + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + + "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="], + + "sonic-boom": ["sonic-boom@3.8.1", "", { "dependencies": { "atomic-sleep": "^1.0.0" } }, "sha512-y4Z8LCDBuum+PBP3lSV7RHrXscqksve/bi0as7mhwVnBW+/wUqKT/2Kb7um8yqcFy0duYbbPxzt89Zy2nOCaxg=="], + + "split2": ["split2@4.2.0", "", {}, "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg=="], + + "string_decoder": ["string_decoder@1.3.0", "", { "dependencies": { "safe-buffer": "~5.2.0" } }, "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA=="], + + "strip-json-comments": ["strip-json-comments@3.1.1", "", {}, "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig=="], + + "thread-stream": ["thread-stream@2.7.0", "", { "dependencies": { "real-require": "^0.2.0" } }, 
"sha512-qQiRWsU/wvNolI6tbbCKd9iKaTnCXsTwVxhhKM6nctPdujTyztjlbUkUTUymidWcMnZ5pWR0ej4a0tjsW021vw=="], + + "undici": ["undici@7.26.0", "", {}, "sha512-3O9Tf67pGhgOv9jM35AbhkXAKi13f3oy3aE4CSgr+TckGeY+/iu97ZXN+J7DpHPzLbVApFd1IFhcnBjREYXYcg=="], + + "util-deprecate": ["util-deprecate@1.0.2", "", {}, "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="], + + "whatwg-encoding": ["whatwg-encoding@3.1.1", "", { "dependencies": { "iconv-lite": "0.6.3" } }, "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ=="], + + "whatwg-mimetype": ["whatwg-mimetype@4.0.0", "", {}, "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg=="], + + "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], + + "help-me/readable-stream": ["readable-stream@3.6.2", "", { "dependencies": { "inherits": "^2.0.3", "string_decoder": "^1.1.1", "util-deprecate": "^1.0.1" } }, "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA=="], + + "htmlparser2/entities": ["entities@7.0.1", "", {}, "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], + + "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + } +} diff --git a/crawler/fixtures/migros-product-detail.json b/crawler/fixtures/migros-product-detail.json new file mode 100644 index 00000000..97ce3199 --- /dev/null +++ b/crawler/fixtures/migros-product-detail.json @@ -0,0 +1,21 @@ +{ + "productId": "100001", + "name": "M-Classic Vollmilch UHT", + "brand": "M-Classic", + "gtins": ["7610200000001"], + "productUrls": { "de": "https://www.migros.ch/de/product/100001" }, + "image": { "original": "https://image.migros.ch/100001.jpg" }, + "nutrients": { + "referenceValue": "100g", + 
"values": [ + { "code": "energy_kcal", "value": 64 }, + { "code": "protein", "value": 3.3 }, + { "code": "carbohydrate", "value": 4.8 }, + { "code": "of_which_sugars", "value": 4.8 }, + { "code": "fat", "value": 3.5 }, + { "code": "of_which_saturated", "value": 2.1 }, + { "code": "dietary_fiber", "value": 0 }, + { "code": "salt", "value": 0.1 } + ] + } +} diff --git a/crawler/fixtures/off-sample.jsonl b/crawler/fixtures/off-sample.jsonl new file mode 100644 index 00000000..2f0f9180 --- /dev/null +++ b/crawler/fixtures/off-sample.jsonl @@ -0,0 +1,6 @@ +{"code":"7610095131003","product_name":"Zweifel Paprika Chips","brands":"Zweifel","lang":"de","countries_tags":["en:switzerland"],"nutriscore_grade":"d","nova_group":4,"nutriments":{"energy-kcal_100g":515,"proteins_100g":5.8,"carbohydrates_100g":53,"fat_100g":30,"fiber_100g":5.6,"saturated-fat_100g":1.8,"salt_100g":1.3}} +{"code":"7610095999999","product_name":"Bio Apfelsaft","brands":"Coop","lang":"de","countries_tags":["en:switzerland","en:france"],"nutriments":{"energy-kj_100g":192,"proteins_100g":0.2,"carbohydrates_100g":11,"fat_100g":0.1,"fiber_100g":0.2}} +{"code":"3017620422003","product_name":"Nutella","brands":"Ferrero","lang":"fr","countries_tags":["en:france"],"nutriments":{"energy-kcal_100g":539,"proteins_100g":6.3,"carbohydrates_100g":57,"fat_100g":30.9,"fiber_100g":0}} +{"code":"7610095000001","product_name":"Mystery Item","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1}} +{"code":"","product_name":"No Barcode","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1,"fiber_100g":1}} +{"code":"7610095000002","product_name":"","countries_tags":["en:switzerland"],"nutriments":{"energy-kcal_100g":100,"proteins_100g":1,"carbohydrates_100g":1,"fat_100g":1,"fiber_100g":1}} diff --git a/crawler/index.test.ts b/crawler/index.test.ts new file mode 100644 index 
00000000..6e34ad08 --- /dev/null +++ b/crawler/index.test.ts @@ -0,0 +1,19 @@ +import { test, expect } from 'bun:test'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { runOff } from './index'; +import { datasetHeaderSchema, datasetProductSchema } from '$lib/server/catalog/dataset-schema'; + +test('runOff produces a schema-valid dataset file from the fixture dump', async () => { + const dump = join(import.meta.dir, 'fixtures/off-sample.jsonl'); + const out = join(tmpdir(), `crawler-e2e-${process.pid}.jsonl`); + const stats = await runOff({ dumpPath: dump, outPath: out }); + expect(stats.emitted).toBe(2); + + const lines = (await Bun.file(out).text()).trim().split('\n'); + expect(lines.length).toBe(3); // header + 2 products + expect(datasetHeaderSchema.safeParse(JSON.parse(lines[0])).success).toBe(true); + for (const l of lines.slice(1)) { + expect(datasetProductSchema.safeParse(JSON.parse(l)).success).toBe(true); + } +}); diff --git a/crawler/index.ts b/crawler/index.ts new file mode 100644 index 00000000..d55931a6 --- /dev/null +++ b/crawler/index.ts @@ -0,0 +1,100 @@ +import { rmSync } from 'node:fs'; +import { readDumpLines } from './lib/jsonl-stream'; +import { crawlOffDump } from './adapters/off/crawl-off'; +import { crawlMigros } from './adapters/migros/crawl-migros'; +import { createMigrosClient } from './adapters/migros/client'; +import { DatasetWriter } from './lib/jsonl-writer'; +import { readCheckpoint, writeCheckpoint } from './lib/checkpoint'; +import { newStats, type CrawlStats } from './types'; + +// Root "food" category id(s) in the Migros taxonomy; refine on the host during a real crawl. 
// Root "food" category id(s) in the Migros taxonomy; refine on the host during a real crawl.
const MIGROS_FOOD_CATEGORIES = ['7494731'];
// Default location of the Migros resume cursor (lives next to the datasets).
const MIGROS_CHECKPOINT = 'data/catalog/.migros-checkpoint.json';

/**
 * Builds the Open Food Facts dataset from a local bulk dump.
 *
 * Streams `opts.dumpPath` line by line through `crawlOffDump` and writes every
 * yielded product to `opts.outPath` via a `DatasetWriter`.
 *
 * @param opts.dumpPath Path to the OFF dump handed to `readDumpLines`.
 * @param opts.outPath  Destination dataset file.
 * @returns The crawl statistics accumulated during the run.
 */
export async function runOff(opts: { dumpPath: string; outPath: string }): Promise<CrawlStats> {
	const stats = newStats();
	const writer = new DatasetWriter(opts.outPath, {
		key: 'off-ch',
		name: 'Open Food Facts (Switzerland)',
		source: 'off',
		priority: 20
	});
	await writer.open();
	try {
		for await (const product of crawlOffDump(readDumpLines(opts.dumpPath), {
			stats,
			onProgress: (s) =>
				console.error(`[off] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`)
		})) {
			await writer.write(product);
		}
	} finally {
		// Always release the sink, even when the crawl or a write throws.
		await writer.close();
	}
	console.error(`[off] done: ${stats.emitted} products → ${opts.outPath}`);
	console.error(`[off] drop reasons: ${JSON.stringify(stats.dropReasons)}`);
	return stats;
}

/**
 * Crawls the Migros catalog and writes it to `opts.outPath`.
 *
 * A cursor is persisted through `writeCheckpoint` whenever the crawler reports
 * one, and read back at start-up so an interrupted run can resume. The
 * checkpoint file is removed only after the crawl finished without throwing.
 *
 * @param opts.outPath        Destination dataset file.
 * @param opts.checkpointPath Cursor file; defaults to `MIGROS_CHECKPOINT`.
 * @returns The crawl statistics accumulated during the run.
 */
export async function runMigros(opts: {
	outPath: string;
	checkpointPath?: string;
}): Promise<CrawlStats> {
	const stats = newStats();
	const checkpointPath = opts.checkpointPath ?? MIGROS_CHECKPOINT;
	const resume = await readCheckpoint<{ category: string; page: number }>(checkpointPath);
	if (resume)
		console.error(`[migros] resuming from category ${resume.category} page ${resume.page}`);

	const client = await createMigrosClient({ categories: MIGROS_FOOD_CATEGORIES });
	// NOTE(review): the writer is opened fresh on outPath even when resuming. If
	// DatasetWriter.open() truncates an existing file (presumably it does — confirm),
	// products written before the checkpoint are not part of the resumed output.
	const writer = new DatasetWriter(opts.outPath, {
		key: 'migros',
		name: 'Migros (Switzerland)',
		source: 'migros',
		priority: 10
	});
	await writer.open();
	try {
		for await (const product of crawlMigros(client, {
			stats,
			throttleMs: 600,
			resume,
			onCheckpoint: (cursor) => writeCheckpoint(checkpointPath, cursor),
			onProgress: (s) =>
				console.error(`[migros] seen=${s.seen} emitted=${s.emitted} dropped=${s.dropped}`)
		})) {
			await writer.write(product);
		}
	} finally {
		await writer.close();
	}
	// Completed cleanly → drop the checkpoint so the next run starts fresh.
	rmSync(checkpointPath, { force: true });
	console.error(`[migros] done: ${stats.emitted} products → ${opts.outPath}`);
	console.error(`[migros] drop reasons: ${JSON.stringify(stats.dropReasons)}`);
	return stats;
}

/** Returns today's date as `YYYY-MM-DD`, taken from the UTC ISO timestamp. */
function dateStamp(): string {
	return new Date().toISOString().slice(0, 10);
}

/**
 * CLI dispatcher: `off <dumpPath> [outPath]` or `migros [outPath]`.
 *
 * @throws Error when the command is unknown or `off` is called without a dump path.
 */
async function main() {
	const [cmd, ...args] = process.argv.slice(2);
	if (cmd === 'off') {
		const dumpPath = args[0];
		// The dump path is mandatory, so the usage line must name it.
		if (!dumpPath) throw new Error('Usage: crawl off <dumpPath> [outPath]');
		await runOff({ dumpPath, outPath: args[1] ?? `data/catalog/off-ch-${dateStamp()}.jsonl` });
	} else if (cmd === 'migros') {
		await runMigros({ outPath: args[0] ?? `data/catalog/migros-${dateStamp()}.jsonl` });
	} else {
		throw new Error(`Unknown command: ${cmd ?? '(none)'}. Expected: off | migros`);
	}
}

// Run the CLI only when this file is the entry point (not when imported by tests).
if (import.meta.main) {
	main().catch((e) => {
		console.error(e instanceof Error ? e.message : String(e));
		process.exit(1);
	});
}
/**
 * Reads a JSON checkpoint written by {@link writeCheckpoint}.
 *
 * @param path Location of the checkpoint file.
 * @returns The parsed value cast to `T` (not validated), or `null` when the file
 *          does not exist or cannot be read/parsed as JSON.
 */
export async function readCheckpoint<T>(path: string): Promise<T | null> {
	const f = Bun.file(path);
	if (!(await f.exists())) return null;
	try {
		return JSON.parse(await f.text()) as T;
	} catch (err) {
		// Still best-effort (the caller starts from scratch), but say so: a silently
		// ignored corrupt checkpoint looks exactly like "no checkpoint" otherwise.
		const reason = err instanceof Error ? err.message : String(err);
		console.error(`[checkpoint] ignoring unreadable checkpoint ${path}: ${reason}`);
		return null;
	}
}

/**
 * Serializes `value` as JSON and writes it to `path`, replacing previous content.
 *
 * NOTE(review): the write goes straight to the target file; whether an interrupted
 * write can leave a partial file depends on `Bun.write` — confirm if resume
 * reliability matters.
 */
export async function writeCheckpoint(path: string, value: unknown): Promise<void> {
	await Bun.write(path, JSON.stringify(value));
}
/** Async string store used to memoize raw response bodies. */
export type CacheLike = {
	get: (key: string) => Promise<string | null>;
	set: (key: string, value: string) => Promise<void>;
};

/** Options for {@link createPoliteClient}; every field is optional. */
export type PoliteClientOpts = {
	/** Minimum gap between two requests and base of the retry backoff, in ms (default 500). */
	minDelayMs?: number;
	/** Total number of attempts before the last error is rethrown (default 4). */
	maxRetries?: number;
	/** Value sent as `User-Agent` unless overridden per request. */
	userAgent?: string;
	/** Injectable delay, used by tests to avoid real waiting. */
	sleep?: (ms: number) => Promise<void>;
	/** Injectable fetch implementation (defaults to the global `fetch`). */
	fetchImpl?: (url: string, init?: RequestInit) => Promise<Response>;
	/** Optional response cache; only successfully parsed bodies are stored. */
	cache?: CacheLike;
	/** Injectable clock in ms (defaults to `Date.now`). */
	now?: () => number;
};

const defaultSleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));

/**
 * Creates a throttled, retrying JSON-over-HTTP client.
 *
 * Requests are spaced at least `minDelayMs` apart, failures are retried with
 * exponential backoff (`minDelayMs * 2 ** attempt`), and responses can be
 * memoized through an optional cache.
 *
 * @param opts See {@link PoliteClientOpts}.
 * @returns An object exposing `getJson`.
 */
export function createPoliteClient(opts: PoliteClientOpts = {}) {
	const minDelayMs = opts.minDelayMs ?? 500;
	const maxRetries = opts.maxRetries ?? 4;
	const sleep = opts.sleep ?? defaultSleep;
	const doFetch = opts.fetchImpl ?? fetch;
	const now = opts.now ?? Date.now;
	const ua = opts.userAgent ?? 'Bissbilanz-Catalog-Crawler/1.0 (+private use; non-redistribution)';
	let lastAt = 0;

	/** Waits until at least `minDelayMs` has passed since the previous request. */
	async function throttle() {
		const wait = Math.max(0, lastAt + minDelayMs - now());
		if (wait > 0) await sleep(wait);
		lastAt = now();
	}

	/**
	 * GETs `url` and parses the body as JSON.
	 *
	 * @param url     Absolute URL to fetch.
	 * @param headers Extra request headers; they are also part of the cache key.
	 * @returns The parsed body cast to `T` (not validated), or `null` on HTTP 404/410.
	 * @throws The last error once `maxRetries` attempts have failed.
	 */
	async function getJson<T>(url: string, headers: Record<string, string> = {}): Promise<T | null> {
		// NOTE(review): header values (e.g. Authorization) end up verbatim in the cache key.
		const cacheKey = Object.keys(headers).length > 0 ? `${url}|${JSON.stringify(headers)}` : url;
		if (opts.cache) {
			const hit = await opts.cache.get(cacheKey);
			if (hit != null) {
				try {
					return JSON.parse(hit) as T;
				} catch {
					// Unparseable cache entry: treat as a miss and refetch; a successful
					// fetch below overwrites it.
				}
			}
		}
		let attempt = 0;
		for (;;) {
			await throttle();
			try {
				const res = await doFetch(url, { headers: { 'User-Agent': ua, ...headers } });
				// Gone/missing resources are a normal outcome, not an error worth retrying.
				if (res.status === 404 || res.status === 410) return null;
				if (!res.ok) throw new Error(`HTTP ${res.status}`);
				const text = await res.text();
				// Parse BEFORE caching so a non-JSON 200 body can never poison the cache.
				const parsed = JSON.parse(text) as T;
				if (opts.cache) await opts.cache.set(cacheKey, text);
				return parsed;
			} catch (err) {
				attempt++;
				if (attempt >= maxRetries) throw err;
				await sleep(minDelayMs * 2 ** attempt);
			}
		}
	}

	return { getJson };
}
/**
 * Splits a stream of UTF-8 bytes into trimmed, non-empty lines.
 *
 * Decoding is streaming-aware, so a multi-byte character split across two chunks
 * is reassembled. Lines are delimited by `\n`; surrounding whitespace (including a
 * trailing `\r`) is trimmed and blank lines are skipped. A final line without a
 * terminating newline is still yielded.
 *
 * @param source Byte chunks in order.
 * @returns The lines, in order of appearance.
 */
export async function* splitJsonlLines(source: AsyncIterable<Uint8Array>): AsyncIterable<string> {
	const decoder = new TextDecoder();
	// Text after the last newline seen so far; by construction it never contains '\n'.
	let pending = '';
	for await (const chunk of source) {
		const text = decoder.decode(chunk, { stream: true });
		if (text.length === 0) continue;
		// Only the newly decoded text can contain a newline, so start searching there
		// instead of rescanning a long partial line on every chunk.
		let searchFrom = pending.length;
		pending += text;
		let start = 0;
		let nl: number;
		while ((nl = pending.indexOf('\n', searchFrom)) >= 0) {
			const line = pending.slice(start, nl).trim();
			start = nl + 1;
			searchFrom = start;
			if (line.length > 0) yield line;
		}
		// Drop the consumed prefix once per chunk rather than once per line.
		if (start > 0) pending = pending.slice(start);
	}
	pending += decoder.decode(); // flush any buffered bytes from an incomplete trailing sequence
	const last = pending.trim();
	if (last.length > 0) yield last;
}

/**
 * Streams the lines of a JSONL dump from disk.
 *
 * The file is read as a stream; when `path` ends with `.gz` it is piped through
 * gzip decompression first. Lines are produced by {@link splitJsonlLines}.
 *
 * @param path Path to a `.jsonl` or `.jsonl.gz` file.
 */
export async function* readDumpLines(path: string): AsyncIterable<string> {
	const file = Bun.file(path);
	let stream: ReadableStream<Uint8Array> = file.stream();
	if (path.endsWith('.gz')) {
		stream = stream.pipeThrough(
			new DecompressionStream('gzip') as unknown as ReadableWritablePair<Uint8Array, Uint8Array>
		);
	}
	yield* splitJsonlLines(stream as unknown as AsyncIterable<Uint8Array>);
}
/**
 * Writes one catalog dataset as JSONL: a `{ "_dataset": … }` header line followed by one
 * product per line.
 *
 * Usage: `open()` once, `write()` per product, then `close()` to finish the file and get the
 * number of products written.
 */
export class DatasetWriter {
  #path: string;
  #header: DatasetHeaderInput;
  #snapshotAt: string;
  #sink: Bun.FileSink | null = null;
  #count = 0;

  /**
   * @param path       Output file path.
   * @param header     Dataset metadata emitted in the header line.
   * @param snapshotAt ISO timestamp stored in the header; defaults to the construction time.
   */
  constructor(path: string, header: DatasetHeaderInput, snapshotAt = new Date().toISOString()) {
    this.#path = path;
    this.#header = header;
    this.#snapshotAt = snapshotAt;
  }

  /** Open the output file and write the `_dataset` header line. */
  async open(): Promise<void> {
    this.#sink = Bun.file(this.#path).writer();
    const headerLine = JSON.stringify({
      _dataset: {
        key: this.#header.key,
        name: this.#header.name,
        source: this.#header.source,
        priority: this.#header.priority,
        version: this.#header.version ?? null,
        snapshotAt: this.#snapshotAt
      }
    });
    this.#sink.write(headerLine + '\n');
  }

  /**
   * Validate `product` against the shared dataset schema and append it as one JSON line.
   *
   * @throws Error if `open()` has not been called, or if the product fails schema validation.
   */
  async write(product: DatasetProduct): Promise<void> {
    if (!this.#sink) throw new Error('DatasetWriter.open() not called');
    // fail-closed: never write a line the importer would reject
    const parsed = datasetProductSchema.safeParse(product);
    if (!parsed.success) throw new Error(`invalid product: ${parsed.error.issues[0]?.message}`);
    this.#sink.write(JSON.stringify(product) + '\n');
    this.#count++;
  }

  /**
   * End the underlying sink (if open) and return how many products were written.
   * Safe to call when the writer was never opened; the count is then 0.
   */
  async close(): Promise<number> {
    if (this.#sink) await this.#sink.end();
    this.#sink = null;
    return this.#count;
  }
}
/** Core macros that every dataset product must carry; a missing one drops the product. */
const CORE = ['calories', 'protein', 'carbs', 'fat', 'fiber'] as const;

/**
 * Normalize adapter output into a schema-valid dataset product.
 *
 * Steps:
 * 1. Reject the input when any core macro is `null`/`undefined`/`NaN`
 *    (reason `missing-core:<key>`).
 * 2. Emit every key of `ALL_NUTRIENT_KEYS`, using `null` for absent or `NaN` values.
 * 3. Default all optional metadata fields to `null`.
 * 4. Validate the candidate against `datasetProductSchema`; on failure the reason is
 *    `schema:<path of the first issue>` (or `schema:invalid` when the path is empty).
 *
 * @param input Adapter-normalized product fields.
 * @returns `{ ok: true, product }` with the schema-parsed product, or `{ ok: false, reason }`.
 */
export function buildDatasetProduct(input: NormalizerInput): BuildResult {
  for (const k of CORE) {
    const v = input[k];
    if (v == null || Number.isNaN(v)) return { ok: false, reason: `missing-core:${k}` };
  }

  const nutrients: Record<string, number | null> = {};
  for (const key of ALL_NUTRIENT_KEYS) {
    const v = input.nutrients?.[key];
    nutrients[key] = v == null || Number.isNaN(v) ? null : v;
  }

  const candidate = {
    name: input.name,
    brand: input.brand ?? null,
    language: input.language ?? null,
    servingSize: input.servingSize,
    servingUnit: input.servingUnit,
    calories: input.calories,
    protein: input.protein,
    carbs: input.carbs,
    fat: input.fat,
    fiber: input.fiber,
    ...nutrients,
    barcode: input.barcode ?? null,
    nutriScore: input.nutriScore ?? null,
    novaGroup: input.novaGroup ?? null,
    additives: input.additives ?? null,
    ingredientsText: input.ingredientsText ?? null,
    imageUrl: input.imageUrl ?? null,
    sourceUrl: input.sourceUrl ?? null,
    sourceRef: input.sourceRef ?? null,
    crawledAt: input.crawledAt ?? null
  };

  const parsed = datasetProductSchema.safeParse(candidate);
  if (!parsed.success) {
    return { ok: false, reason: `schema:${parsed.error.issues[0]?.path.join('.') || 'invalid'}` };
  }
  return { ok: true, product: parsed.data };
}
Bissbilanz catalog datasets (not part of the app build).", + "scripts": { + "test": "bun test", + "check": "tsc --noEmit", + "crawl": "bun run index.ts" + }, + "dependencies": { + "migros-api-wrapper": "1.1.37" + } +} diff --git a/crawler/tsconfig.json b/crawler/tsconfig.json new file mode 100644 index 00000000..41d2e0a0 --- /dev/null +++ b/crawler/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "types": ["bun-types"], + "strict": true, + "skipLibCheck": true, + "noEmit": true, + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "resolveJsonModule": true, + "paths": { + "$lib": ["../src/lib"], + "$lib/*": ["../src/lib/*"] + } + }, + "include": ["**/*.ts"] +} diff --git a/crawler/types.test.ts b/crawler/types.test.ts new file mode 100644 index 00000000..c312916a --- /dev/null +++ b/crawler/types.test.ts @@ -0,0 +1,16 @@ +import { test, expect } from 'bun:test'; +import { NUTRIENT_KEYS, recordDrop, newStats } from './types'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; + +test('NUTRIENT_KEYS matches the app ALL_NUTRIENT_KEYS exactly (drift guard)', () => { + expect(([...NUTRIENT_KEYS] as string[]).sort()).toEqual([...ALL_NUTRIENT_KEYS].sort()); +}); + +test('recordDrop buckets reasons by prefix before the colon', () => { + const s = newStats(); + recordDrop(s, 'dup:id'); + recordDrop(s, 'dup:barcode'); + recordDrop(s, 'not-swiss'); + expect(s.dropped).toBe(3); + expect(s.dropReasons).toEqual({ dup: 2, 'not-swiss': 1 }); +}); diff --git a/crawler/types.ts b/crawler/types.ts new file mode 100644 index 00000000..bdd1352d --- /dev/null +++ b/crawler/types.ts @@ -0,0 +1,77 @@ +import type { DatasetProduct as SchemaDatasetProduct } from '$lib/server/catalog/dataset-schema'; + +/** + * The 43 extended-nutrient keys. 
/** Running counters for one crawl. */
export type CrawlStats = {
  seen: number;
  emitted: number;
  dropped: number;
  /** Drop counts bucketed by reason prefix (the text before the first `:`). */
  dropReasons: Record<string, number>;
};

/** Create a zeroed stats object. */
export function newStats(): CrawlStats {
  return { seen: 0, emitted: 0, dropped: 0, dropReasons: {} };
}

/**
 * Count one dropped product.
 *
 * Increments `stats.dropped` and the bucket for the reason's prefix — the text before the
 * first `:` (e.g. `dup:id` and `dup:barcode` both count under `dup`). A reason without a colon
 * is its own bucket.
 *
 * @param stats  Stats object to mutate.
 * @param reason Drop reason, optionally `prefix:detail`.
 */
export function recordDrop(stats: CrawlStats, reason: string): void {
  stats.dropped++;
  const key = reason.split(':', 1)[0] ?? reason;
  // Only count own properties: `dropReasons` is a plain object, so a prefix such as
  // "constructor" would otherwise read an inherited Object.prototype member instead of 0.
  const seenBefore = Object.prototype.hasOwnProperty.call(stats.dropReasons, key);
  const previous = seenBefore ? (stats.dropReasons[key] ?? 0) : 0;
  stats.dropReasons[key] = previous + 1;
}
{ + "schema": { + "type": "object", + "properties": { + "found": { + "type": "boolean" + } + }, + "required": ["found"], + "additionalProperties": false + } + } + } + } + } + } + }, + "/api/catalog/{id}/save": { + "post": { + "operationId": "saveCatalogFood", + "tags": ["Catalog"], + "description": "Instantiate a personal food from a catalog row (copy-on-use).", + "parameters": [ + { + "in": "path", + "name": "id", + "schema": { + "type": "string", + "format": "uuid", + "pattern": "^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-8][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}|00000000-0000-0000-0000-000000000000|ffffffff-ffff-ffff-ffff-ffffffffffff)$" + }, + "required": true + } + ], + "responses": { + "201": { + "description": "Created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FoodResponse" + } + } + } + }, + "401": { + "$ref": "#/components/responses/UnauthorizedResponse" + }, + "404": { + "$ref": "#/components/responses/NotFoundResponse" + }, + "409": { + "$ref": "#/components/responses/ConflictResponse" + } + } + } + }, + "/api/openfoodfacts/search": { + "get": { + "operationId": "searchOpenFoodFacts", + "tags": ["OpenFoodFacts"], + "description": "Text search Open Food Facts products. 
Online fallback used by the food picker when local + catalog results are sparse.", + "parameters": [ + { + "in": "query", + "name": "q", + "schema": { + "type": "string" + }, + "required": true + }, + { + "in": "query", + "name": "limit", + "schema": { + "type": "integer", + "minimum": -9007199254740991, + "maximum": 9007199254740991 + } + } + ], + "responses": { + "200": { + "description": "Success", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenFoodFactsSearchResponse" + } + } + } + }, + "401": { + "$ref": "#/components/responses/UnauthorizedResponse" + } + } + } + }, "/api/openfoodfacts/{barcode}": { "get": { "operationId": "lookupOpenFoodFacts", @@ -2416,6 +2618,54 @@ } } } + }, + "/api/openfoodfacts/{barcode}/save": { + "post": { + "operationId": "saveOpenFoodFactsProduct", + "tags": ["OpenFoodFacts"], + "description": "Instantiate a personal food from an Open Food Facts product by barcode (copy-on-use). Idempotent: returns the existing food if already saved.", + "parameters": [ + { + "in": "path", + "name": "barcode", + "schema": { + "type": "string" + }, + "required": true + } + ], + "responses": { + "200": { + "description": "Existing food returned", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FoodResponse" + } + } + } + }, + "201": { + "description": "Created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FoodResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/ValidationErrorResponse" + }, + "401": { + "$ref": "#/components/responses/UnauthorizedResponse" + }, + "404": { + "$ref": "#/components/responses/NotFoundResponse" + } + } + } } }, "components": { @@ -9082,14 +9332,17 @@ "required": ["date", "eveningCalories", "sleepDurationMinutes", "sleepQuality"], "additionalProperties": false }, - "OpenFoodFactsResponse": { + "OpenFoodFactsSearchResponse": { "type": "object", "properties": { - "product": { - "$ref": 
"#/components/schemas/OpenFoodFactsProduct" + "results": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenFoodFactsProduct" + } } }, - "required": ["product"], + "required": ["results"], "additionalProperties": false }, "OpenFoodFactsProduct": { @@ -9653,6 +9906,16 @@ "ingredientsText" ], "additionalProperties": false + }, + "OpenFoodFactsResponse": { + "type": "object", + "properties": { + "product": { + "$ref": "#/components/schemas/OpenFoodFactsProduct" + } + }, + "required": ["product"], + "additionalProperties": false } }, "responses": { @@ -9688,6 +9951,16 @@ } } } + }, + "NotFoundResponse": { + "description": "Not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } } }, "securitySchemes": { diff --git a/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md b/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md new file mode 100644 index 00000000..4bc6fa50 --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-base-food-catalog-foundation-and-integration.md @@ -0,0 +1,1859 @@ +# Base Food Catalog — Foundation & App Integration Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Ship the access-gated base food catalog end-to-end (DB tables, dataset file format, admin CLIs, online search/barcode/save endpoints, picker + barcode UI) proven against JSONL fixtures — no crawler yet. + +**Architecture:** Three new read-only tables (`catalog_datasets`, `catalog_foods`, `catalog_access`, M:N grants) populated by a `bun` CLI run on the server host from a Zod-validated JSONL file. The catalog is reached only through new online endpoints (`/api/catalog/*`), never synced into Dexie. 
Picking a catalog result calls a server-side instantiate endpoint that creates a normal personal `foods` row via the existing `createFood` path, which then syncs to Dexie and is logged normally. + +**Tech Stack:** SvelteKit 2 / Svelte 5 runes, Bun, Drizzle ORM + Postgres (postgres-js), Zod, Vitest (+ Testcontainers integration), Paraglide i18n, zod-openapi. + +**Spec:** `docs/superpowers/specs/2026-05-18-base-food-catalog-crawler-design.md` (this plan covers v1 Phases 1–2 only; OFF/Migros adapters = follow-on plans; Coop = v1.1). + +--- + +## File Structure + +**Phase 1 — Foundation** + +- Modify `src/lib/server/schema.ts` — add `catalogDatasets`, `catalogFoods`, `catalogAccess`. +- Generate+hand-edit `drizzle/0037_*.sql` — `CREATE EXTENSION pg_trgm` + GIN index appended. +- Create `src/lib/server/catalog/dataset-schema.ts` — Zod schema for the JSONL contract (header + product), shared by importer and (future) crawler. +- Create `src/lib/server/catalog/dataset-schema.test.ts` — unit tests. +- Create `scripts/catalog.ts` — one CLI with `import` / `grant` / `revoke` / `list` subcommands (shared DB-connect boilerplate, DRY). +- Modify `package.json` — `catalog:*` scripts. +- Modify `.gitignore` — ignore `data/catalog/`. +- Modify `.pre-commit-config.yaml` — local hook rejecting committed `data/catalog/*.jsonl`. +- Create `tests/integration-db/catalog-schema.test.ts` — drift guard + extension/index existence. +- Create `tests/integration-db/catalog-import.test.ts` — import/grant/revoke/list behavior. +- Create `tests/fixtures/catalog/mini.jsonl` — tiny valid dataset fixture. + +**Phase 2 — App integration** + +- Create `src/lib/server/nutrient-extract.ts` — pure nutrient-extraction helper (extracted from `openfoodfacts.ts`, Gap 8). +- Modify `src/lib/server/openfoodfacts.ts` — use the shared helper (behavior-preserving). +- Create `src/lib/server/catalog/queries.ts` — `catalogSearch`, `catalogByBarcode`, `instantiateCatalogFood`. 
+- Create `src/routes/api/catalog/search/+server.ts`, `src/routes/api/catalog/barcode/[code]/+server.ts`, `src/routes/api/catalog/[id]/save/+server.ts`. +- Modify `src/lib/server/openapi.ts` — declare the 3 catalog routes; regenerate `docs/openapi.json` + `src/lib/api/generated/schema.d.ts`. +- Modify `messages/en.json`, `messages/de.json` — catalog UI strings. +- Modify `src/lib/components/entries/FoodPicker.svelte` — online catalog search + source badge. +- Modify `src/lib/components/entries/AddFoodModal.svelte` — catalog pick → save → log. +- Modify `src/lib/services/food-service.svelte.ts` — `saveFromCatalog`. +- Modify `src/lib/components/entries/DayLog.svelte` — barcode miss → catalog before OFF. +- Create `tests/integration-db/catalog-endpoints.test.ts` — access gating, priority tie-break, instantiate, `/api/foods` isolation. +- Create `src/lib/server/nutrient-extract.test.ts` — unit test. + +--- + +# PHASE 1 — Catalog Foundation + +### Task 1: Catalog schema + migration (with hand-appended pg_trgm) + drift-guard test + +**Files:** +- Test: `tests/integration-db/catalog-schema.test.ts` (create) +- Modify: `src/lib/server/schema.ts` (append after the `foods` table block) +- Generate: `drizzle/0037_*.sql` + `drizzle/meta/_journal.json` (drizzle-kit), then hand-edit the `.sql` + +- [ ] **Step 1: Write the failing drift-guard + extension integration test** + +Create `tests/integration-db/catalog-schema.test.ts`: + +```typescript +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { sql } from 'drizzle-orm'; +import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers'; +import { ALL_NUTRIENTS } from '$lib/nutrients'; + +const DB_NAME = 'test_catalog_schema'; +let dbUrl: string; + +beforeAll(async () => { + dbUrl = await createTestDatabase(DB_NAME); + await runTestMigrations(dbUrl); +}); + +afterAll(async () => { + await closeTestDB(dbUrl); + await dropTestDatabase(DB_NAME); +}); + +async 
function columns(db: ReturnType<typeof getTestDB>, table: string): Promise<Set<string>> { + const rows = await db.execute( + sql`SELECT column_name FROM information_schema.columns WHERE table_name = ${table}` + ); + return new Set((rows as unknown as { column_name: string }[]).map((r) => r.column_name)); +} + +describe('catalog schema', () => { + it('catalog_foods nutrient columns match foods and ALL_NUTRIENTS exactly', async () => { + const db = getTestDB(dbUrl); + const catalogCols = await columns(db, 'catalog_foods'); + const foodCols = await columns(db, 'foods'); + for (const n of ALL_NUTRIENTS) { + expect(catalogCols.has(n.dbColumn), `catalog_foods missing ${n.dbColumn}`).toBe(true); + expect(foodCols.has(n.dbColumn), `foods missing ${n.dbColumn}`).toBe(true); + } + }); + + it('pg_trgm extension and GIN name index exist', async () => { + const db = getTestDB(dbUrl); + const ext = await db.execute(sql`SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm'`); + expect((ext as unknown as unknown[]).length).toBe(1); + const idx = await db.execute( + sql`SELECT 1 FROM pg_indexes WHERE indexname = 'idx_catalog_foods_name_trgm'` + ); + expect((idx as unknown as unknown[]).length).toBe(1); + }); + + it('catalog_datasets.key is unique and catalog_access has composite PK', async () => { + const db = getTestDB(dbUrl); + const ds = await columns(db, 'catalog_datasets'); + expect(ds.has('key')).toBe(true); + expect(ds.has('priority')).toBe(true); + const acc = await columns(db, 'catalog_access'); + expect(acc.has('user_id')).toBe(true); + expect(acc.has('dataset_id')).toBe(true); + }); +}); +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-schema.test.ts` +Expected: FAIL — migration has no `catalog_foods` table (`relation "catalog_foods" does not exist` or column assertions fail). 
+ +- [ ] **Step 3: Add the three tables to `src/lib/server/schema.ts`** + +Append immediately after the closing `);` of the `foods` table definition (the import block at the top already includes `pgTable, uuid, text, timestamp, real, boolean, integer, index, uniqueIndex, primaryKey, check` and `sql` — no import changes needed): + +```typescript +export const catalogDatasets = pgTable('catalog_datasets', { + id: uuid('id').primaryKey().defaultRandom(), + key: text('key').notNull().unique(), + name: text('name').notNull(), + source: text('source').notNull(), + priority: integer('priority').notNull().default(100), + description: text('description'), + productCount: integer('product_count'), + version: text('version'), + snapshotAt: timestamp('snapshot_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow(), + updatedAt: timestamp('updated_at', { withTimezone: true }).defaultNow() +}); + +export const catalogFoods = pgTable( + 'catalog_foods', + { + id: uuid('id').primaryKey().defaultRandom(), + datasetId: uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + name: text('name').notNull(), + brand: text('brand'), + language: text('language'), + servingSize: real('serving_size').notNull(), + servingUnit: servingUnitEnum('serving_unit').notNull(), + calories: real('calories').notNull(), + protein: real('protein').notNull(), + carbs: real('carbs').notNull(), + fat: real('fat').notNull(), + fiber: real('fiber').notNull(), + // Advanced nutrients — fat breakdown + saturatedFat: real('saturated_fat'), + monounsaturatedFat: real('monounsaturated_fat'), + polyunsaturatedFat: real('polyunsaturated_fat'), + transFat: real('trans_fat'), + cholesterol: real('cholesterol'), + omega3: real('omega3'), + omega6: real('omega6'), + // Sugar & carb details + sugar: real('sugar'), + addedSugars: real('added_sugars'), + sugarAlcohols: real('sugar_alcohols'), + starch: real('starch'), + // Minerals 
+ sodium: real('sodium'), + potassium: real('potassium'), + calcium: real('calcium'), + iron: real('iron'), + magnesium: real('magnesium'), + phosphorus: real('phosphorus'), + zinc: real('zinc'), + copper: real('copper'), + manganese: real('manganese'), + selenium: real('selenium'), + iodine: real('iodine'), + fluoride: real('fluoride'), + chromium: real('chromium'), + molybdenum: real('molybdenum'), + chloride: real('chloride'), + // Vitamins + vitaminA: real('vitamin_a'), + vitaminC: real('vitamin_c'), + vitaminD: real('vitamin_d'), + vitaminE: real('vitamin_e'), + vitaminK: real('vitamin_k'), + vitaminB1: real('vitamin_b1'), + vitaminB2: real('vitamin_b2'), + vitaminB3: real('vitamin_b3'), + vitaminB5: real('vitamin_b5'), + vitaminB6: real('vitamin_b6'), + vitaminB7: real('vitamin_b7'), + vitaminB9: real('vitamin_b9'), + vitaminB12: real('vitamin_b12'), + // Other + caffeine: real('caffeine'), + alcohol: real('alcohol'), + water: real('water'), + salt: real('salt'), + barcode: text('barcode'), + nutriScore: text('nutri_score'), + novaGroup: integer('nova_group'), + additives: text('additives').array(), + ingredientsText: text('ingredients_text'), + imageUrl: text('image_url'), + sourceUrl: text('source_url'), + sourceRef: text('source_ref'), + crawledAt: timestamp('crawled_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow() + }, + (table) => [ + index('idx_catalog_foods_dataset').on(table.datasetId), + index('idx_catalog_foods_dataset_barcode').on(table.datasetId, table.barcode), + check('catalog_foods_serving_positive', sql`${table.servingSize} > 0`), + check( + 'catalog_foods_nutrition_nonnegative', + sql`${table.calories} >= 0 AND ${table.protein} >= 0 AND ${table.carbs} >= 0 AND ${table.fat} >= 0 AND ${table.fiber} >= 0` + ) + ] +); + +export const catalogAccess = pgTable( + 'catalog_access', + { + userId: uuid('user_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + datasetId: 
uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + grantedAt: timestamp('granted_at', { withTimezone: true }).defaultNow() + }, + (table) => [primaryKey({ columns: [table.userId, table.datasetId] })] +); +``` + +The `name gin_trgm_ops` index is intentionally NOT declared here — drizzle-kit cannot emit the opclass or `CREATE EXTENSION`; it is hand-appended to the migration (Step 5) and verified by the test. + +- [ ] **Step 4: Generate the migration** + +Run: `bun run db:generate` +Expected: a new file `drizzle/0037_<name>.sql` is created and `drizzle/meta/_journal.json` gains an `idx: 37` entry. Note the exact generated filename. + +- [ ] **Step 5: Hand-append the pg_trgm extension + GIN index to the generated SQL** + +Open the generated `drizzle/0037_<name>.sql`. At the very END of the file, append (the last existing statement already ends with `--> statement-breakpoint` or is the last `CREATE TABLE`; ensure a `--> statement-breakpoint` separates them): + +```sql +--> statement-breakpoint +CREATE EXTENSION IF NOT EXISTS pg_trgm;--> statement-breakpoint +CREATE INDEX "idx_catalog_foods_name_trgm" ON "catalog_foods" USING gin ("name" gin_trgm_ops); +``` + +Do NOT run `bun run db:generate` again after this edit (it would overwrite the manual SQL — this file is now frozen; the journal already records it). + +- [ ] **Step 6: Run the test to verify it passes** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-schema.test.ts` +Expected: PASS (3 tests). Testcontainers `postgres:18` ships `pg_trgm` in contrib. + +- [ ] **Step 7: Verify the dev server starts cleanly (migration safety rule)** + +Run: `timeout 25 bun run dev 2>&1 | head -40` +Expected: server boots, no "Migration failed" output. (Stop it; the migration applied via `runMigrations()`.) 
+ +- [ ] **Step 8: Commit** + +```bash +git add src/lib/server/schema.ts drizzle/ tests/integration-db/catalog-schema.test.ts +git commit -m "feat: add catalog schema (datasets/foods/access) + pg_trgm index" +``` + +--- + +### Task 2: Dataset JSONL Zod schema (the crawler↔importer contract) + +**Files:** +- Test: `src/lib/server/catalog/dataset-schema.test.ts` (create) +- Create: `src/lib/server/catalog/dataset-schema.ts` + +- [ ] **Step 1: Write the failing unit test** + +Create `src/lib/server/catalog/dataset-schema.test.ts`: + +```typescript +import { describe, it, expect } from 'vitest'; +import { datasetHeaderSchema, datasetProductSchema } from './dataset-schema'; + +describe('dataset-schema', () => { + it('accepts a valid header record', () => { + const r = datasetHeaderSchema.safeParse({ + _dataset: { + key: 'migros', + name: 'Migros (Switzerland)', + source: 'migros', + priority: 10, + version: '2026.05.18', + snapshotAt: '2026-05-18T00:00:00.000Z' + } + }); + expect(r.success).toBe(true); + }); + + it('accepts a minimal valid product line', () => { + const r = datasetProductSchema.safeParse({ + name: 'Zweifel Paprika Chips', + servingSize: 100, + servingUnit: 'g', + calories: 515, + protein: 5.8, + carbs: 53, + fat: 30, + fiber: 5.6 + }); + expect(r.success).toBe(true); + }); + + it('accepts known extended nutrients and OFF quality fields', () => { + const r = datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: 1, + saturatedFat: 5.1, + salt: 1.3, + barcode: '7610095131003', + language: 'de', + nutriScore: 'd', + novaGroup: 4, + additives: ['en:e330'], + sourceUrl: 'https://www.migros.ch/de/product/123', + sourceRef: '123' + }); + expect(r.success).toBe(true); + }); + + it('rejects a product missing required core macros', () => { + const r = datasetProductSchema.safeParse({ name: 'X', servingSize: 100, servingUnit: 'g' }); + expect(r.success).toBe(false); + }); + 
+ it('rejects negative nutrients and bad nutriScore', () => { + expect( + datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: -1, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0 + }).success + ).toBe(false); + expect( + datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 0, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0, + nutriScore: 'z' + }).success + ).toBe(false); + }); +}); +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `bun --bun vitest run src/lib/server/catalog/dataset-schema.test.ts` +Expected: FAIL — `Cannot find module './dataset-schema'`. + +- [ ] **Step 3: Implement the schema** + +Create `src/lib/server/catalog/dataset-schema.ts`: + +```typescript +import { z } from 'zod'; +import { servingUnitSchema } from '$lib/units'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; + +const optNutrient = z.coerce.number().nonnegative().optional().nullable(); +const nutrientFields = Object.fromEntries(ALL_NUTRIENT_KEYS.map((k) => [k, optNutrient])); + +export const datasetHeaderSchema = z.object({ + _dataset: z.object({ + key: z + .string() + .min(1) + .max(64) + .regex(/^[a-z0-9-]+$/), + name: z.string().min(1).max(200), + source: z.enum(['migros', 'off', 'coop']), + priority: z.coerce.number().int().min(0).max(1000), + version: z.string().max(64).optional().nullable(), + snapshotAt: z.string().datetime().optional().nullable() + }) +}); + +export const datasetProductSchema = z.object({ + name: z.string().min(1).max(500), + brand: z.string().max(500).optional().nullable(), + language: z.enum(['de', 'fr', 'it', 'en']).optional().nullable(), + servingSize: z.coerce.number().positive(), + servingUnit: servingUnitSchema, + calories: z.coerce.number().nonnegative(), + protein: z.coerce.number().nonnegative(), + carbs: z.coerce.number().nonnegative(), + fat: z.coerce.number().nonnegative(), + fiber: z.coerce.number().nonnegative(), + ...nutrientFields, + 
barcode: z.string().max(32).optional().nullable(), + nutriScore: z.enum(['a', 'b', 'c', 'd', 'e']).optional().nullable(), + novaGroup: z.coerce.number().int().min(1).max(4).optional().nullable(), + additives: z.array(z.string().max(100)).max(200).optional().nullable(), + ingredientsText: z.string().max(10000).optional().nullable(), + imageUrl: z.string().url().max(2000).optional().nullable(), + sourceUrl: z.string().url().max(2000).optional().nullable(), + sourceRef: z.string().max(200).optional().nullable(), + crawledAt: z.string().datetime().optional().nullable() +}); + +export type DatasetHeader = z.infer<typeof datasetHeaderSchema>; +export type DatasetProduct = z.infer<typeof datasetProductSchema>; +``` + +- [ ] **Step 4: Run the test to verify it passes** + +Run: `bun --bun vitest run src/lib/server/catalog/dataset-schema.test.ts` +Expected: PASS (5 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/lib/server/catalog/dataset-schema.ts src/lib/server/catalog/dataset-schema.test.ts +git commit -m "feat: add catalog dataset JSONL Zod schema" +``` + +--- + +### Task 3: Catalog CLI — `import` subcommand + fixture + integration test + +**Files:** +- Create: `tests/fixtures/catalog/mini.jsonl` +- Test: `tests/integration-db/catalog-import.test.ts` (create) +- Create: `scripts/catalog.ts` +- Modify: `package.json` (scripts block) + +- [ ] **Step 1: Create the fixture dataset** + +Create `tests/fixtures/catalog/mini.jsonl` (exactly these 3 lines; line 1 is the header): + +``` +{"_dataset":{"key":"testset","name":"Test Set","source":"migros","priority":10,"version":"t1","snapshotAt":"2026-05-18T00:00:00.000Z"}} +{"name":"Zweifel Paprika Chips","brand":"Zweifel","language":"de","servingSize":100,"servingUnit":"g","calories":515,"protein":5.8,"carbs":53,"fat":30,"fiber":5.6,"saturatedFat":1.8,"salt":1.3,"barcode":"7610095131003","sourceUrl":"https://www.migros.ch/de/product/1","sourceRef":"1"} +{"name":"Coop Naturaplan Bio 
Apfel","brand":"Coop","language":"de","servingSize":100,"servingUnit":"g","calories":52,"protein":0.3,"carbs":14,"fat":0.2,"fiber":2.4,"barcode":"7610095131004","sourceUrl":"https://www.migros.ch/de/product/2","sourceRef":"2"} +``` + +- [ ] **Step 2: Write the failing import integration test** + +Create `tests/integration-db/catalog-import.test.ts`: + +```typescript +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { eq } from 'drizzle-orm'; +import { $ } from 'bun'; +import { join } from 'node:path'; +import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers'; +import { catalogDatasets, catalogFoods } from '$lib/server/schema'; + +const DB_NAME = 'test_catalog_import'; +let dbUrl: string; +const FIXTURE = join(process.cwd(), 'tests/fixtures/catalog/mini.jsonl'); + +beforeAll(async () => { + dbUrl = await createTestDatabase(DB_NAME); + await runTestMigrations(dbUrl); +}); +afterAll(async () => { + await closeTestDB(dbUrl); + await dropTestDatabase(DB_NAME); +}); + +describe('catalog:import', () => { + it('imports a dataset, upserts by key, replaces rows on re-import', async () => { + await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl }); + const db = getTestDB(dbUrl); + const ds = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }); + expect(ds).toBeDefined(); + expect(ds!.productCount).toBe(2); + const firstId = ds!.id; + const rows = await db.select().from(catalogFoods).where(eq(catalogFoods.datasetId, firstId)); + expect(rows.length).toBe(2); + expect(rows.find((r) => r.barcode === '7610095131003')!.name).toBe('Zweifel Paprika Chips'); + + // Re-import: same key reuses the dataset row (id stable), rows replaced + await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl }); + const ds2 = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') 
}); + expect(ds2!.id).toBe(firstId); + const rows2 = await db.select().from(catalogFoods).where(eq(catalogFoods.datasetId, firstId)); + expect(rows2.length).toBe(2); + }); + + it('fails closed on an invalid line and aborts the whole import', async () => { + const bad = join(process.cwd(), 'tests/fixtures/catalog/bad.jsonl'); + await Bun.write( + bad, + '{"_dataset":{"key":"badset","name":"Bad","source":"migros","priority":1}}\n{"name":"NoMacros","servingSize":100,"servingUnit":"g"}\n' + ); + let failed = false; + try { + await $`bun run scripts/catalog.ts import ${bad}` + .env({ ...process.env, DATABASE_URL: dbUrl }) + .quiet(); + } catch { + failed = true; + } + expect(failed).toBe(true); + const db = getTestDB(dbUrl); + const ds = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'badset') }); + expect(ds).toBeUndefined(); + }); +}); +``` + +- [ ] **Step 3: Run the test to verify it fails** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-import.test.ts` +Expected: FAIL — `scripts/catalog.ts` does not exist (`bun run` errors / non-zero exit). 
+ +- [ ] **Step 4: Implement `scripts/catalog.ts` (import subcommand)** + +Create `scripts/catalog.ts`: + +```typescript +#!/usr/bin/env bun +import { drizzle } from 'drizzle-orm/postgres-js'; +import { eq, sql } from 'drizzle-orm'; +import postgres from 'postgres'; +import { catalogDatasets, catalogFoods, catalogAccess, users } from '../src/lib/server/schema'; +import { datasetHeaderSchema, datasetProductSchema } from '../src/lib/server/catalog/dataset-schema'; +import { ALL_NUTRIENT_KEYS } from '../src/lib/nutrients'; + +const databaseUrl = process.env.DATABASE_URL; +if (!databaseUrl) { + console.error('DATABASE_URL environment variable is required'); + process.exit(1); +} +const client = postgres(databaseUrl, { max: 1 }); +const db = drizzle(client, { schema: { catalogDatasets, catalogFoods, catalogAccess, users } }); + +const TRGM_INDEX = 'idx_catalog_foods_name_trgm'; + +function pickNutrientCols(p: Record<string, unknown>) { + return Object.fromEntries(ALL_NUTRIENT_KEYS.map((k) => [k, (p[k] as number | null | undefined) ?? null])); +} + +async function importDataset(file: string) { + const text = await Bun.file(file).text(); + const lines = text.split('\n').filter((l) => l.trim().length > 0); + if (lines.length === 0) throw new Error('Empty dataset file'); + + const header = datasetHeaderSchema.parse(JSON.parse(lines[0]))._dataset; + + const products = lines.slice(1).map((line, i) => { + const parsed = datasetProductSchema.safeParse(JSON.parse(line)); + if (!parsed.success) { + throw new Error(`Invalid product at line ${i + 2}: ${parsed.error.message}`); + } + return parsed.data; + }); + + await db.transaction(async (tx) => { + const [ds] = await tx + .insert(catalogDatasets) + .values({ + key: header.key, + name: header.name, + source: header.source, + priority: header.priority, + version: header.version ?? null, + snapshotAt: header.snapshotAt ? 
new Date(header.snapshotAt) : null, + productCount: products.length, + updatedAt: new Date() + }) + .onConflictDoUpdate({ + target: catalogDatasets.key, + set: { + name: header.name, + source: header.source, + priority: header.priority, + version: header.version ?? null, + snapshotAt: header.snapshotAt ? new Date(header.snapshotAt) : null, + productCount: products.length, + updatedAt: new Date() + } + }) + .returning(); + + await tx.execute(sql`DROP INDEX IF EXISTS ${sql.identifier(TRGM_INDEX)}`); + await tx.delete(catalogFoods).where(eq(catalogFoods.datasetId, ds.id)); + + // Keep rows-per-insert × columns-per-row below Postgres' 65535 bind-parameter limit per statement (2000 rows of this width would exceed it). + const CHUNK = 500; + for (let i = 0; i < products.length; i += CHUNK) { + const slice = products.slice(i, i + CHUNK).map((p) => ({ + datasetId: ds.id, + name: p.name, + brand: p.brand ?? null, + language: p.language ?? null, + servingSize: p.servingSize, + servingUnit: p.servingUnit, + calories: p.calories, + protein: p.protein, + carbs: p.carbs, + fat: p.fat, + fiber: p.fiber, + barcode: p.barcode ?? null, + nutriScore: p.nutriScore ?? null, + novaGroup: p.novaGroup ?? null, + additives: p.additives ?? null, + ingredientsText: p.ingredientsText ?? null, + imageUrl: p.imageUrl ?? null, + sourceUrl: p.sourceUrl ?? null, + sourceRef: p.sourceRef ?? null, + crawledAt: p.crawledAt ? new Date(p.crawledAt) : null, + ...pickNutrientCols(p as Record<string, unknown>) + })); + if (slice.length > 0) await tx.insert(catalogFoods).values(slice); + } + + // CREATE INDEX needs a bare column name; interpolating the Drizzle column object renders a table-qualified name, which Postgres rejects here. + await tx.execute( + sql`CREATE INDEX ${sql.identifier(TRGM_INDEX)} ON ${catalogFoods} USING gin (${sql.identifier('name')} gin_trgm_ops)` + ); + }); + + console.log(`Imported ${products.length} products into dataset "${header.key}"`); +} + +const [cmd, ...args] = process.argv.slice(2); + +try { + if (cmd === 'import') { + if (!args[0]) throw new Error('Usage: catalog import <file.jsonl>'); + await importDataset(args[0]); + } else { + throw new Error(`Unknown command: ${cmd ?? '(none)'}. 
Expected: import|grant|revoke|list`); + } + await client.end(); + process.exit(0); +} catch (e) { + console.error(e instanceof Error ? e.message : String(e)); + await client.end(); + process.exit(1); +} +``` + +- [ ] **Step 5: Add the `catalog:import` package.json script** + +In `package.json`, inside `"scripts"`, after the `"test:seed"` line add: + +```json + "catalog:import": "bun run scripts/catalog.ts import", + "catalog:grant": "bun run scripts/catalog.ts grant", + "catalog:revoke": "bun run scripts/catalog.ts revoke", + "catalog:list": "bun run scripts/catalog.ts list" +``` + +(Add a trailing comma to the preceding `"test:seed": ...` line so JSON stays valid; the last new line gets no trailing comma if it is the final scripts entry.) + +- [ ] **Step 6: Run the test to verify it passes** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-import.test.ts` +Expected: PASS (2 tests). + +- [ ] **Step 7: Commit** + +```bash +git add scripts/catalog.ts package.json tests/fixtures/catalog/mini.jsonl tests/integration-db/catalog-import.test.ts +git commit -m "feat: add catalog:import CLI (validated JSONL, batched replace, GIN recreate)" +``` + +--- + +### Task 4: Catalog CLI — `grant` / `revoke` / `list` subcommands + integration test + +**Files:** +- Test: `tests/integration-db/catalog-import.test.ts` (extend — add a `describe` block) +- Modify: `scripts/catalog.ts` + +- [ ] **Step 1: Add the failing test block** + +Append to `tests/integration-db/catalog-import.test.ts` (after the existing `describe('catalog:import', ...)` block), and add `users` to the schema import at the top of the file (`import { catalogDatasets, catalogFoods, catalogAccess, users } from '$lib/server/schema';` and `import { and } from 'drizzle-orm';` alongside `eq`): + +```typescript +describe('catalog:grant / revoke / list', () => { + it('grants and revokes dataset access by user email', async () => { + const db = getTestDB(dbUrl); + await $`bun run scripts/catalog.ts import 
${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl }); + const [u] = await db + .insert(users) + .values({ infomaniakSub: 'sub-grant-1', email: 'fam@example.com' }) + .returning(); + const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!; + + await $`bun run scripts/catalog.ts grant fam@example.com testset`.env({ ...process.env, DATABASE_URL: dbUrl }); + let grants = await db + .select() + .from(catalogAccess) + .where(and(eq(catalogAccess.userId, u.id), eq(catalogAccess.datasetId, ds.id))); + expect(grants.length).toBe(1); + + await $`bun run scripts/catalog.ts revoke fam@example.com testset`.env({ ...process.env, DATABASE_URL: dbUrl }); + grants = await db + .select() + .from(catalogAccess) + .where(and(eq(catalogAccess.userId, u.id), eq(catalogAccess.datasetId, ds.id))); + expect(grants.length).toBe(0); + }); + + it('list exits 0', async () => { + const r = await $`bun run scripts/catalog.ts list` + .env({ ...process.env, DATABASE_URL: dbUrl }) + .quiet(); + expect(r.exitCode).toBe(0); + }); +}); +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-import.test.ts` +Expected: FAIL — `Unknown command: grant`. 
+ +- [ ] **Step 3: Implement the subcommands in `scripts/catalog.ts`** + +Add these functions above the `const [cmd, ...args] = ...` line: + +```typescript +async function resolveUserId(email: string): Promise<string> { + const u = await db.query.users.findFirst({ where: eq(users.email, email) }); + if (!u) throw new Error(`No user with email ${email}`); + return u.id; +} + +async function resolveDatasetId(key: string): Promise<string> { + const d = await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, key) }); + if (!d) throw new Error(`No dataset with key ${key}`); + return d.id; +} + +async function grant(email: string, key: string) { + const userId = await resolveUserId(email); + const datasetId = await resolveDatasetId(key); + await db.insert(catalogAccess).values({ userId, datasetId }).onConflictDoNothing(); + console.log(`Granted "${key}" to ${email}`); +} + +async function revoke(email: string, key: string) { + const userId = await resolveUserId(email); + const datasetId = await resolveDatasetId(key); + await db + .delete(catalogAccess) + .where(sql`${catalogAccess.userId} = ${userId} AND ${catalogAccess.datasetId} = ${datasetId}`); + console.log(`Revoked "${key}" from ${email}`); +} + +async function list() { + const datasets = await db.select().from(catalogDatasets); + for (const d of datasets) { + const grants = await db + .select({ email: users.email }) + .from(catalogAccess) + .innerJoin(users, eq(users.id, catalogAccess.userId)) + .where(eq(catalogAccess.datasetId, d.id)); + console.log( + `${d.key} (${d.source}, prio ${d.priority}, ${d.productCount ?? 
0} products) -> ${ + grants.map((g) => g.email).join(', ') || '(no grants)' + }` + ); + } +} +``` + +Replace the command dispatch block with: + +```typescript +const [cmd, ...args] = process.argv.slice(2); + +try { + if (cmd === 'import') { + if (!args[0]) throw new Error('Usage: catalog import <file.jsonl>'); + await importDataset(args[0]); + } else if (cmd === 'grant') { + if (!args[0] || !args[1]) throw new Error('Usage: catalog grant <email> <dataset-key>'); + await grant(args[0], args[1]); + } else if (cmd === 'revoke') { + if (!args[0] || !args[1]) throw new Error('Usage: catalog revoke <email> <dataset-key>'); + await revoke(args[0], args[1]); + } else if (cmd === 'list') { + await list(); + } else { + throw new Error(`Unknown command: ${cmd ?? '(none)'}. Expected: import|grant|revoke|list`); + } + await client.end(); + process.exit(0); +} catch (e) { + console.error(e instanceof Error ? e.message : String(e)); + await client.end(); + process.exit(1); +} +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `bun run test:integration-db -- tests/integration-db/catalog-import.test.ts` +Expected: PASS (4 tests total). 
+ +- [ ] **Step 5: Commit** + +```bash +git add scripts/catalog.ts tests/integration-db/catalog-import.test.ts +git commit -m "feat: add catalog:grant/revoke/list CLI subcommands" +``` + +--- + +### Task 5: Repo guardrails — gitignore + prek hook blocking committed datasets + +**Files:** +- Modify: `.gitignore` +- Modify: `.pre-commit-config.yaml` + +- [ ] **Step 1: Add the ignore rule** + +Append to the end of `.gitignore` (the second path is the scratch file the Task 3 fail-closed test writes): + +``` + +# Crawled catalog datasets — never commit (public repo; private data) +data/catalog/ +tests/fixtures/catalog/bad.jsonl +``` + +- [ ] **Step 2: Add a local prek guard hook** + +In `.pre-commit-config.yaml`, add this hook as the last entry under `hooks:` (same `repo: local` block, matching the existing `language: system` style). The message deliberately avoids a colon followed by a space, which is not allowed inside an unquoted YAML scalar: + +```yaml + - id: no-catalog-data + name: no committed catalog datasets + entry: bash -c 'if git diff --cached --name-only | grep -qE "^data/catalog/.*\.jsonl$"; then echo "ERROR - catalog dataset files must not be committed (public repo)"; exit 1; fi' + language: system + pass_filenames: false +``` + +- [ ] **Step 3: Verify the guard triggers** + +Run: + +```bash +mkdir -p data/catalog && cp tests/fixtures/catalog/mini.jsonl data/catalog/x.jsonl && git add -f data/catalog/x.jsonl && bunx prek run no-catalog-data --hook-stage pre-commit; echo "exit=$?" +``` + +Expected: prints the ERROR line and `exit=1`. Then clean up: + +```bash +git reset data/catalog/x.jsonl && rm -rf data/catalog +``` + +Expected: `data/catalog/` is gitignored and untracked. 
+ +- [ ] **Step 4: Commit** + +```bash +git add .gitignore .pre-commit-config.yaml +git commit -m "chore: gitignore + prek guard against committing catalog datasets" +``` + +--- + +# PHASE 2 — App Integration + +### Task 6: Extract shared nutrient-extraction helper (Gap 8) + +**Files:** +- Test: `src/lib/server/nutrient-extract.test.ts` (create) +- Create: `src/lib/server/nutrient-extract.ts` +- Modify: `src/lib/server/openfoodfacts.ts` + +- [ ] **Step 1: Write the failing unit test** + +Create `src/lib/server/nutrient-extract.test.ts`: + +```typescript +import { describe, it, expect } from 'vitest'; +import { extractNutrient, extractAllNutrients } from './nutrient-extract'; + +describe('nutrient-extract', () => { + it('extractNutrient returns null for missing/NaN and rounds with conversion', () => { + expect(extractNutrient({}, 'x_100g')).toBeNull(); + expect(extractNutrient({ x_100g: 'abc' }, 'x_100g')).toBeNull(); + expect(extractNutrient({}, undefined)).toBeNull(); + expect(extractNutrient({ x_100g: 1.234 }, 'x_100g')).toBe(1.23); + expect(extractNutrient({ x_100g: 0.5 }, 'x_100g', 1000)).toBe(500); + expect(extractNutrient({ x_100g: '2.5' }, 'x_100g')).toBe(2.5); + }); + + it('extractAllNutrients maps every ALL_NUTRIENTS key', async () => { + const { ALL_NUTRIENT_KEYS } = await import('$lib/nutrients'); + const out = extractAllNutrients({ 'saturated-fat_100g': 5 }); + for (const k of ALL_NUTRIENT_KEYS) expect(k in out).toBe(true); + expect(out.saturatedFat).toBe(5); + }); +}); +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `bun --bun vitest run src/lib/server/nutrient-extract.test.ts` +Expected: FAIL — `Cannot find module './nutrient-extract'`. 
+ +- [ ] **Step 3: Create the shared helper** + +Create `src/lib/server/nutrient-extract.ts` (logic copied verbatim from the current private `extractNutrient` in `openfoodfacts.ts`, plus the `ALL_NUTRIENTS` loop, so behavior is identical): + +```typescript +import { ALL_NUTRIENTS } from '$lib/nutrients'; + +export function extractNutrient( + nutriments: Record<string, number | string | undefined>, + offKey: string | undefined, + conversion?: number +): number | null { + if (!offKey) return null; + const raw = nutriments[offKey]; + if (raw == null) return null; + const num = typeof raw === 'string' ? parseFloat(raw) : raw; + if (isNaN(num)) return null; + if (conversion) return Math.round(num * conversion * 100) / 100; + return Math.round(num * 100) / 100; +} + +export function extractAllNutrients( + nutriments: Record<string, number | string | undefined> +): Record<string, number | null> { + const out: Record<string, number | null> = {}; + for (const n of ALL_NUTRIENTS) { + out[n.key] = extractNutrient(nutriments, n.offKey, n.offConversion); + } + return out; +} +``` + +- [ ] **Step 4: Refactor `openfoodfacts.ts` to use it (behavior-preserving)** + +In `src/lib/server/openfoodfacts.ts`: change the import line `import { ALL_NUTRIENTS } from '$lib/nutrients';` to `import { extractAllNutrients } from '$lib/server/nutrient-extract';`. Delete the private `extractNutrient` function (lines 67–79). In `mapSearchProduct`, replace the loop: + +```typescript + for (const nutrient of ALL_NUTRIENTS) { + result[nutrient.key] = extractNutrient(n, nutrient.offKey, nutrient.offConversion); + } +``` + +with: + +```typescript + Object.assign(result, extractAllNutrients(n)); +``` + +- [ ] **Step 5: Run the OFF + new helper tests to verify no regression** + +Run: `bun --bun vitest run src/lib/server/nutrient-extract.test.ts && bun run check` +Expected: nutrient-extract tests PASS; `bun run check` exits 0 (no type errors from the refactor). If OFF has existing tests, run `bun --bun vitest run src/lib/server/openfoodfacts` — Expected: still PASS. 
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add src/lib/server/nutrient-extract.ts src/lib/server/nutrient-extract.test.ts src/lib/server/openfoodfacts.ts
+git commit -m "refactor: extract shared nutrient-extraction helper from openfoodfacts"
+```
+
+---
+
+### Task 7: Catalog server queries (search / barcode / instantiate)
+
+**Files:**
+- Test: `tests/integration-db/catalog-endpoints.test.ts` (create — query-layer tests first)
+- Create: `src/lib/server/catalog/queries.ts`
+
+- [ ] **Step 1: Write the failing query-layer integration test**
+
+Create `tests/integration-db/catalog-endpoints.test.ts`:
+
+```typescript
+// Query-layer integration tests for the catalog: each test creates its own user and
+// checks search, barcode lookup and instantiation before/after a catalogAccess grant.
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+import { eq } from 'drizzle-orm';
+import { $ } from 'bun';
+import { join } from 'node:path';
+import { createTestDatabase, dropTestDatabase, runTestMigrations, getTestDB, closeTestDB } from './helpers';
+import { catalogDatasets, catalogFoods, catalogAccess, users, foods } from '$lib/server/schema';
+
+const DB_NAME = 'test_catalog_endpoints';
+let dbUrl: string;
+const FIXTURE = join(process.cwd(), 'tests/fixtures/catalog/mini.jsonl');
+
+// Create and migrate a dedicated database, then load the fixture through the import CLI
+// (run with DATABASE_URL pointing at that database) so the tests see imported rows.
+beforeAll(async () => {
+	dbUrl = await createTestDatabase(DB_NAME);
+	await runTestMigrations(dbUrl);
+	await $`bun run scripts/catalog.ts import ${FIXTURE}`.env({ ...process.env, DATABASE_URL: dbUrl });
+});
+// Close the connection before dropping the database.
+afterAll(async () => {
+	await closeTestDB(dbUrl);
+	await dropTestDatabase(DB_NAME);
+});
+
+describe('catalog queries', () => {
+	it('catalogSearch returns nothing for an ungranted user, results for a granted one', async () => {
+		const db = getTestDB(dbUrl);
+		const { catalogSearch } = await import('$lib/server/catalog/queries');
+		const [u] = await db
+			.insert(users)
+			.values({ infomaniakSub: 'sub-q-1', email: 'q1@example.com' })
+			.returning();
+
+		// No catalogAccess row exists for this user yet, so the search must be empty.
+		expect((await catalogSearch(db as never, u.id, 'Zweifel', 10)).length).toBe(0);
+
+		// Grant the user access to the 'testset' dataset, then repeat the same query.
+		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
+		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });
+
+		const res = await catalogSearch(db as never, u.id, 'Zweifel', 10);
+		expect(res.length).toBe(1);
+		expect(res[0].name).toBe('Zweifel Paprika Chips');
+		expect(res[0].datasetKey).toBe('testset');
+	});
+
+	it('catalogByBarcode honors access and dataset priority tie-break', async () => {
+		const db = getTestDB(dbUrl);
+		const { catalogByBarcode } = await import('$lib/server/catalog/queries');
+		const [u] = await db
+			.insert(users)
+			.values({ infomaniakSub: 'sub-q-2', email: 'q2@example.com' })
+			.returning();
+		// Without any grant the lookup resolves to null.
+		expect(await catalogByBarcode(db as never, u.id, '7610095131003')).toBeNull();
+
+		const dsLow = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
+		// Second dataset, same barcode, lower priority number (=higher precedence)
+		const [dsHi] = await db
+			.insert(catalogDatasets)
+			.values({ key: 'prio', name: 'Prio', source: 'migros', priority: 1, productCount: 1 })
+			.returning();
+		await db.insert(catalogFoods).values({
+			datasetId: dsHi.id,
+			name: 'PRIO WINNER',
+			servingSize: 100,
+			servingUnit: 'g',
+			calories: 1,
+			protein: 1,
+			carbs: 1,
+			fat: 1,
+			fiber: 1,
+			barcode: '7610095131003'
+		});
+		// Grant both datasets; the test expects the row from the 'prio' dataset to be returned.
+		await db.insert(catalogAccess).values({ userId: u.id, datasetId: dsLow.id });
+		await db.insert(catalogAccess).values({ userId: u.id, datasetId: dsHi.id });
+
+		const hit = await catalogByBarcode(db as never, u.id, '7610095131003');
+		expect(hit!.name).toBe('PRIO WINNER');
+	});
+
+	it('instantiateCatalogFood creates a personal food and never mutates the catalog row', async () => {
+		const db = getTestDB(dbUrl);
+		const { instantiateCatalogFood } = await import('$lib/server/catalog/queries');
+		const [u] = await db
+			.insert(users)
+			.values({ infomaniakSub: 'sub-q-3', email: 'q3@example.com' })
+			.returning();
+		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
+		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });
+		// Take one catalog row from the granted dataset as the instantiation source.
+		const [cf] = await db
+			.select()
+			.from(catalogFoods)
+			.where(eq(catalogFoods.datasetId, ds.id))
+			.limit(1);
+
+		const food = await instantiateCatalogFood(db as never, u.id, cf.id);
+		expect(food).toBeTruthy();
+		expect(food!.userId).toBe(u.id);
+		expect(food!.name).toBe(cf.name);
+
+		// Exactly one personal food exists for the user, and the catalog row is still there.
+		const personal = await db.select().from(foods).where(eq(foods.userId, u.id));
+		expect(personal.length).toBe(1);
+		const stillThere = await db.select().from(catalogFoods).where(eq(catalogFoods.id, cf.id));
+		expect(stillThere.length).toBe(1);
+
+		// Ungranted user cannot instantiate
+		const [u2] = await db
+			.insert(users)
+			.values({ infomaniakSub: 'sub-q-4', email: 'q4@example.com' })
+			.returning();
+		await expect(instantiateCatalogFood(db as never, u2.id, cf.id)).resolves.toBeNull();
+	});
+});
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+Run: `bun run test:integration-db -- tests/integration-db/catalog-endpoints.test.ts`
+Expected: FAIL — `Cannot find module '$lib/server/catalog/queries'`.
+
+- [ ] **Step 3: Implement `src/lib/server/catalog/queries.ts`**
+
+Create `src/lib/server/catalog/queries.ts`:
+
+```typescript
+import { and, eq, ilike, asc } from 'drizzle-orm';
+import type { getDB } from '$lib/server/db';
+import { catalogFoods, catalogDatasets, catalogAccess, foods } from '$lib/server/schema';
+import { createFood } from '$lib/server/foods';
+import { pickNutrients } from '$lib/nutrients';
+import type { Result } from '$lib/server/types';
+
+/** Database handle type, derived from the production factory so tests can pass their own handle. */
+type DB = ReturnType<typeof getDB>;
+
+/** A catalog row plus the key and source of the dataset it belongs to. */
+export type CatalogResult = typeof catalogFoods.$inferSelect & {
+	datasetKey: string;
+	source: string;
+};
+
+/**
+ * Backslash-escapes `\`, `%` and `_` so user input is matched literally inside an
+ * ILIKE pattern. The backslash is escaped first so the escapes added for `%` and `_`
+ * are not escaped a second time.
+ */
+function escapeLike(q: string): string {
+	return q.replace(/\\/g, '\\\\').replace(/%/g, '\\%').replace(/_/g, '\\_');
+}
+
+/**
+ * Case-insensitive substring search on catalog food names.
+ *
+ * Only rows of datasets for which `userId` has a `catalogAccess` row are returned
+ * (enforced by the inner join). Results are ordered by dataset priority ascending,
+ * then by name, and capped at `limit`.
+ *
+ * @returns the matching rows; an empty array when the trimmed query is empty.
+ */
+export async function catalogSearch(
+	db: DB,
+	userId: string,
+	query: string,
+	limit: number
+): Promise<CatalogResult[]> {
+	const q = escapeLike(query.trim());
+	if (q.length === 0) return [];
+	const rows = await db
+		.select({
+			cf: catalogFoods,
+			datasetKey: catalogDatasets.key,
+			source: catalogDatasets.source,
+			priority: catalogDatasets.priority
+		})
+		.from(catalogFoods)
+		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
+		.innerJoin(
+			catalogAccess,
+			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
+		)
+		.where(ilike(catalogFoods.name, `%${q}%`))
+		.orderBy(asc(catalogDatasets.priority), asc(catalogFoods.name))
+		.limit(limit);
+	return rows.map((r) => ({ ...r.cf, datasetKey: r.datasetKey, source: r.source }));
+}
+
+/**
+ * Exact barcode lookup across the datasets `userId` has been granted.
+ *
+ * When several granted datasets contain the barcode, the row from the dataset with the
+ * lowest `priority` value is returned.
+ *
+ * @returns the winning row, or `null` when no accessible row has this barcode.
+ */
+export async function catalogByBarcode(
+	db: DB,
+	userId: string,
+	barcode: string
+): Promise<CatalogResult | null> {
+	const rows = await db
+		.select({
+			cf: catalogFoods,
+			datasetKey: catalogDatasets.key,
+			source: catalogDatasets.source
+		})
+		.from(catalogFoods)
+		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
+		.innerJoin(
+			catalogAccess,
+			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
+		)
+		.where(eq(catalogFoods.barcode, barcode))
+		.orderBy(asc(catalogDatasets.priority))
+		.limit(1);
+	const r = rows[0];
+	return r ? { ...r.cf, datasetKey: r.datasetKey, source: r.source } : null;
+}
+
+/**
+ * Copies a catalog row into the user's personal foods via `createFood`.
+ *
+ * The catalog row is only read, and only if `userId` has access to its dataset.
+ *
+ * @returns the created personal food, or `null` when the row does not exist, is not
+ * accessible to the user, or `createFood` reports a failure.
+ */
+export async function instantiateCatalogFood(
+	db: DB,
+	userId: string,
+	catalogFoodId: string
+): Promise<typeof foods.$inferSelect | null> {
+	const rows = await db
+		.select({ cf: catalogFoods })
+		.from(catalogFoods)
+		.innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId))
+		.innerJoin(
+			catalogAccess,
+			and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId))
+		)
+		.where(eq(catalogFoods.id, catalogFoodId))
+		.limit(1);
+	const cf = rows[0]?.cf;
+	if (!cf) return null;
+
+	const payload = {
+		name: cf.name,
+		brand: cf.brand,
+		servingSize: cf.servingSize,
+		servingUnit: cf.servingUnit,
+		calories: cf.calories,
+		protein: cf.protein,
+		carbs: cf.carbs,
+		fat: cf.fat,
+		fiber: cf.fiber,
+		barcode: cf.barcode,
+		nutriScore: cf.nutriScore as 'a' | 'b' | 'c' | 'd' | 'e' | null,
+		novaGroup: cf.novaGroup,
+		additives: cf.additives,
+		ingredientsText: cf.ingredientsText,
+		imageUrl: cf.imageUrl,
+		...pickNutrients(cf as Record<string, unknown>)
+	};
+	const result: Result<typeof foods.$inferSelect> = await createFood(userId, payload);
+	if (!result.success) {
+		// Barcode already in the user's personal DB → treat as a benign no-op miss
+		// (every other createFood failure also ends up here and yields null).
+		return null;
+	}
+	return result.data;
+}
+```
+
+Note: `catalogSearch`/`catalogByBarcode`/`instantiateCatalogFood` take an explicit `db` (so tests can pass the test DB). Production callers pass `getDB()`.
+
+- [ ] **Step 4: Run to verify it passes**
+
+Run: `bun run test:integration-db -- tests/integration-db/catalog-endpoints.test.ts`
+Expected: PASS (3 tests).
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/lib/server/catalog/queries.ts tests/integration-db/catalog-endpoints.test.ts
+git commit -m "feat: add access-gated catalog query layer (search/barcode/instantiate)"
+```
+
+---
+
+### Task 8: Catalog API endpoints + OpenAPI + isolation test
+
+**Files:**
+- Test: `tests/integration-db/catalog-endpoints.test.ts` (extend — `/api/foods` isolation)
+- Create: `src/routes/api/catalog/search/+server.ts`
+- Create: `src/routes/api/catalog/barcode/[code]/+server.ts`
+- Create: `src/routes/api/catalog/[id]/save/+server.ts`
+- Modify: `src/lib/server/openapi.ts`
+- Regenerate: `docs/openapi.json`, `src/lib/api/generated/schema.d.ts`
+
+- [ ] **Step 1: Add the failing isolation test**
+
+Append to `tests/integration-db/catalog-endpoints.test.ts`:
+
+```typescript
+import { listFoods } from '$lib/server/foods';
+
+describe('catalog isolation from personal foods', () => {
+	it('listFoods (the /api/foods source) never returns catalog rows', async () => {
+		const db = getTestDB(dbUrl);
+		const [u] = await db
+			.insert(users)
+			.values({ infomaniakSub: 'sub-iso-1', email: 'iso@example.com' })
+			.returning();
+		const ds = (await db.query.catalogDatasets.findFirst({ where: eq(catalogDatasets.key, 'testset') }))!;
+		await db.insert(catalogAccess).values({ userId: u.id, datasetId: ds.id });
+		// listFoods uses getDB() internally; this asserts the personal-food query
+		// is unaffected by catalog presence for a user with zero personal foods.
+		const { items } = await listFoods(u.id, { query: 'Zweifel' });
+		expect(items.length).toBe(0);
+	});
+});
+```
+
+(`listFoods` uses `getDB()`/`DATABASE_URL`; the integration runner sets `DATABASE_URL` to the test DB via `helpers`. If `listFoods` cannot see the test DB in this harness, assert instead that `/api/foods` route output excludes catalog by inspecting the route in Step 4’s manual check — but the query-level assertion above is the primary guard since `catalogFoods` is a separate table never referenced by `listFoods`.)
+
+- [ ] **Step 2: Run to verify it fails or is red**
+
+Run: `bun run test:integration-db -- tests/integration-db/catalog-endpoints.test.ts`
+Expected: the new test FAILS only if `listFoods` accidentally unions catalog (it must not) — i.e. it should pass once endpoints exist but is added now to lock the invariant. If it errors on DB wiring, keep it and proceed; it is the regression guard.
+
+- [ ] **Step 3: Create `src/routes/api/catalog/search/+server.ts`**
+
+```typescript
+import { json } from '@sveltejs/kit';
+import type { RequestHandler } from './$types';
+import { requireAuth, handleApiError } from '$lib/server/errors';
+import { getDB } from '$lib/server/db';
+import { catalogSearch } from '$lib/server/catalog/queries';
+
+/**
+ * GET /api/catalog/search?q=…&limit=…
+ *
+ * Responds with `{ results }` from the authenticated user's granted catalog datasets.
+ * Queries shorter than two characters (after trimming) return an empty list without
+ * touching the database. Errors are delegated to `handleApiError`.
+ */
+export const GET: RequestHandler = async ({ locals, url }) => {
+	try {
+		const userId = requireAuth(locals);
+		const q = url.searchParams.get('q') ?? '';
+		const limitRaw = Number(url.searchParams.get('limit') ?? '20');
+		// Truncate before clamping: SQL LIMIT needs an integer (the OpenAPI contract
+		// declares `limit` as int), so `?limit=2.5` must not reach the query as 2.5.
+		// Non-numeric input falls back to the default of 20; the range is 1..50.
+		const limit = Number.isFinite(limitRaw)
+			? Math.min(Math.max(Math.trunc(limitRaw), 1), 50)
+			: 20;
+		if (q.trim().length < 2) {
+			return json({ results: [] });
+		}
+		const results = await catalogSearch(getDB(), userId, q, limit);
+		return json({ results });
+	} catch (error) {
+		return handleApiError(error);
+	}
+};
+```
+
+- [ ] **Step 4: Create `src/routes/api/catalog/barcode/[code]/+server.ts`**
+
+```typescript
+import { json } from '@sveltejs/kit';
+import type { RequestHandler } from './$types';
+import { requireAuth, handleApiError } from '$lib/server/errors';
+import { isValidBarcode } from '$lib/utils/barcode';
+import { getDB } from '$lib/server/db';
+import { catalogByBarcode } from '$lib/server/catalog/queries';
+
+/**
+ * GET /api/catalog/barcode/[code]
+ *
+ * Responds 400 when `isValidBarcode` rejects the code, 404 with `{ found: false }` when
+ * no accessible catalog row matches, and otherwise `{ found: true, result }`.
+ */
+export const GET: RequestHandler = async ({ locals, params }) => {
+	try {
+		const userId = requireAuth(locals);
+		const { code } = params;
+		if (!isValidBarcode(code)) {
+			return json({ error: 'Invalid barcode format' }, { status: 400 });
+		}
+		const result = await catalogByBarcode(getDB(), userId, code);
+		if (!result) return json({ found: false }, { status: 404 });
+		return json({ found: true, result });
+	} catch (error) {
+		return handleApiError(error);
+	}
+};
+```
+
+- [ ] **Step 5: Create `src/routes/api/catalog/[id]/save/+server.ts`**
+
+```typescript
+import { json } from '@sveltejs/kit';
+import type { RequestHandler } from './$types';
+import { requireAuth, requireUuid, handleApiError } from '$lib/server/errors';
+import { getDB } from '$lib/server/db';
+import { instantiateCatalogFood } from '$lib/server/catalog/queries';
+
+/**
+ * POST /api/catalog/[id]/save
+ *
+ * Copies the catalog row `id` into the authenticated user's personal foods.
+ * Responds 201 with `{ food }`, or 404 when `instantiateCatalogFood` returns nothing.
+ */
+export const POST: RequestHandler = async ({ locals, params }) => {
+	try {
+		const userId = requireAuth(locals);
+		const id = requireUuid(params.id);
+		const food = await instantiateCatalogFood(getDB(), userId, id);
+		if (!food) {
+			return json({ error: 'Catalog food not found or not accessible' }, { status: 404 });
+		}
+		return json({ food }, { status: 201 });
+	} catch (error) {
+		return handleApiError(error);
+	}
+};
+```
+
+- [ ] **Step 6: Declare the three routes in `src/lib/server/openapi.ts`**
+
+Open `src/lib/server/openapi.ts`. Near the top, ensure `z` and the catalog query types are usable; add a response schema near the other schemas and add the three paths into the object passed to the path map (mirror the `'/api/foods'` entry style). Add this block alongside the other `'/api/...'` keys:
+
+```typescript
+			'/api/catalog/search': {
+				get: {
+					operationId: 'catalogSearch',
+					tags: ['Catalog'],
+					description: 'Online catalog search across the requesting user’s granted datasets.',
+					requestParams: {
+						query: z.object({ q: z.string(), limit: z.number().int().optional() })
+					},
+					responses: {
+						'200': {
+							description: 'Success',
+							content: {
+								'application/json': {
+									schema: z.object({
+										results: z.array(z.record(z.string(), z.unknown()))
+									})
+								}
+							}
+						},
+						'401': res401
+					}
+				}
+			},
+			'/api/catalog/barcode/{code}': {
+				get: {
+					operationId: 'catalogByBarcode',
+					tags: ['Catalog'],
+					description: 'Barcode lookup across granted catalog datasets (priority tie-break).',
+					requestParams: { path: z.object({ code: z.string() }) },
+					responses: {
+						'200': {
+							description: 'Found',
+							content: {
+								'application/json': {
+									schema: z.object({
+										found: z.boolean(),
+										result: z.record(z.string(), z.unknown()).optional()
+									})
+								}
+							}
+						},
+						'400': res400,
+						'401': res401,
+						'404': {
+							description: 'Not found',
+							content: {
+								'application/json': { schema: z.object({ found: z.boolean() }) }
+							}
+						}
+					}
+				}
+			},
+			'/api/catalog/{id}/save': {
+				post: {
+					operationId: 'saveCatalogFood',
+					tags: ['Catalog'],
+					description: 'Instantiate a personal food from a catalog row (copy-on-use).',
+					requestParams: { path: z.object({ id: z.string() }) },
+					responses: {
+						'201': {
+							description: 'Created',
+							content: { 'application/json': { schema: foodResponseSchema } }
+						},
+						'401': res401,
+						'404': res404
+					}
+				}
+			},
+```
+
+(Use the same `res401`/`res400`/`res404`/`foodResponseSchema` symbols already defined in this file for the `/api/foods` routes. If `res404` is not defined there, reuse the inline 404 shape shown for `/api/catalog/barcode/{code}`.)
+
+- [ ] **Step 7: Regenerate the OpenAPI spec + typed client**
+
+Run: `bun run api:generate:ts && bunx prettier --write docs/openapi.json src/lib/api/generated/`
+Expected: `docs/openapi.json` and `src/lib/api/generated/schema.d.ts` now contain `/api/catalog/search`, `/api/catalog/barcode/{code}`, `/api/catalog/{id}/save`.
+
+Run: `bun run api:check`
+Expected: exits 0 (no diff after regen). This is the same check CI runs.
+
+- [ ] **Step 8: Run integration + type check**
+
+Run: `bun run test:integration-db -- tests/integration-db/catalog-endpoints.test.ts && bun run check`
+Expected: tests PASS; `bun run check` exits 0.
+
+- [ ] **Step 9: Commit**
+
+```bash
+git add src/routes/api/catalog src/lib/server/openapi.ts docs/openapi.json src/lib/api/generated tests/integration-db/catalog-endpoints.test.ts
+git commit -m "feat: add /api/catalog search/barcode/save endpoints + openapi"
+```
+
+---
+
+### Task 9: i18n strings for catalog UI
+
+**Files:**
+- Modify: `messages/en.json`
+- Modify: `messages/de.json`
+
+- [ ] **Step 1: Add English strings**
+
+In `messages/en.json`, add these keys (keep the file’s existing alphabetical/grouping convention; place near other `add_food_*` keys):
+
+```json
+	"catalog_source_badge": "{source}",
+	"add_food_catalog_searching": "Searching catalog…",
+	"add_food_catalog_section": "From catalog",
+	"add_food_catalog_add_failed": "Could not add this product. It may already be in your foods."
+```
+
+- [ ] **Step 2: Add German strings**
+
+In `messages/de.json`, add the same keys:
+
+```json
+	"catalog_source_badge": "{source}",
+	"add_food_catalog_searching": "Katalog wird durchsucht…",
+	"add_food_catalog_section": "Aus Katalog",
+	"add_food_catalog_add_failed": "Produkt konnte nicht hinzugefügt werden. Es ist evtl. schon in deinen Lebensmitteln."
+``` + +- [ ] **Step 3: Compile messages + typecheck** + +Run: `bun run paraglide:compile && bun run check` +Expected: compiles; `bun run check` exits 0; `m.add_food_catalog_section` etc. are now typed. + +- [ ] **Step 4: Commit** + +```bash +git add messages/en.json messages/de.json +git commit -m "feat: add i18n strings for catalog picker" +``` + +--- + +### Task 10: FoodPicker — online catalog search + source badge + +**Files:** +- Modify: `src/lib/components/entries/FoodPicker.svelte` + +- [ ] **Step 1: Extend `PickerSelection` and add catalog state** + +In the ` @@ -151,6 +223,71 @@ {/each} + {#if catalogLoading} +

{m.add_food_catalog_searching()}

+ {:else if catalogResults.length > 0} +

{m.add_food_catalog_section()}

+
    + {#each catalogResults as hit (hit.id)} +
  • + + {hit.name} + {m.catalog_source_badge({ source: hit.source })} + + +
  • + {/each} +
+ {/if} + {#if offLoading} +

{m.add_food_off_searching()}

+ {:else if offResults.length > 0} +

{m.add_food_off_section()}

+
    + {#each offResults as hit (hit.barcode)} +
  • + + {hit.name} + {#if hit.brand} · {hit.brand}{/if} + {m.add_food_off_badge()} + + +
  • + {/each} +
+ {/if} diff --git a/src/lib/server/catalog/dataset-schema.test.ts b/src/lib/server/catalog/dataset-schema.test.ts new file mode 100644 index 00000000..2cb11884 --- /dev/null +++ b/src/lib/server/catalog/dataset-schema.test.ts @@ -0,0 +1,88 @@ +import { describe, it, expect } from 'vitest'; +import { datasetHeaderSchema, datasetProductSchema } from './dataset-schema'; + +describe('dataset-schema', () => { + it('accepts a valid header record', () => { + const r = datasetHeaderSchema.safeParse({ + _dataset: { + key: 'migros', + name: 'Migros (Switzerland)', + source: 'migros', + priority: 10, + version: '2026.05.18', + snapshotAt: '2026-05-18T00:00:00.000Z' + } + }); + expect(r.success).toBe(true); + }); + + it('accepts a minimal valid product line', () => { + const r = datasetProductSchema.safeParse({ + name: 'Zweifel Paprika Chips', + servingSize: 100, + servingUnit: 'g', + calories: 515, + protein: 5.8, + carbs: 53, + fat: 30, + fiber: 5.6 + }); + expect(r.success).toBe(true); + }); + + it('accepts known extended nutrients and OFF quality fields', () => { + const r = datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: 1, + protein: 1, + carbs: 1, + fat: 1, + fiber: 1, + saturatedFat: 5.1, + salt: 1.3, + barcode: '7610095131003', + language: 'de', + nutriScore: 'd', + novaGroup: 4, + additives: ['en:e330'], + sourceUrl: 'https://www.migros.ch/de/product/123', + sourceRef: '123' + }); + expect(r.success).toBe(true); + }); + + it('rejects a product missing required core macros', () => { + const r = datasetProductSchema.safeParse({ name: 'X', servingSize: 100, servingUnit: 'g' }); + expect(r.success).toBe(false); + }); + + it('rejects negative nutrients and bad nutriScore', () => { + expect( + datasetProductSchema.safeParse({ + name: 'X', + servingSize: 100, + servingUnit: 'g', + calories: -1, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0 + }).success + ).toBe(false); + expect( + datasetProductSchema.safeParse({ + name: 
'X', + servingSize: 100, + servingUnit: 'g', + calories: 0, + protein: 0, + carbs: 0, + fat: 0, + fiber: 0, + nutriScore: 'z' + }).success + ).toBe(false); + }); +}); diff --git a/src/lib/server/catalog/dataset-schema.ts b/src/lib/server/catalog/dataset-schema.ts new file mode 100644 index 00000000..3750246f --- /dev/null +++ b/src/lib/server/catalog/dataset-schema.ts @@ -0,0 +1,47 @@ +import { z } from 'zod'; +import { servingUnitSchema } from '$lib/units'; +import { ALL_NUTRIENT_KEYS } from '$lib/nutrients'; + +const optNutrient = z.coerce.number().nonnegative().optional().nullable(); +const nutrientFields = Object.fromEntries(ALL_NUTRIENT_KEYS.map((k) => [k, optNutrient])); + +export const datasetHeaderSchema = z.object({ + _dataset: z.object({ + key: z + .string() + .min(1) + .max(64) + .regex(/^[a-z0-9-]+$/), + name: z.string().min(1).max(200), + source: z.enum(['migros', 'off', 'coop']), + priority: z.coerce.number().int().min(0).max(1000), + version: z.string().max(64).optional().nullable(), + snapshotAt: z.string().datetime().optional().nullable() + }) +}); + +export const datasetProductSchema = z.object({ + name: z.string().min(1).max(500), + brand: z.string().max(500).optional().nullable(), + language: z.enum(['de', 'fr', 'it', 'en']).optional().nullable(), + servingSize: z.coerce.number().positive(), + servingUnit: servingUnitSchema, + calories: z.coerce.number().nonnegative(), + protein: z.coerce.number().nonnegative(), + carbs: z.coerce.number().nonnegative(), + fat: z.coerce.number().nonnegative(), + fiber: z.coerce.number().nonnegative(), + ...nutrientFields, + barcode: z.string().max(32).optional().nullable(), + nutriScore: z.enum(['a', 'b', 'c', 'd', 'e']).optional().nullable(), + novaGroup: z.coerce.number().int().min(1).max(4).optional().nullable(), + additives: z.array(z.string().max(100)).max(200).optional().nullable(), + ingredientsText: z.string().max(10000).optional().nullable(), + imageUrl: z.string().url().max(2000).optional().nullable(), 
+ sourceUrl: z.string().url().max(2000).optional().nullable(), + sourceRef: z.string().max(200).optional().nullable(), + crawledAt: z.string().datetime().optional().nullable() +}); + +export type DatasetHeader = z.infer; +export type DatasetProduct = z.infer; diff --git a/src/lib/server/catalog/queries.ts b/src/lib/server/catalog/queries.ts new file mode 100644 index 00000000..ee916268 --- /dev/null +++ b/src/lib/server/catalog/queries.ts @@ -0,0 +1,107 @@ +import { and, eq, ilike, asc } from 'drizzle-orm'; +import type { getDB } from '$lib/server/db'; +import { catalogFoods, catalogDatasets, catalogAccess, foods } from '$lib/server/schema'; +import { createFood } from '$lib/server/foods'; +import { pickNutrients } from '$lib/nutrients'; +import type { Result } from '$lib/server/types'; + +type DB = ReturnType; + +export type CatalogResult = typeof catalogFoods.$inferSelect & { + datasetKey: string; + source: string; +}; + +function escapeLike(q: string): string { + return q.replace(/\\/g, '\\\\').replace(/%/g, '\\%').replace(/_/g, '\\_'); +} + +export async function catalogSearch( + db: DB, + userId: string, + query: string, + limit: number +): Promise { + const q = escapeLike(query.trim()); + if (q.length === 0) return []; + const rows = await db + .select({ + cf: catalogFoods, + datasetKey: catalogDatasets.key, + source: catalogDatasets.source, + priority: catalogDatasets.priority + }) + .from(catalogFoods) + .innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId)) + .innerJoin( + catalogAccess, + and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId)) + ) + .where(ilike(catalogFoods.name, `%${q}%`)) + .orderBy(asc(catalogDatasets.priority), asc(catalogFoods.name)) + .limit(limit); + return rows.map((r) => ({ ...r.cf, datasetKey: r.datasetKey, source: r.source })); +} + +export async function catalogByBarcode( + db: DB, + userId: string, + barcode: string +): Promise { + const rows = await db + .select({ + cf: 
catalogFoods, + datasetKey: catalogDatasets.key, + source: catalogDatasets.source + }) + .from(catalogFoods) + .innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId)) + .innerJoin( + catalogAccess, + and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId)) + ) + .where(eq(catalogFoods.barcode, barcode)) + .orderBy(asc(catalogDatasets.priority)) + .limit(1); + const r = rows[0]; + return r ? { ...r.cf, datasetKey: r.datasetKey, source: r.source } : null; +} + +export async function instantiateCatalogFood( + db: DB, + userId: string, + catalogFoodId: string +): Promise | null> { + const rows = await db + .select({ cf: catalogFoods }) + .from(catalogFoods) + .innerJoin(catalogDatasets, eq(catalogDatasets.id, catalogFoods.datasetId)) + .innerJoin( + catalogAccess, + and(eq(catalogAccess.datasetId, catalogDatasets.id), eq(catalogAccess.userId, userId)) + ) + .where(eq(catalogFoods.id, catalogFoodId)) + .limit(1); + const cf = rows[0]?.cf; + if (!cf) return null; + + const payload = { + name: cf.name, + brand: cf.brand, + servingSize: cf.servingSize, + servingUnit: cf.servingUnit, + calories: cf.calories, + protein: cf.protein, + carbs: cf.carbs, + fat: cf.fat, + fiber: cf.fiber, + barcode: cf.barcode, + nutriScore: cf.nutriScore as 'a' | 'b' | 'c' | 'd' | 'e' | null, + novaGroup: cf.novaGroup, + additives: cf.additives, + ingredientsText: cf.ingredientsText, + imageUrl: cf.imageUrl, + ...pickNutrients(cf as Record) + }; + return await createFood(userId, payload, db); +} diff --git a/src/lib/server/foods.ts b/src/lib/server/foods.ts index c279f81f..66ec42b9 100644 --- a/src/lib/server/foods.ts +++ b/src/lib/server/foods.ts @@ -18,10 +18,11 @@ function isDuplicateBarcodeError(error: unknown): boolean { async function handleBarcodeConflict( error: unknown, userId: string, - barcode: string | null | undefined + barcode: string | null | undefined, + dbOverride?: ReturnType ): Promise | null> { if 
(!isDuplicateBarcodeError(error) || !barcode) return null; - const existing = await findFoodByBarcode(userId, barcode).catch(() => null); + const existing = await findFoodByBarcode(userId, barcode, dbOverride).catch(() => null); const name = existing?.name ?? 'unknown'; return { success: false, @@ -93,7 +94,8 @@ export const listFoods = async ( export const createFood = async ( userId: string, - payload: unknown + payload: unknown, + dbOverride?: ReturnType ): Promise> => { const result = foodCreateSchema.safeParse(payload); if (!result.success) { @@ -101,7 +103,7 @@ export const createFood = async ( } try { - const db = getDB(); + const db = dbOverride ?? getDB(); const [created] = await db.insert(foods).values(toFoodInsert(userId, result.data)).returning(); if (!created) { return { success: false, error: new Error('Failed to create food') }; @@ -109,7 +111,7 @@ export const createFood = async ( return { success: true, data: roundNutrition(created) }; } catch (error) { return ( - (await handleBarcodeConflict(error, userId, result.data.barcode)) ?? { + (await handleBarcodeConflict(error, userId, result.data.barcode, dbOverride)) ?? { success: false, error: error as Error } @@ -204,8 +206,12 @@ export const deleteFood = async ( }); }; -export const findFoodByBarcode = async (userId: string, barcode: string) => { - const db = getDB(); +export const findFoodByBarcode = async ( + userId: string, + barcode: string, + dbOverride?: ReturnType +) => { + const db = dbOverride ?? 
getDB(); const [food] = await db .select() .from(foods) diff --git a/src/lib/server/nutrient-extract.test.ts b/src/lib/server/nutrient-extract.test.ts new file mode 100644 index 00000000..c3347544 --- /dev/null +++ b/src/lib/server/nutrient-extract.test.ts @@ -0,0 +1,20 @@ +import { describe, it, expect } from 'vitest'; +import { extractNutrient, extractAllNutrients } from './nutrient-extract'; + +describe('nutrient-extract', () => { + it('extractNutrient returns null for missing/NaN and rounds with conversion', () => { + expect(extractNutrient({}, 'x_100g')).toBeNull(); + expect(extractNutrient({ x_100g: 'abc' }, 'x_100g')).toBeNull(); + expect(extractNutrient({}, undefined)).toBeNull(); + expect(extractNutrient({ x_100g: 1.234 }, 'x_100g')).toBe(1.23); + expect(extractNutrient({ x_100g: 0.5 }, 'x_100g', 1000)).toBe(500); + expect(extractNutrient({ x_100g: '2.5' }, 'x_100g')).toBe(2.5); + }); + + it('extractAllNutrients maps every ALL_NUTRIENTS key', async () => { + const { ALL_NUTRIENT_KEYS } = await import('$lib/nutrients'); + const out = extractAllNutrients({ 'saturated-fat_100g': 5 }); + for (const k of ALL_NUTRIENT_KEYS) expect(k in out).toBe(true); + expect(out.saturatedFat).toBe(5); + }); +}); diff --git a/src/lib/server/nutrient-extract.ts b/src/lib/server/nutrient-extract.ts new file mode 100644 index 00000000..6cffdc56 --- /dev/null +++ b/src/lib/server/nutrient-extract.ts @@ -0,0 +1,25 @@ +import { ALL_NUTRIENTS } from '$lib/nutrients'; + +export function extractNutrient( + nutriments: Record, + offKey: string | undefined, + conversion?: number +): number | null { + if (!offKey) return null; + const raw = nutriments[offKey]; + if (raw == null) return null; + const num = typeof raw === 'string' ? 
parseFloat(raw) : raw; + if (isNaN(num)) return null; + if (conversion) return Math.round(num * conversion * 100) / 100; + return Math.round(num * 100) / 100; +} + +export function extractAllNutrients( + nutriments: Record +): Record { + const out: Record = {}; + for (const n of ALL_NUTRIENTS) { + out[n.key] = extractNutrient(nutriments, n.offKey, n.offConversion); + } + return out; +} diff --git a/src/lib/server/openapi.ts b/src/lib/server/openapi.ts index 1cfb2a23..f5ecf33c 100644 --- a/src/lib/server/openapi.ts +++ b/src/lib/server/openapi.ts @@ -60,7 +60,10 @@ import { import { favoritesResponseSchema } from './validation/responses/favorites'; import { maintenanceResponseSchema } from './validation/responses/maintenance'; import { imageUploadResponseSchema } from './validation/responses/images'; -import { openfoodfactsResponseSchema } from './validation/responses/openfoodfacts'; +import { + openfoodfactsResponseSchema, + openfoodfactsSearchResponseSchema +} from './validation/responses/openfoodfacts'; import { goalsResponseSchema, goalsSetResponseSchema } from './validation/responses/goals'; import { dayPropertiesResponseSchema, @@ -104,6 +107,12 @@ const res204: ZodOpenApiResponseObject = { description: 'Deleted' }; +const res404: ZodOpenApiResponseObject = { + id: 'NotFoundResponse', + description: 'Not found', + content: { 'application/json': { schema: errorResponseSchema } } +}; + export function generateSpec() { return createDocument({ openapi: '3.1.0', @@ -1287,7 +1296,96 @@ export function generateSpec() { } }, + // ── Catalog ─────────────────────────────────────────── + '/api/catalog/search': { + get: { + operationId: 'catalogSearch', + tags: ['Catalog'], + description: "Online catalog search across the requesting user's granted datasets.", + requestParams: { + query: z.object({ q: z.string(), limit: z.number().int().optional() }) + }, + responses: { + '200': { + description: 'Success', + content: { + 'application/json': { + schema: z.object({ + 
results: z.array(z.record(z.string(), z.unknown())) + }) + } + } + }, + '401': res401 + } + } + }, + '/api/catalog/barcode/{code}': { + get: { + operationId: 'catalogByBarcode', + tags: ['Catalog'], + description: 'Barcode lookup across granted catalog datasets (priority tie-break).', + requestParams: { path: z.object({ code: z.string() }) }, + responses: { + '200': { + description: 'Found', + content: { + 'application/json': { + schema: z.object({ + found: z.boolean(), + result: z.record(z.string(), z.unknown()).optional() + }) + } + } + }, + '400': res400, + '401': res401, + '404': { + description: 'Not found', + content: { + 'application/json': { schema: z.object({ found: z.boolean() }) } + } + } + } + } + }, + '/api/catalog/{id}/save': { + post: { + operationId: 'saveCatalogFood', + tags: ['Catalog'], + description: 'Instantiate a personal food from a catalog row (copy-on-use).', + requestParams: { path: z.object({ id: z.string().uuid() }) }, + responses: { + '201': { + description: 'Created', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '401': res401, + '404': res404, + '409': res409 + } + } + }, + // ── Open Food Facts ─────────────────────────────────── + '/api/openfoodfacts/search': { + get: { + operationId: 'searchOpenFoodFacts', + tags: ['OpenFoodFacts'], + description: + 'Text search Open Food Facts products. 
Online fallback used by the food picker when local + catalog results are sparse.', + requestParams: { + query: z.object({ q: z.string(), limit: z.number().int().optional() }) + }, + responses: { + '200': { + description: 'Success', + content: { 'application/json': { schema: openfoodfactsSearchResponseSchema } } + }, + '401': res401 + } + } + }, '/api/openfoodfacts/{barcode}': { get: { operationId: 'lookupOpenFoodFacts', @@ -1304,6 +1402,28 @@ export function generateSpec() { '401': res401 } } + }, + '/api/openfoodfacts/{barcode}/save': { + post: { + operationId: 'saveOpenFoodFactsProduct', + tags: ['OpenFoodFacts'], + description: + 'Instantiate a personal food from an Open Food Facts product by barcode (copy-on-use). Idempotent: returns the existing food if already saved.', + requestParams: { path: z.object({ barcode: z.string() }) }, + responses: { + '200': { + description: 'Existing food returned', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '201': { + description: 'Created', + content: { 'application/json': { schema: foodResponseSchema } } + }, + '400': res400, + '401': res401, + '404': res404 + } + } } } }); diff --git a/src/lib/server/openfoodfacts.test.ts b/src/lib/server/openfoodfacts.test.ts new file mode 100644 index 00000000..0775480e --- /dev/null +++ b/src/lib/server/openfoodfacts.test.ts @@ -0,0 +1,87 @@ +import { describe, it, expect, vi, afterEach } from 'vitest'; +import { searchProducts } from './openfoodfacts'; + +const okResponse = (body: unknown) => + new Response(JSON.stringify(body), { headers: { 'content-type': 'application/json' } }); + +describe('searchProducts (Open Food Facts text search)', () => { + afterEach(() => vi.unstubAllGlobals()); + + it('maps OFF search results to the OFFProduct shape', async () => { + const fetchMock = vi.fn().mockResolvedValue( + okResponse({ + products: [ + { + code: '7610200004444', + product_name: 'Test Chocolate', + brands: 'Frey', + nutriscore_grade: 'd', + nutriments: { + 
'energy-kcal_100g': 540, + proteins_100g: 7.2, + carbohydrates_100g: 55, + fat_100g: 32, + fiber_100g: 4 + } + } + ] + }) + ); + vi.stubGlobal('fetch', fetchMock); + + const results = await searchProducts('chocolate', 5); + + expect(results).toHaveLength(1); + expect(results[0]).toMatchObject({ + name: 'Test Chocolate', + brand: 'Frey', + barcode: '7610200004444', + calories: 540, + protein: 7.2, + carbs: 55, + fat: 32, + fiber: 4, + servingSize: 100, + servingUnit: 'g', + nutriScore: 'd' + }); + }); + + it('filters out products without a name', async () => { + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue( + okResponse({ + products: [ + { code: '111', product_name: '', nutriments: {} }, + { code: '222', product_name: 'Has Name', nutriments: {} } + ] + }) + ) + ); + + const results = await searchProducts('x', 5); + expect(results.map((r) => r.name)).toEqual(['Has Name']); + }); + + it('returns [] on a non-ok response', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(new Response('', { status: 503 }))); + expect(await searchProducts('x')).toEqual([]); + }); + + it('returns [] on a network error', async () => { + vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('network down'))); + expect(await searchProducts('x')).toEqual([]); + }); + + it('clamps page_size to 20 and forwards the search term', async () => { + const fetchMock = vi.fn().mockResolvedValue(okResponse({ products: [] })); + vi.stubGlobal('fetch', fetchMock); + + await searchProducts('milk', 100); + + const url = String(fetchMock.mock.calls[0][0]); + expect(url).toContain('search_terms=milk'); + expect(url).toContain('page_size=20'); + }); +}); diff --git a/src/lib/server/openfoodfacts.ts b/src/lib/server/openfoodfacts.ts index 3b281c86..a3e75806 100644 --- a/src/lib/server/openfoodfacts.ts +++ b/src/lib/server/openfoodfacts.ts @@ -1,5 +1,5 @@ import { z } from 'zod'; -import { ALL_NUTRIENTS } from '$lib/nutrients'; +import { extractAllNutrients } from 
'$lib/server/nutrient-extract'; const OFF_API_BASE = 'https://world.openfoodfacts.net/api/v2/product'; const OFF_SEARCH_BASE = 'https://world.openfoodfacts.net/cgi/search.pl'; @@ -64,20 +64,6 @@ export type OFFProduct = { [key: string]: unknown; }; -function extractNutrient( - nutriments: Record, - offKey: string | undefined, - conversion?: number -): number | null { - if (!offKey) return null; - const raw = nutriments[offKey]; - if (raw == null) return null; - const num = typeof raw === 'string' ? parseFloat(raw) : raw; - if (isNaN(num)) return null; - if (conversion) return Math.round(num * conversion * 100) / 100; - return Math.round(num * 100) / 100; -} - function mapSearchProduct( p: z.infer & { code?: string }, barcode: string @@ -104,9 +90,7 @@ function mapSearchProduct( barcode }; - for (const nutrient of ALL_NUTRIENTS) { - result[nutrient.key] = extractNutrient(n, nutrient.offKey, nutrient.offConversion); - } + Object.assign(result, extractAllNutrients(n)); return result; } diff --git a/src/lib/server/rate-limit.test.ts b/src/lib/server/rate-limit.test.ts new file mode 100644 index 00000000..0bc517fc --- /dev/null +++ b/src/lib/server/rate-limit.test.ts @@ -0,0 +1,25 @@ +import { describe, it, expect } from 'vitest'; +import { rateLimit } from './rate-limit'; +import { ApiError } from './errors'; + +describe('rateLimit', () => { + it('does not throw while under the limit', () => { + expect(() => { + for (let i = 0; i < 3; i++) rateLimit('under-limit', 3, 60_000); + }).not.toThrow(); + }); + + it('throws an ApiError with status 429 once the limit is exceeded', () => { + for (let i = 0; i < 3; i++) rateLimit('over-limit', 3, 60_000); + + let caught: unknown; + try { + rateLimit('over-limit', 3, 60_000); + } catch (e) { + caught = e; + } + + expect(caught).toBeInstanceOf(ApiError); + expect((caught as ApiError).status).toBe(429); + }); +}); diff --git a/src/lib/server/rate-limit.ts b/src/lib/server/rate-limit.ts index 204cd863..03d02eae 100644 --- 
a/src/lib/server/rate-limit.ts +++ b/src/lib/server/rate-limit.ts @@ -1,3 +1,5 @@ +import { ApiError } from './errors'; + const buckets = new Map(); let callsSinceCleanup = 0; @@ -37,6 +39,6 @@ export const rateLimit = (key: string, max: number, windowMs: number) => { buckets.set(key, { count: 1, resetAt: now + windowMs }); return; } - if (bucket.count >= max) throw new Error('Rate limit exceeded'); + if (bucket.count >= max) throw new ApiError(429, 'Rate limit exceeded'); bucket.count += 1; }; diff --git a/src/lib/server/schema.ts b/src/lib/server/schema.ts index 5e881a08..10a3bf3a 100644 --- a/src/lib/server/schema.ts +++ b/src/lib/server/schema.ts @@ -604,6 +604,121 @@ export const oauthAuthorizationCodes = pgTable( ] ); +export const catalogDatasets = pgTable('catalog_datasets', { + id: uuid('id').primaryKey().defaultRandom(), + key: text('key').notNull().unique(), + name: text('name').notNull(), + source: text('source').notNull(), + priority: integer('priority').notNull().default(100), + description: text('description'), + productCount: integer('product_count'), + version: text('version'), + snapshotAt: timestamp('snapshot_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow(), + updatedAt: timestamp('updated_at', { withTimezone: true }).defaultNow() +}); + +export const catalogFoods = pgTable( + 'catalog_foods', + { + id: uuid('id').primaryKey().defaultRandom(), + datasetId: uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + name: text('name').notNull(), + brand: text('brand'), + language: text('language'), + servingSize: real('serving_size').notNull(), + servingUnit: servingUnitEnum('serving_unit').notNull(), + calories: real('calories').notNull(), + protein: real('protein').notNull(), + carbs: real('carbs').notNull(), + fat: real('fat').notNull(), + fiber: real('fiber').notNull(), + // Advanced nutrients — fat breakdown + saturatedFat: real('saturated_fat'), + 
monounsaturatedFat: real('monounsaturated_fat'), + polyunsaturatedFat: real('polyunsaturated_fat'), + transFat: real('trans_fat'), + cholesterol: real('cholesterol'), + omega3: real('omega3'), + omega6: real('omega6'), + // Sugar & carb details + sugar: real('sugar'), + addedSugars: real('added_sugars'), + sugarAlcohols: real('sugar_alcohols'), + starch: real('starch'), + // Minerals + sodium: real('sodium'), + potassium: real('potassium'), + calcium: real('calcium'), + iron: real('iron'), + magnesium: real('magnesium'), + phosphorus: real('phosphorus'), + zinc: real('zinc'), + copper: real('copper'), + manganese: real('manganese'), + selenium: real('selenium'), + iodine: real('iodine'), + fluoride: real('fluoride'), + chromium: real('chromium'), + molybdenum: real('molybdenum'), + chloride: real('chloride'), + // Vitamins + vitaminA: real('vitamin_a'), + vitaminC: real('vitamin_c'), + vitaminD: real('vitamin_d'), + vitaminE: real('vitamin_e'), + vitaminK: real('vitamin_k'), + vitaminB1: real('vitamin_b1'), + vitaminB2: real('vitamin_b2'), + vitaminB3: real('vitamin_b3'), + vitaminB5: real('vitamin_b5'), + vitaminB6: real('vitamin_b6'), + vitaminB7: real('vitamin_b7'), + vitaminB9: real('vitamin_b9'), + vitaminB12: real('vitamin_b12'), + // Other + caffeine: real('caffeine'), + alcohol: real('alcohol'), + water: real('water'), + salt: real('salt'), + barcode: text('barcode'), + nutriScore: text('nutri_score'), + novaGroup: integer('nova_group'), + additives: text('additives').array(), + ingredientsText: text('ingredients_text'), + imageUrl: text('image_url'), + sourceUrl: text('source_url'), + sourceRef: text('source_ref'), + crawledAt: timestamp('crawled_at', { withTimezone: true }), + createdAt: timestamp('created_at', { withTimezone: true }).defaultNow() + }, + (table) => [ + index('idx_catalog_foods_dataset').on(table.datasetId), + index('idx_catalog_foods_dataset_barcode').on(table.datasetId, table.barcode), + check('catalog_foods_serving_positive', 
sql`${table.servingSize} > 0`), + check( + 'catalog_foods_nutrition_nonnegative', + sql`${table.calories} >= 0 AND ${table.protein} >= 0 AND ${table.carbs} >= 0 AND ${table.fat} >= 0 AND ${table.fiber} >= 0` + ) + ] +); + +export const catalogAccess = pgTable( + 'catalog_access', + { + userId: uuid('user_id') + .notNull() + .references(() => users.id, { onDelete: 'cascade' }), + datasetId: uuid('dataset_id') + .notNull() + .references(() => catalogDatasets.id, { onDelete: 'cascade' }), + grantedAt: timestamp('granted_at', { withTimezone: true }).defaultNow() + }, + (table) => [primaryKey({ columns: [table.userId, table.datasetId] })] +); + // Type exports export type User = typeof users.$inferSelect; export type NewUser = typeof users.$inferInsert; diff --git a/src/lib/server/validation/responses/openfoodfacts.ts b/src/lib/server/validation/responses/openfoodfacts.ts index b09e8e88..9c6a6179 100644 --- a/src/lib/server/validation/responses/openfoodfacts.ts +++ b/src/lib/server/validation/responses/openfoodfacts.ts @@ -32,3 +32,9 @@ export const openfoodfactsResponseSchema = z product: productSchema }) .meta({ id: 'OpenFoodFactsResponse' }); + +export const openfoodfactsSearchResponseSchema = z + .object({ + results: z.array(productSchema) + }) + .meta({ id: 'OpenFoodFactsSearchResponse' }); diff --git a/src/lib/services/food-service.svelte.ts b/src/lib/services/food-service.svelte.ts index 64fde79d..121c4704 100644 --- a/src/lib/services/food-service.svelte.ts +++ b/src/lib/services/food-service.svelte.ts @@ -198,6 +198,26 @@ async function findByBarcode(barcode: string): Promise { } } +async function saveFromCatalog(catalogId: string): Promise { + const { data } = await api.POST('/api/catalog/{id}/save', { + params: { path: { id: catalogId } } + }); + if (!data?.food) return null; + const food = data.food as unknown as DexieFood; + await db.foods.put(food); + return food; +} + +async function saveFromOFF(barcode: string): Promise { + const { data } = await 
api.POST('/api/openfoodfacts/{barcode}/save', { + params: { path: { barcode } } + }); + if (!data?.food) return null; + const food = data.food as unknown as DexieFood; + await db.foods.put(food); + return food; +} + export const foodService = { allFoods, foodById, @@ -208,5 +228,7 @@ export const foodService = { create, update, delete: deleteFood, - findByBarcode + findByBarcode, + saveFromCatalog, + saveFromOFF }; diff --git a/src/routes/(app)/foods/+page.svelte b/src/routes/(app)/foods/+page.svelte index b4ad0e3b..3c7c1bfe 100644 --- a/src/routes/(app)/foods/+page.svelte +++ b/src/routes/(app)/foods/+page.svelte @@ -37,6 +37,10 @@ let offLoading = $state(false); let offNotFound = $state(false); let activeBarcode = $state(''); + let offResults = $state([]); + let offSearchLoading = $state(false); + // Below this many local matches, offer Open Food Facts results to fill the gap. + const OFF_FALLBACK_THRESHOLD = 5; let forceDeleteId: string | null = $state(null); let forceDeleteCount = $state(0); let qualityOpen = $state(false); @@ -94,6 +98,33 @@ const foods = $derived(debouncedQuery ? searchResults.value : allFoodsQuery.value); + // Online Open Food Facts fallback when the personal DB has few matches. + $effect(() => { + const q = debouncedQuery.trim(); + const localCount = foods.length; + if (!browser || q.length < 2 || localCount >= OFF_FALLBACK_THRESHOLD) { + offResults = []; + offSearchLoading = false; + return; + } + let cancelled = false; + offSearchLoading = true; + api + .GET('/api/openfoodfacts/search', { params: { query: { q } } }) + .then(({ data }) => { + if (!cancelled) offResults = data?.results ?? 
[]; + }) + .catch(() => { + if (!cancelled) offResults = []; + }) + .finally(() => { + if (!cancelled) offSearchLoading = false; + }); + return () => { + cancelled = true; + }; + }); + $effect(() => { if (browser) { foodService.refresh(); @@ -252,6 +283,13 @@ } } + const prefillFromOff = (product: components['schemas']['OpenFoodFactsProduct']) => { + resetFormState(); + offData = product; + activeBarcode = product.barcode; + showForm = true; + }; + // Load visible nutrients preference (once) $effect(() => { if (browser) { @@ -330,7 +368,7 @@ /> - {#if query && foods.length === 0} + {#if query && foods.length === 0 && !offSearchLoading && offResults.length === 0}

{m.foods_no_results()}

{:else} {/if} + + {#if debouncedQuery && (offSearchLoading || offResults.length > 0)} +
+

{m.add_food_off_section()}

+ {#if offSearchLoading} +

{m.add_food_off_searching()}

+ {:else} +
    + {#each offResults as product (product.barcode)} +
  • + + {product.name} + {#if product.brand} + · {product.brand}{/if} + + +
  • + {/each} +
+ {/if} +
+ {/if}