/**
 * Prunes, in place, every descendant of `root` that is explicitly hidden or that
 * carries a responsive show/hide utility class.
 *
 * DOMParser runs without stylesheets, so class-based visibility cannot be resolved.
 * Instead, an element is removed when any of these explicit signals is present:
 *
 * 1. `aria-hidden="true"`.
 * 2. Inline `display: none` or `visibility: hidden`.
 * 3. A single class token that combines a device/breakpoint word
 *    (mobile, desktop, tablet, phone, sm, md, lg, xl) with a visibility word
 *    (only, hidden, hide, show, visible, invisible) within 10 characters of each
 *    other, e.g. "mobile-only", "desktop-hidden", "show-on-tablet", "md:hidden".
 *
 * The pattern is applied to each class token separately. Matching against the whole
 * `class` attribute let the `.{0,10}` gap bridge unrelated classes, so ordinary
 * combinations such as "overflow-hidden rounded-lg" or "hidden md:block" caused the
 * element and all of its content to be dropped.
 *
 * NOTE(review): the pattern does not distinguish "show" from "hide" intent, so both
 * a "mobile-only" copy and a "desktop-only" copy are removed — confirm this is the
 * intended de-duplication behaviour.
 *
 * `root` itself is never removed; only its descendants are inspected.
 *
 * @private
 * @param root - The parsed DOM element to prune in-place
 */
private removeHiddenAndDuplicateElements(root: HTMLElement): void {
  // Device/breakpoint word followed by a visibility word, or the reverse order.
  const RESPONSIVE_PATTERN =
    /\b(mobile|desktop|tablet|phone|sm|md|lg|xl)\b.{0,10}\b(only|hidden|hide|show|visible|invisible)\b|\b(hidden|hide|show|visible|invisible)\b.{0,10}\b(mobile|desktop|tablet|phone|sm|md|lg|xl)\b/i;

  const shouldRemove = (el: Element): boolean => {
    // 1. aria-hidden="true"
    if (el.getAttribute('aria-hidden') === 'true') {
      return true;
    }

    // 2. Inline display:none or visibility:hidden
    const style = (el as HTMLElement).style;
    if (style && (style.display === 'none' || style.visibility === 'hidden')) {
      return true;
    }

    // 3. Responsive class heuristic, evaluated one class token at a time.
    //    Reading the attribute (rather than `className`) also covers SVG elements,
    //    whose `className` is not a plain string.
    const classAttr = el.getAttribute('class') ?? '';
    return classAttr.split(/\s+/).some((token) => RESPONSIVE_PATTERN.test(token));
  };

  // querySelectorAll returns a static list, so collecting first and removing afterwards
  // is safe; removing a node whose ancestor was already detached is harmless.
  const toRemove = Array.from(root.querySelectorAll('*')).filter(shouldRemove);
  for (const el of toRemove) {
    el.remove();
  }
}
/**
 * Detects whether a usable WebGPU adapter is available.
 *
 * Returns `'cpu'` when there is no global `navigator`, when `navigator` has no `gpu`
 * property, when `requestAdapter()` resolves to a falsy value, or when the request
 * throws or rejects. Returns `'webgpu'` only when an adapter object is obtained.
 *
 * @returns The device identifier to use: `'webgpu'` or `'cpu'`
 */
async function detectBestDevice(): Promise<'webgpu' | 'cpu'> {
  if (typeof navigator === 'undefined' || !('gpu' in navigator)) return 'cpu';
  try {
    // Only the one method used here is described, so no WebGPU type package is needed.
    const { gpu } = navigator as unknown as {
      gpu?: { requestAdapter(): Promise<unknown> };
    };
    // A present-but-undefined `gpu` yields `undefined` here and therefore 'cpu'.
    const adapter = await gpu?.requestAdapter();
    return adapter ? 'webgpu' : 'cpu';
  } catch {
    // Any failure while probing is treated as "WebGPU not usable".
    return 'cpu';
  }
}