Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions src/config/models/flare-models.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"smollm2-135m-flare": {
"engine": "flare",
"modelName": "SmolLM2-135M-Instruct",
"modelType": "text-generation",
"repo": "HuggingFaceTB/smollm2-135M-instruct-GGUF",
"url": "https://huggingface.co/HuggingFaceTB/smollm2-135M-instruct-GGUF/resolve/main/smollm2-135m-instruct-q8_0.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q8_0",
"quantizations": ["Q8_0"],
"architecture": "llama",
"downloadSizeMB": 138,
"contextLength": 2048,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Smallest Flare model — instant load, great for demos",
"tier": 1
}
},
"smollm2-135m-flare-q4": {
"engine": "flare",
"modelName": "SmolLM2-135M-Instruct-Q4",
"modelType": "text-generation",
"repo": "HuggingFaceTB/smollm2-135M-instruct-GGUF",
"url": "https://huggingface.co/HuggingFaceTB/smollm2-135M-instruct-GGUF/resolve/main/smollm2-135m-instruct-q4_k_m.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q4_K_M",
"quantizations": ["Q4_K_M"],
"architecture": "llama",
"downloadSizeMB": 75,
"contextLength": 2048,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Smallest download (~75 MB), great for bandwidth-constrained environments",
"tier": 1
}
},
"smollm2-360m-flare": {
"engine": "flare",
"modelName": "SmolLM2-360M-Instruct",
"modelType": "text-generation",
"repo": "HuggingFaceTB/smollm2-360M-instruct-GGUF",
"url": "https://huggingface.co/HuggingFaceTB/smollm2-360M-instruct-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q8_0",
"quantizations": ["Q8_0"],
"architecture": "llama",
"downloadSizeMB": 350,
"contextLength": 2048,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Better quality than 135M while still loading quickly",
"tier": 2
}
},
"qwen2.5-0.5b-flare": {
"engine": "flare",
"modelName": "Qwen2.5-0.5B-Instruct",
"modelType": "text-generation",
"repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
"url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q4_K_M",
"quantizations": ["Q4_K_M"],
"architecture": "qwen2",
"downloadSizeMB": 350,
"contextLength": 4096,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Multilingual model with strong reasoning — Alibaba Qwen2.5 family",
"tier": 2
}
},
"llama-3.2-1b-flare": {
"engine": "flare",
"modelName": "Llama-3.2-1B-Instruct",
"modelType": "text-generation",
"repo": "bartowski/Llama-3.2-1B-Instruct-GGUF",
"url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q8_0",
"quantizations": ["Q8_0"],
"architecture": "llama",
"downloadSizeMB": 1200,
"contextLength": 4096,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Best quality in the Flare tier — Meta Llama 3.2 1B full precision Q8",
"tier": 3
}
},
"llama-3.2-1b-flare-q4": {
"engine": "flare",
"modelName": "Llama-3.2-1B-Instruct-Q4",
"modelType": "text-generation",
"repo": "bartowski/Llama-3.2-1B-Instruct-GGUF",
"url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q4_K_M",
"quantizations": ["Q4_K_M"],
"architecture": "llama",
"downloadSizeMB": 600,
"contextLength": 4096,
"defaultParams": {
"temperature": 0.7,
"maxTokens": 512
},
"metadata": {
"description": "Balanced quality/size — Llama 3.2 1B at Q4_K_M quantization",
"tier": 3
}
}
}
14 changes: 13 additions & 1 deletion src/config/models/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,16 @@ export interface DemucsConfig extends BaseModelConfig {
executionProviders?: ('webgpu' | 'wasm')[];
}

export type ModelConfig = MLCConfig | TransformersConfig | DemucsConfig;
/**
 * Configuration for a model served by the Flare (GGUF / llama-style) engine.
 *
 * Entries live in `flare-models.json`; the optional fields below mirror the
 * keys used there so registry entries can be read through this type without
 * casts.
 */
export interface FlareConfig extends BaseModelConfig {
  engine: 'flare';
  /** Direct URL to the GGUF model file (overrides registry URL) */
  url?: string;
  /** Model architecture hint (e.g. 'llama', 'mistral', 'qwen2') */
  architecture?: string;
  /** Quantization string (e.g. 'Q8_0', 'Q4_K_M') */
  quantization?: string;
  /** Quantization used when none is requested explicitly (e.g. 'Q8_0') — matches the `defaultQuantization` registry key */
  defaultQuantization?: string;
  /** All quantizations published for this model — matches the `quantizations` registry key */
  quantizations?: string[];
  /** Approximate download size in MB */
  downloadSizeMB?: number;
}

/** Union of every engine-specific model configuration. */
export type ModelConfig = MLCConfig | TransformersConfig | DemucsConfig | FlareConfig;
62 changes: 56 additions & 6 deletions src/core/llm/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,23 @@ import { MLCEngineWrapper } from '../../engines/mlc-engine-wrapper';
import { TransformersEngineWrapper } from '../../engines/transformer-engine-wrapper';
import { DemucsEngine } from '../../engines/demucs-engine';
import type { SeparateOptions, SeparationResult } from '../../engines/demucs-engine';
import type { ModelConfig, MLCConfig, TransformersConfig, DemucsConfig } from '../../config/models/types';
import { FlareEngineWrapper, FlareAdapterOptions } from '../../engines/flare-engine-wrapper';
import type { ModelConfig, MLCConfig, TransformersConfig, DemucsConfig, FlareConfig } from '../../config/models/types';
import mlcModels from '../../config/models/mlc-models.json';
import transformersModels from '../../config/models/transformers-models.json';
import demucsModels from '../../config/models/demucs-models.json';
import flareModels from '../../config/models/flare-models.json';

// Combine model configurations from every engine registry into one lookup
// table keyed by model identifier. Note: object-spread order matters — on a
// duplicate key, a later registry entry silently overrides an earlier one
// (flare > demucs > transformers > mlc here).
const MODEL_CONFIG: Record<string, ModelConfig> = {
...(mlcModels as Record<string, MLCConfig>),
...(transformersModels as Record<string, TransformersConfig>),
...(demucsModels as Record<string, DemucsConfig>),
...(flareModels as Record<string, FlareConfig>),
};

export class BrowserAI {
private engine: MLCEngineWrapper | TransformersEngineWrapper | DemucsEngine | null;
private engine: MLCEngineWrapper | TransformersEngineWrapper | DemucsEngine | FlareEngineWrapper | null;
public currentModel: ModelConfig | null;
private mediaRecorder: MediaRecorder | null = null;
private mediaStream: MediaStream | null = null;
Expand Down Expand Up @@ -46,13 +49,12 @@ export class BrowserAI {
throw new Error(`Model identifier "${this.modelIdentifier}" not recognized.`);
}

// Check if model exists in both MLC and Transformers configs
// Check if model exists in MLC config (preferred for text-generation)
const mlcVersion = (mlcModels as Record<string, MLCConfig>)[this.modelIdentifier];
// const transformersVersion = (transformersModels as Record<string, TransformersConfig>)[modelIdentifier];

// For text-generation models, prefer MLC if available
// For text-generation models, prefer MLC if available (unless explicitly requesting flare)
let engineToUse = modelConfig.engine;
if (modelConfig.modelType === 'text-generation' && mlcVersion) {
if (modelConfig.modelType === 'text-generation' && mlcVersion && engineToUse !== 'flare') {
engineToUse = 'mlc';
}

Expand All @@ -69,6 +71,12 @@ export class BrowserAI {
this.engine = new DemucsEngine();
await this.engine.loadModel(modelConfig, options);
break;
case 'flare': {
const flareEngine = new FlareEngineWrapper();
await flareEngine.loadModel(modelConfig as FlareConfig, options);
this.engine = flareEngine;
break;
}
default:
throw new Error(`Engine "${engineToUse}" not supported.`);
}
Expand Down Expand Up @@ -100,6 +108,11 @@ export class BrowserAI {
if (this.engine instanceof DemucsEngine) {
throw new Error('Current engine does not support embeddings.');
}
if (this.engine instanceof FlareEngineWrapper) {
throw new Error(
'Flare engine does not support embeddings. Use a Transformers.js feature-extraction model instead.',
);
}
return await this.engine.embed(input, options);
}

Expand Down Expand Up @@ -270,6 +283,43 @@ export class BrowserAI {
throw new Error('Current engine does not support multimodal generation');
}

/**
* Load a LoRA adapter into the current Flare engine.
*
* Only supported when using the Flare engine. The adapter must be in
* SafeTensors format and compatible with the loaded base model.
*
* @example
* ```ts
* await ai.loadModel('llama-3.2-1b-flare');
* await ai.loadAdapter({ url: 'https://hf.co/.../adapter.safetensors' });
* ```
*/
async loadAdapter(options: FlareAdapterOptions): Promise<void> {
if (!(this.engine instanceof FlareEngineWrapper)) {
throw new Error('loadAdapter is only supported with the Flare engine.');
}
return this.engine.loadAdapter(options);
}

/**
* Check whether the current Flare model is cached in OPFS for instant reload.
*/
async isFlareModelCached(): Promise<boolean> {
if (!(this.engine instanceof FlareEngineWrapper)) return false;
return this.engine.isCached();
}

/**
* Delete the OPFS cache entry for the current Flare model.
*/
async clearFlareModelCache(): Promise<void> {
if (!(this.engine instanceof FlareEngineWrapper)) {
throw new Error('clearFlareModelCache is only supported with the Flare engine.');
}
return this.engine.clearCache();
}

async clearModelCache(): Promise<void> {
try {
const cacheNames = ['webllm/config', 'webllm/wasm', 'webllm/model'];
Expand Down
Loading
Loading