diff --git a/src/main/constants/gemini.ts b/src/main/constants/gemini.ts index e44551f..97f33af 100644 --- a/src/main/constants/gemini.ts +++ b/src/main/constants/gemini.ts @@ -5,6 +5,22 @@ export const GEMINI_MODEL_2_5_FLASH = 'gemini-2.5-flash' export const GEMINI_MODEL_2_5_FLASH_LITE = 'gemini-2.5-flash-lite' export const GEMINI_MODEL_3_FLASH_PREVIEW = 'gemini-3-flash-preview' export const GEMINI_MODEL_3_1_FLASH_IMAGE_PREVIEW = 'gemini-3.1-flash-image-preview' +export const GEMINI_MODEL_3_PRO_IMAGE_PREVIEW = 'gemini-3-pro-image-preview' + +export const MODEL_METADATA: Record = { + [GEMINI_MODEL_2_5_PRO]: { label: 'Gemini 2.5 Pro' }, + [GEMINI_MODEL_2_5_FLASH]: { label: 'Gemini 2.5 Flash' }, + [GEMINI_MODEL_2_5_FLASH_LITE]: { label: 'Gemini 2.5 Flash Lite' }, + [GEMINI_MODEL_3_FLASH_PREVIEW]: { label: 'Gemini 3 Flash Preview' }, + [GEMINI_MODEL_3_1_FLASH_IMAGE_PREVIEW]: { + label: 'Nano Banana 2', + description: 'Pro-level visual intelligence with Flash-speed efficiency and reality-grounded generation capabilities.' + }, + [GEMINI_MODEL_3_PRO_IMAGE_PREVIEW]: { + label: 'Nano Banana Pro', + description: 'State-of-the-art image generation and editing model.' + } +} export const DEFAULT_MODEL_SETTINGS: ModelSettings = { pricing: { @@ -24,7 +40,7 @@ export const DEFAULT_MODEL_SETTINGS: ModelSettings = { input: { text: 0.30, audio: 1.00, - standard: 0.30 // Fallback for simple calc + standard: 0.30 }, output: { standard: 2.50 @@ -58,6 +74,15 @@ export const DEFAULT_MODEL_SETTINGS: ModelSettings = { standard: 3.00, image: 0.0672 } + }, + [GEMINI_MODEL_3_PRO_IMAGE_PREVIEW]: { + input: { + standard: 2.00 + }, + output: { + standard: 12.00, + image: 0.134 + } } }, selection: { diff --git a/src/main/gemini/adapter.ts b/src/main/gemini/adapter.ts index 9230446..efb1af1 100644 --- a/src/main/gemini/adapter.ts +++ b/src/main/gemini/adapter.ts @@ -28,12 +28,15 @@ export class GeminiAdapter { } private extractUsage(response: any): Usage { + if (!response) { + return { promptTokens: 0, candidatesTokens: 0, thinkingTokens: 0, totalTokens: 0 }; + } const usage = response.usageMetadata || { promptTokenCount: 0, candidatesTokenCount: 0, totalTokenCount: 0, thoughtsTokenCount: 0 }; return { - promptTokens: usage.promptTokenCount!, - candidatesTokens: usage.candidatesTokenCount!, - thinkingTokens: usage.thoughtsTokenCount, - totalTokens: usage.totalTokenCount! + promptTokens: usage.promptTokenCount || 0, + candidatesTokens: usage.candidatesTokenCount || 0, + thinkingTokens: usage.thoughtsTokenCount || 0, + totalTokens: usage.totalTokenCount || 0 }; } @@ -126,26 +129,49 @@ export class GeminiAdapter { modelName: string, userPrompt: string, systemInstruction?: string, - signal?: AbortSignal + signal?: AbortSignal, + imagePaths?: string[] ): Promise<{ text: string, record: UsageRecord }> { - const request: GenerateContentParameters = { + const parts: any[] = [] + + // Add image parts if provided + const validImagePaths: string[] = [] + if (imagePaths && imagePaths.length > 0) { + for (const imgPath of imagePaths) { + if (fs.existsSync(imgPath)) { + const data = fs.readFileSync(imgPath).toString('base64') + parts.push({ + inlineData: { + data, + mimeType: 'image/jpeg' + } + }) + validImagePaths.push(imgPath) + } + } + } + + // Add text part LAST + parts.push({ text: userPrompt }) + + const request: any = { model: modelName, - contents: [{ role: 'user', parts: [{ text: userPrompt }] }] + contents: [{ role: 'user', parts }] }; if (systemInstruction) { - request.config = { - systemInstruction: systemInstruction + request.systemInstruction = { + parts: [{ text: systemInstruction }] }; } try { const response = await this.withRetry( - () => (this.client.models as any).generateContent(request, { signal }) as Promise, + () => (this.client as any).models.generateContent(request, { signal }), signal ); const usage = this.extractUsage(response); - const cost = GeminiAdapter.calculateCost(modelName, usage); + const cost = GeminiAdapter.calculateCost(modelName, usage, 0, validImagePaths.length); return { text: this.extractResultText(response), diff --git a/src/main/index.ts b/src/main/index.ts index 5c573bd..a72c2ef 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -22,6 +22,7 @@ import { checkFFmpegAvailability, getVideoMetadata } from './ffmpeg' import { checkScenedetectAvailability } from './scenedetect' import { checkYtDlpAvailability, downloadVideo, getVideoFormats } from './ytdlp' import { THREAD_DIRS } from './constants/paths' +import { GEMINI_MODEL_2_5_FLASH, MODEL_METADATA } from './constants/gemini' import { electronApp, optimizer, is } from '@electron-toolkit/utils' const activePipelines = new Map() @@ -310,6 +311,10 @@ app.whenReady().then(() => { return settingsManager.resetModelSettings() }) + ipcMain.handle('get-model-metadata', () => { + return MODEL_METADATA + }) + // Thread Management ipcMain.handle('create-thread', async (_event, { videoPath, videoName, imagePaths }) => { const newThread = await threadManager.createThread(videoPath, videoName, imagePaths) @@ -375,9 +380,9 @@ app.whenReady().then(() => { return await threadManager.toggleThreadReferenceFrame(threadId, filePath) }) - ipcMain.handle('show-confirmation', async (_event, { title, message, detail, type = 'question', buttons = ['Cancel', 'Yes'], defaultId = 1, cancelId = 0 }) => { + ipcMain.handle('show-confirmation', async (_event, { title, message, detail, type = 'question', buttons = ['Cancel', 'Yes'], defaultId = 1, cancelId = 0, checkboxLabel }) => { const focusedWindow = BrowserWindow.getFocusedWindow() - if (!focusedWindow) return cancelId + if (!focusedWindow) return { response: cancelId, checkboxChecked: false } const result = await dialog.showMessageBox(focusedWindow, { type: type as any, @@ -386,10 +391,14 @@ app.whenReady().then(() => { cancelId, title, message, - detail + detail, + checkboxLabel }) - return result.response + return { + response: result.response, + checkboxChecked: result.checkboxChecked + } }) ipcMain.handle('save-video', async (_event, sourcePath: string) => { @@ -457,6 +466,47 @@ app.whenReady().then(() => { return result.path }) + ipcMain.handle('improvise-message', async (_event, { threadId, messageId }) => { + console.log(`[DEBUG IPC] improvise-message called: threadId=${threadId}, messageId=${messageId}`) + + const thread = threadManager.getThread(threadId) + if (!thread) throw new Error('Thread not found') + + const message = thread.messages.find(m => m.id === messageId) + if (!message) throw new Error('Message not found') + + // Get the history leading up to this message (including the message itself) + const { text: context } = threadManager.getBranchContext(threadId, messageId) + + // Collect attached images from THIS message only for visual context in improvisation + const attachedImages = message.attachedImages || [] + + const modelSettings = settingsManager.getModelSettings() + const modelName = modelSettings.selection['intent'] || GEMINI_MODEL_2_5_FLASH + const adapter = GeminiAdapter.create() + + const systemInstruction = `You are a high-level Prompt Engineer. Your task is to IMPROVISE and REWRITE a user's latest prompt to be much more descriptive, detailed, and effective for high-fidelity AI generation. +- The conversation history is provided ONLY for context. You MUST NOT improve the history, only the latest prompt. +- Use the attached images for visual context to make the prompt more vivid and accurate. +- Maintain the original intent exactly. +- USE natural, descriptive, and professional language. +- DO NOT return JSON, technical metadata, coordinates, bounding boxes, or "box_2d" tags. +- DO NOT return object detection results. +- DO NOT include any explanations, labels like "Improvised:", or surrounding quotes. +- Return ONLY the clean, improved prompt text itself.` + + const userPrompt = `[CONTEXT/HISTORY]:\n${context}\n\n[TASK]: Based on the context above and the attached images, rewrite the user's latest message to be a high-performance, descriptive generation prompt. Return only the improved text.` + + const result = await adapter.generateText(modelName, userPrompt, systemInstruction, undefined, attachedImages) + + // Track usage for the message that initiated it + if (result.record) { + await threadManager.updateMessageUsage(threadId, messageId, result.record) + } + + return result.text + }) + app.on('activate', function () { if (BrowserWindow.getAllWindows().length === 0) createWindow() }) diff --git a/src/main/threads/index.ts b/src/main/threads/index.ts index 9c8fcd3..417b83b 100644 --- a/src/main/threads/index.ts +++ b/src/main/threads/index.ts @@ -128,6 +128,7 @@ class ThreadManager { messages: [], versionCounter: 0, videoMetadata, + usageHistory: [], createdAt: Date.now(), updatedAt: Date.now() } @@ -277,6 +278,11 @@ class ThreadManager { // Try to repair paths if they seem broken/moved thread = this.repairThreadPaths(thread) + // Migrate usage if needed + if (this.migrateThreadUsage(thread)) { + this.saveThread(thread) + } + // Mark as missing if artifacts directory is not found, but DO NOT DELETE metadata. // This allows the user to repair the path or recover files manually. thread.missing = !!(thread.tempDir && !fs.existsSync(thread.tempDir)) @@ -300,7 +306,14 @@ class ThreadManager { try { const content = fs.readFileSync(filePath, 'utf-8') - return JSON.parse(content) + const thread = JSON.parse(content) as Thread + + // Migrate usage if needed + if (this.migrateThreadUsage(thread)) { + this.saveThread(thread) + } + + return thread } catch (error) { console.error(`Failed to read thread ${id}:`, error) return null @@ -460,8 +473,18 @@ class ThreadManager { // Update usage and cost for a message async updateMessageUsage(threadId: string, messageId: string, record: UsageRecord): Promise { + const thread = this.getThread(threadId) + if (!thread) return false + + const historyRecord: UsageRecord = { + ...record, + timestamp: Date.now(), + messageId + } + const result = await this.updateThread(threadId, { - messages: (this.getThread(threadId)?.messages || []).map(m => { + usageHistory: [...(thread.usageHistory || []), historyRecord], + messages: (thread.messages || []).map(m => { if (m.id !== messageId) return m const newUsage: Usage = { @@ -677,6 +700,32 @@ class ThreadManager { return !!result } + + // Migrate usage from messages to usageHistory if history is empty + private migrateThreadUsage(thread: Thread): boolean { + // If usageHistory already has data, no migration needed + if (thread.usageHistory && thread.usageHistory.length > 0) return false + + const history: UsageRecord[] = [] + for (const msg of thread.messages) { + if (msg.usage && msg.cost && msg.cost > 0) { + history.push({ + usage: msg.usage, + cost: msg.cost, + timestamp: msg.createdAt, + messageId: msg.id + }) + } + } + + // Only migrate if we found usage or if usageHistory is missing (to initialize it) + if (history.length > 0 || thread.usageHistory === undefined) { + thread.usageHistory = history + return true + } + + return false + } } export const threadManager = new ThreadManager() diff --git a/src/main/timeline/index.ts b/src/main/timeline/index.ts index 4684c26..4025d9c 100644 --- a/src/main/timeline/index.ts +++ b/src/main/timeline/index.ts @@ -136,13 +136,17 @@ Task: Provide a list of indices representing the new timeline after applying the `; try { - const { text: responseText, record } = await geminiAdapter.generateText(modelName, prompt, systemInstruction, signal); - console.log(`Gemini response (Edit Mode):`, responseText); + const { data: indices, record } = await geminiAdapter.generateStructuredText( + modelName, + prompt, + INDICES_SCHEMA, + systemInstruction, + signal + ); + console.log(`Gemini response (Edit Mode):`, indices); // Record usage for the edit call onRecordUsage?.(record); - - const indices = parseIndicesFromResponse(responseText); const newTimeline: EnrichedTimelineSegment[] = []; for (const idx of indices) { @@ -251,14 +255,18 @@ Task: Pick the next 3 segments to add to the timeline. `; try { - const { text: responseText, record } = await geminiAdapter.generateText(modelName, prompt, systemInstruction, signal); + const { data: indices, record } = await geminiAdapter.generateStructuredText( + modelName, + prompt, + INDICES_SCHEMA, + systemInstruction, + signal + ); // Record usage IMMEDIATELY after call finishes, before any abort checks onRecordUsage?.(record); - console.log(`Gemini response (Iteration ${iterationCount}):`, responseText); - - const indices = parseIndicesFromResponse(responseText); + console.log(`Gemini response (Iteration ${iterationCount}):`, indices); if (indices.length === 0) { console.warn("No indices returned from Gemini, stopping loop."); @@ -338,13 +346,10 @@ function formatEnrichedTranscript(items: EnrichedTimelineSegment[]): string { } -function parseIndicesFromResponse(text: string): number[] { - const jsonMatch = text.match(/\[([\d,\s]+)\]/); - if (jsonMatch) { - return jsonMatch[1] - .split(',') - .map(s => parseInt(s.trim())) - .filter(n => !isNaN(n)); - } - return []; -} \ No newline at end of file +/** + * JSON schema that forces Gemini to return a plain array of segment indices. + */ +const INDICES_SCHEMA = { + type: 'ARRAY', + items: { type: 'INTEGER' } +}; \ No newline at end of file diff --git a/src/preload/index.ts b/src/preload/index.ts index ba8ba01..f8c378f 100644 --- a/src/preload/index.ts +++ b/src/preload/index.ts @@ -67,7 +67,10 @@ const api = { getVideoMetadata: (filePath: string) => ipcRenderer.invoke('get-video-metadata', filePath), showOpenDialog: (options: any) => ipcRenderer.invoke('show-open-dialog', options), upscaleImage: (data: { threadId: string, messageId: string, imagePath: string, upscaleFactor: string }) => - ipcRenderer.invoke('upscale-image', data) + ipcRenderer.invoke('upscale-image', data), + improviseMessage: (threadId: string, messageId: string) => + ipcRenderer.invoke('improvise-message', { threadId, messageId }), + getModelMetadata: () => ipcRenderer.invoke('get-model-metadata') } if (process.contextIsolated) { diff --git a/src/renderer/src/assets/main.css b/src/renderer/src/assets/main.css index f6123f4..798b0da 100644 --- a/src/renderer/src/assets/main.css +++ b/src/renderer/src/assets/main.css @@ -34,7 +34,8 @@ } .glass-card-hover { - @apply hover:shadow-premium-hover hover:border-primary/30 transition-all duration-500; + @apply hover:shadow-premium-hover hover:border-primary/30; + transition: box-shadow 0.5s cubic-bezier(0.4, 0, 0.2, 1), border-color 0.5s cubic-bezier(0.4, 0, 0.2, 1); } .input-focus-ring { diff --git a/src/renderer/src/components/chat/BaseMessageInput.vue b/src/renderer/src/components/chat/BaseMessageInput.vue index 604f261..bcacdc1 100644 --- a/src/renderer/src/components/chat/BaseMessageInput.vue +++ b/src/renderer/src/components/chat/BaseMessageInput.vue @@ -1,21 +1,22 @@