diff --git a/README.md b/README.md index e4cb94d..3ff53a7 100644 --- a/README.md +++ b/README.md @@ -65,17 +65,32 @@ If a browser tab already translates it, you don't need WinLens. For everything e - Upscales the screenshot before OCR and picks the best recognizer per text block, which helps accuracy on small text. - Dark control panel. You can change the target language straight from the overlay. - Right-click a block to copy the original text or the translation. +- Optional OpenAI-compatible API engine (OpenAI, DeepSeek, Ollama, …) for higher-quality, context-aware translation. Off by default; the free built-in engine stays the fallback. - Optional "launch at startup". Otherwise it stays out of the way in the tray. ## Control panel A small tray app with a dark control panel. Pick the target language, set the hotkey, -choose the OCR source language, and toggle launch-at-startup. +choose the OCR source language, pick the translation engine, and toggle launch-at-startup.
WinLens control panel
+### Translation engine + +WinLens translates with the free built-in engine (Google, MyMemory fallback) out of the box — +no setup, no key. If you want higher-quality, context-aware translation, pick **OpenAI-compatible +API** under *Translation engine* and fill in: + +- **API base URL** — e.g. `https://api.openai.com/v1`, `https://api.deepseek.com/v1`, or a local `http://localhost:11434/v1` (Ollama). +- **API key** — your bearer token. +- **Model** — e.g. `gpt-4o-mini`, `deepseek-chat`. + +The key is stored locally in `%APPDATA%\WinLens\settings.json` and sent only to the endpoint you +configure. If a request fails (bad key, no network, rate limit), WinLens falls back to the +built-in engine so text is never left untranslated. + ## Installation ### Download @@ -144,7 +159,7 @@ Hotkey > capture screen > upscale > OCR (per script) > translate > overlay in pl 2. Upscale the image about 2x so small UI text is recognized more reliably. 3. Run every installed OCR recognizer and keep from each only the blocks whose script matches it (Latin from the Latin engine, CJK from the CJK engine), then drop overlapping duplicates. -4. Translate each line (Google endpoint, with a MyMemory fallback), cached per session. +4. Translate each line, cached per session. By default the free built-in engine is used (Google endpoint, with a MyMemory fallback). If an OpenAI-compatible API is configured in the control panel, it is tried first and the built-in engine becomes the fallback. 5. Draw an opaque, color- and font-matched box over each original line. ## Roadmap diff --git a/src/Models/UserSettings.cs b/src/Models/UserSettings.cs index 26d4d64..851b967 100644 --- a/src/Models/UserSettings.cs +++ b/src/Models/UserSettings.cs @@ -3,6 +3,16 @@ namespace WinLens.Models; +/// Which backend translates the captured text. +public enum TranslationProvider +{ + /// Built-in free engines: Google gtx with a MyMemory fallback (the default). 
+ BuiltIn, + + /// An OpenAI-compatible chat endpoint (OpenAI, DeepSeek, Ollama, etc.). + OpenAiCompatible, +} + public sealed class UserSettings { public string TargetLanguage { get; set; } = "en"; @@ -15,4 +25,19 @@ public sealed class UserSettings public HotkeyModifiers HotkeyModifiers { get; set; } = HotkeyModifiers.Control | HotkeyModifiers.Alt; public Key HotkeyKey { get; set; } = Key.T; + + /// + /// Translation backend. Defaults to <see cref="TranslationProvider.BuiltIn"/> so + /// existing installs keep the free Google/MyMemory behaviour untouched. + /// + public TranslationProvider TranslationProvider { get; set; } = TranslationProvider.BuiltIn; + + /// Base URL of the OpenAI-compatible API, e.g. https://api.openai.com/v1. + public string LlmBaseUrl { get; set; } = ""; + + /// Bearer token for the OpenAI-compatible API. + public string LlmApiKey { get; set; } = ""; + + /// Chat model id, e.g. gpt-4o-mini. + public string LlmModel { get; set; } = ""; } diff --git a/src/Services/TranslationService.cs b/src/Services/TranslationService.cs index 66a386a..f2a702d 100644 --- a/src/Services/TranslationService.cs +++ b/src/Services/TranslationService.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Concurrent; +using System.Collections.Generic; using System.IO; using System.Net.Http; using System.Text; @@ -7,12 +8,15 @@ using System.Threading; using System.Threading.Tasks; using System.Web; +using WinLens.Models; namespace WinLens.Services; /// -/// Translates short strings. Tries Google gtx first (auto source detection), -/// falls back to MyMemory. Logs each failure to %TEMP%\winlens.log. +/// Translates short strings. By default it uses the free built-in engines +/// (Google gtx with a MyMemory fallback). When the user opts into an +/// OpenAI-compatible API in settings, that is tried first and the built-in +/// engines act as the fallback. Logs each failure to %TEMP%\winlens.log. 
/// public sealed class TranslationService : IDisposable { @@ -20,17 +24,33 @@ public sealed class TranslationService : IDisposable private const string MyMemoryEndpoint = "https://api.mymemory.translated.net/get"; private readonly HttpClient _http; + private readonly HttpClient _llmHttp; private readonly ConcurrentDictionary<(string text, string tgt), string> _cache = new(); private readonly string _logPath; + private UserSettings? _config; public TranslationService() { _http = new HttpClient { Timeout = TimeSpan.FromSeconds(10) }; _http.DefaultRequestHeaders.UserAgent.ParseAdd( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) WinLens/1.0"); + // LLM calls reason over the text and can take longer than the web engines. + _llmHttp = new HttpClient { Timeout = TimeSpan.FromSeconds(30) }; _logPath = Path.Combine(Path.GetTempPath(), "winlens.log"); } + /// + /// Supplies the live settings object (shared with the UI, so edits there + /// apply immediately). Until called, only the built-in engines are used. + /// + public void Configure(UserSettings settings) => _config = settings; + + private bool UseLlm => + _config is { TranslationProvider: TranslationProvider.OpenAiCompatible } c && + !string.IsNullOrWhiteSpace(c.LlmApiKey) && + !string.IsNullOrWhiteSpace(c.LlmBaseUrl) && + !string.IsNullOrWhiteSpace(c.LlmModel); + public async Task TranslateAsync( string text, string targetLang, @@ -38,7 +58,7 @@ public async Task TranslateAsync( CancellationToken ct = default) { // sourceLang from the OCR engine is the engine's profile language, not - // the actual content language. Ignore it — let Google auto-detect. + // the actual content language. Ignore it — let the engine auto-detect. 
_ = sourceLang; if (string.IsNullOrWhiteSpace(text)) @@ -49,6 +69,18 @@ public async Task TranslateAsync( if (_cache.TryGetValue(key, out var cached)) return cached; + // Opt-in LLM engine first; on any failure fall through to the built-in + // engines so a misconfigured key never leaves text untranslated. + if (UseLlm) + { + var llm = await TryLlmAsync(text, tgt, ct); + if (llm != null) + { + _cache[key] = llm; + return llm; + } + } + var google = await TryGoogleAsync(text, tgt, ct); if (google != null) { @@ -145,6 +177,64 @@ public async Task TranslateAsync( } } + private async Task TryLlmAsync(string text, string tgt, CancellationToken ct) + { + var cfg = _config!; + var url = cfg.LlmBaseUrl.TrimEnd('/') + "/chat/completions"; + var payload = new Dictionary + { + ["model"] = cfg.LlmModel, + ["temperature"] = 0.1, + ["messages"] = new object[] + { + new + { + role = "system", + content = + "You are a translation engine. The user sends a single short string captured by OCR; " + + "it may contain small OCR errors (l/I confusion, missing spaces) — infer the intended text. " + + $"Translate it into the language whose BCP-47 code is \"{tgt}\". " + + "Reply with ONLY the translation: no quotes, no explanations, no markdown. " + + "Keep numbers, file names and proper nouns unchanged.", + }, + new { role = "user", content = text }, + }, + }; + // DeepSeek's reasoning models default to "thinking" mode, which burns + // tokens and latency a short translation doesn't need. The flag is + // DeepSeek-specific, so only send it to that endpoint. 
+ if (url.Contains("deepseek", StringComparison.OrdinalIgnoreCase)) + payload["thinking"] = new { type = "disabled" }; + + try + { + using var req = new HttpRequestMessage(HttpMethod.Post, url); + req.Headers.TryAddWithoutValidation("Authorization", "Bearer " + cfg.LlmApiKey.Trim()); + req.Content = new StringContent(JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json"); + + using var resp = await _llmHttp.SendAsync(req, ct); + var body = await resp.Content.ReadAsStringAsync(ct); + if (!resp.IsSuccessStatusCode) + { + Log($"llm http {(int)resp.StatusCode} for tgt={tgt}"); + return null; + } + + using var doc = JsonDocument.Parse(body); + var content = doc.RootElement + .GetProperty("choices")[0] + .GetProperty("message") + .GetProperty("content") + .GetString(); + return string.IsNullOrWhiteSpace(content) ? null : content.Trim(); + } + catch (Exception ex) + { + Log($"llm exception: {ex.GetType().Name}: {ex.Message}"); + return null; + } + } + private void Log(string line) { try @@ -155,5 +245,9 @@ private void Log(string line) catch { /* logging must never throw */ } } - public void Dispose() => _http.Dispose(); + public void Dispose() + { + _http.Dispose(); + _llmHttp.Dispose(); + } } diff --git a/src/Theme/Theme.xaml b/src/Theme/Theme.xaml index d165d87..404690d 100644 --- a/src/Theme/Theme.xaml +++ b/src/Theme/Theme.xaml @@ -268,6 +268,36 @@ + + +